typedef std::tuple<Value*&, Value*, const Instruction::CastOps, const ConversionType,
uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4],
const uint32_t(&)[4]> Shuffle8bpcArgs;
+#if USE_SIMD16_SHADERS
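+ // useVertexID2 (here and on the simd16 overloads below) selects the second
+ // vertex-ID register when filling StoreVertexId components for the upper simd8 half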
+ void Shuffle8bpcGatherd(Shuffle8bpcArgs &args, bool useVertexID2);
+#else
void Shuffle8bpcGatherd(Shuffle8bpcArgs &args);
+#endif
typedef std::tuple<Value*(&)[2], Value*, const Instruction::CastOps, const ConversionType,
uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4]> Shuffle16bpcArgs;
+#if USE_SIMD16_SHADERS
+ void Shuffle16bpcGather(Shuffle16bpcArgs &args, bool useVertexID2);
+#else
void Shuffle16bpcGather(Shuffle16bpcArgs &args);
+#endif
void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
+#if USE_SIMD16_SHADERS
+ Value* GenerateCompCtrlVector(const ComponentControl ctrl, bool useVertexID2);
+#else
Value* GenerateCompCtrlVector(const ComponentControl ctrl);
+#endif
void JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
+#if USE_SIMD16_SHADERS
+ void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut, bool useVertexID2);
+#else
void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
+#endif
bool IsOddFormat(SWR_FORMAT format);
bool IsUniformFormat(SWR_FORMAT format);
std::vector<Value*> vtxInputIndices(2, C(0));
// GEP
pVtxOut = GEP(pVtxOut, C(0));
+#if USE_SIMD16_SHADERS
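+ // (disabled) would treat the output as simd16-wide rows; the enabled path
+ // keeps simd8-wide rows and interleaves the two halves when storing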
+#if 0
+ pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth * 2), 0));
+#else
pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0));
+#endif
+#else
+ pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0));
+#endif
// SWR_FETCH_CONTEXT::pStreams
Value* streams = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pStreams});
Value* vIndices;
+#if USE_SIMD16_SHADERS
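+ // raw and expanded indices for the upper simd8 half of the simd16 fetch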
+ Value* indices2;
+ Value* vIndices2;
+#endif
switch(fetchState.indexType)
{
case R8_UINT:
indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0));
- if(fetchState.bDisableIndexOOBCheck){
+#if USE_SIMD16_SHADERS
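+ // the upper half's indices start 8 elements past the lower half's
+ // (the 16bit and 32bit index cases below follow the same pattern)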
+ indices2 = GEP(indices, C(8));
+#endif
+ if(fetchState.bDisableIndexOOBCheck)
+ {
vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
vIndices = Z_EXT(vIndices, mSimdInt32Ty);
+#if USE_SIMD16_SHADERS
+ vIndices2 = LOAD(BITCAST(indices2, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), { (uint32_t)0 });
+ vIndices2 = Z_EXT(vIndices2, mSimdInt32Ty);
+#endif
}
- else{
+ else
+ {
pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0));
vIndices = GetSimdValid8bitIndices(indices, pLastIndex);
+#if USE_SIMD16_SHADERS
+ pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0));
+ vIndices2 = GetSimdValid8bitIndices(indices2, pLastIndex);
+#endif
}
break;
case R16_UINT:
indices = BITCAST(indices, Type::getInt16PtrTy(JM()->mContext, 0));
- if(fetchState.bDisableIndexOOBCheck){
+#if USE_SIMD16_SHADERS
+ indices2 = GEP(indices, C(8));
+#endif
+ if(fetchState.bDisableIndexOOBCheck)
+ {
vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
vIndices = Z_EXT(vIndices, mSimdInt32Ty);
+#if USE_SIMD16_SHADERS
+ vIndices2 = LOAD(BITCAST(indices2, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), { (uint32_t)0 });
+ vIndices2 = Z_EXT(vIndices2, mSimdInt32Ty);
+#endif
}
- else{
+ else
+ {
pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0));
vIndices = GetSimdValid16bitIndices(indices, pLastIndex);
+#if USE_SIMD16_SHADERS
+ pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0));
+ vIndices2 = GetSimdValid16bitIndices(indices2, pLastIndex);
+#endif
}
break;
case R32_UINT:
+#if USE_SIMD16_SHADERS
+ indices2 = GEP(indices, C(8));
+#endif
(fetchState.bDisableIndexOOBCheck) ? vIndices = LOAD(BITCAST(indices, PointerType::get(mSimdInt32Ty,0)),{(uint32_t)0})
: vIndices = GetSimdValid32bitIndices(indices, pLastIndex);
+#if USE_SIMD16_SHADERS
+ (fetchState.bDisableIndexOOBCheck) ? vIndices2 = LOAD(BITCAST(indices2, PointerType::get(mSimdInt32Ty, 0)), { (uint32_t)0 })
+ : vIndices2 = GetSimdValid32bitIndices(indices2, pLastIndex);
+#endif
break; // incoming type is already 32bit int
- default: SWR_INVALID("Unsupported index type"); vIndices = nullptr; break;
+ default:
+ SWR_INVALID("Unsupported index type");
+ vIndices = nullptr;
+#if USE_SIMD16_SHADERS
+ vIndices2 = nullptr; // keep the upper-half indices defined on the error path as well
+#endif
+ break;
}
Value* vVertexId = vIndices;
+#if USE_SIMD16_SHADERS
+ Value* vVertexId2 = vIndices2;
+#endif
if (fetchState.bVertexIDOffsetEnable)
{
// Assuming one of baseVertex or startVertex is 0, so adding both should be functionally correct
Value* vStartVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_StartVertex }));
vVertexId = ADD(vIndices, vBaseVertex);
vVertexId = ADD(vVertexId, vStartVertex);
+#if USE_SIMD16_SHADERS
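+ // apply the same base/start vertex offsets to the upper half's IDs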
+ vVertexId2 = ADD(vIndices2, vBaseVertex);
+ vVertexId2 = ADD(vVertexId2, vStartVertex);
+#endif
}
// store out vertex IDs
STORE(vVertexId, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
+#if USE_SIMD16_SHADERS
+ STORE(vVertexId2, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 }));
+#endif
// store out cut mask if enabled
if (fetchState.bEnableCutIndex)
{
Value* vCutIndex = VIMMED1(fetchState.cutIndex);
Value* cutMask = VMASK(ICMP_EQ(vIndices, vCutIndex));
STORE(cutMask, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask }));
+#if USE_SIMD16_SHADERS
+ Value* cutMask2 = VMASK(ICMP_EQ(vIndices2, vCutIndex));
+ STORE(cutMask2, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask2 }));
+#endif
}
// Fetch attributes from memory and output to a simdvertex struct
// since VGATHER has a perf penalty on HSW vs BDW, allow client to choose which fetch method to use
+#if USE_SIMD16_SHADERS
+ if (fetchState.bDisableVGATHER)
+ {
+ JitLoadVertices(fetchState, streams, vIndices, pVtxOut);
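+ // the upper half starts one simd8 register into pVtxOut; lower and upper
+ // halves of each component interleave in the output buffer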
+ JitLoadVertices(fetchState, streams, vIndices2, GEP(pVtxOut, C(1)));
+ }
+ else
+ {
+ JitGatherVertices(fetchState, streams, vIndices, pVtxOut, false);
+ JitGatherVertices(fetchState, streams, vIndices2, GEP(pVtxOut, C(1)), true);
+ }
+#else
(fetchState.bDisableVGATHER) ? JitLoadVertices(fetchState, streams, vIndices, pVtxOut)
: JitGatherVertices(fetchState, streams, vIndices, pVtxOut);
+#endif
RET_VOID();
for(uint32_t c = 0; c < 4; ++c)
{
+#if USE_SIMD16_SHADERS
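+ // simd16 layout: each component occupies two consecutive simd8 registers,
+ // so the per-element stride doubles (4 -> 8) and components step by 2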
+ Value* dest = GEP(pVtxOut, C(nelt * 8 + c * 2), "destGEP");
+#else
Value* dest = GEP(pVtxOut, C(nelt * 4 + c), "destGEP");
+#endif
STORE(elements[c], dest);
}
}
/// @param streams - value pointer to the current vertex stream
/// @param vIndices - vector value of indices to gather
/// @param pVtxOut - value pointer to output simdvertex struct
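+/// @param useVertexID2 - simd16 only: use VertexID2 for the upper simd8 half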
+#if USE_SIMD16_SHADERS
+void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
+ Value* streams, Value* vIndices, Value* pVtxOut, bool useVertexID2)
+#else
void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
- Value* streams, Value* vIndices, Value* pVtxOut)
+ Value* streams, Value* vIndices, Value* pVtxOut)
+#endif
{
uint32_t currentVertexElement = 0;
uint32_t outputElt = 0;
currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
// Shuffle gathered components into place in simdvertex struct
+#if USE_SIMD16_SHADERS
+ Shuffle16bpcGather(args, useVertexID2); // outputs to vVertexElements ref
+#else
Shuffle16bpcGather(args); // outputs to vVertexElements ref
+#endif
}
}
break;
}
else
{
+#if USE_SIMD16_SHADERS
+ vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
+#else
vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
+#endif
}
if (currentVertexElement > 3)
}
else
{
+#if USE_SIMD16_SHADERS
+ vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
+#else
vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
+#endif
}
if (currentVertexElement > 3)
currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle);
// Shuffle gathered components into place in simdvertex struct
+#if USE_SIMD16_SHADERS
+ Shuffle8bpcGatherd(args, useVertexID2); // outputs to vVertexElements ref
+#else
Shuffle8bpcGatherd(args); // outputs to vVertexElements ref
+#endif
}
}
break;
currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
// Shuffle gathered components into place in simdvertex struct
+#if USE_SIMD16_SHADERS
+ Shuffle16bpcGather(args, useVertexID2); // outputs to vVertexElements ref
+#else
Shuffle16bpcGather(args); // outputs to vVertexElements ref
+#endif
}
}
break;
}
else
{
+#if USE_SIMD16_SHADERS
+ vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
+#else
vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
+#endif
}
if (currentVertexElement > 3)
/// @param compCtrl - component control val
/// @param vVertexElements[4] - vertex components to output
/// @param swizzle[4] - component swizzle location
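+/// @param useVertexID2 - simd16 only: source StoreVertexId from VertexID2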
+#if USE_SIMD16_SHADERS
+void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args, bool useVertexID2)
+#else
void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args)
+#endif
{
// Unpack tuple args
Value*& vGatherResult = std::get<0>(args);
}
else
{
+#if USE_SIMD16_SHADERS
+ vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
+#else
vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
+#endif
}
if (currentVertexElement > 3)
}
else
{
+#if USE_SIMD16_SHADERS
+ vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
+#else
vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
+#endif
}
if (currentVertexElement > 3)
/// @param compMask - component packing mask
/// @param compCtrl - component control val
/// @param vVertexElements[4] - vertex components to output
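+/// @param useVertexID2 - simd16 only: source StoreVertexId from VertexID2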
+#if USE_SIMD16_SHADERS
+void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args, bool useVertexID2)
+#else
void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
+#endif
{
// Unpack tuple args
Value* (&vGatherResult)[2] = std::get<0>(args);
}
else
{
+#if USE_SIMD16_SHADERS
+ vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
+#else
vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
+#endif
}
if (currentVertexElement > 3)
}
else
{
+#if USE_SIMD16_SHADERS
+ vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
+#else
vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
+#endif
}
if (currentVertexElement > 3)
#endif
// outputElt * 4 = offsetting by the size of a simdvertex
// + c offsets to a 32bit x vWidth row within the current vertex
+#if USE_SIMD16_SHADERS
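+ // simd16: each 32bit x vWidth row spans two simd8 registers, so both strides double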
+ Value* dest = GEP(pVtxOut, C(outputElt * 8 + c * 2), "destGEP");
+#else
Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP");
+#endif
STORE(vVertexElements[c], dest);
}
}
/// @brief Generates a constant vector of values based on the
/// ComponentControl value
/// @param ctrl - ComponentControl value
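+/// @param useVertexID2 - simd16 only: read VertexID2 instead of VertexID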
+#if USE_SIMD16_SHADERS
+Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl, bool useVertexID2)
+#else
Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
+#endif
{
switch(ctrl)
{
case Store1Int: return VIMMED1(1);
case StoreVertexId:
{
+#if USE_SIMD16_SHADERS
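+ // the upper simd8 half sources its vertex IDs from VertexID2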
+ Value* pId;
+ if (useVertexID2)
+ {
+ pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 })), mSimdFP32Ty);
+ }
+ else
+ {
+ pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
+ }
+#else
Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
+#endif
return VBROADCAST(pId);
}
case StoreInstanceId: