//////////////////////////////////////////////////////////////////////////
/// Interface to Jitting a fetch shader
//////////////////////////////////////////////////////////////////////////
-struct FetchJit : public Builder
+struct FetchJit :
+ public Builder
{
- FetchJit(JitManager* pJitMgr) : Builder(pJitMgr){};
+ FetchJit(JitManager* pJitMgr) :
+ Builder(pJitMgr)
+ {}
Function* Create(const FETCH_COMPILE_STATE& fetchState);
Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
{
- std::stringstream fnName("FetchShader_", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
+ std::stringstream fnName("FCH_", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
fnName << ComputeCRC(0, &fetchState, sizeof(fetchState));
Function* fetch = Function::Create(JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
auto argitr = fetch->arg_begin();
// Fetch shader arguments
+ Value* privateContext = &*argitr; ++argitr;
+ privateContext->setName("privateContext");
+ SetPrivateContext(privateContext);
+
mpFetchInfo = &*argitr; ++argitr;
mpFetchInfo->setName("fetchInfo");
Value* pVtxOut = &*argitr;
: vIndices2 = GetSimdValid32bitIndices(indices2, pLastIndex);
#endif
break; // incoming type is already 32bit int
- default: SWR_INVALID("Unsupported index type"); vIndices = nullptr; break;
+ default:
+ SWR_INVALID("Unsupported index type");
+ vIndices = nullptr;
+#if USE_SIMD16_SHADERS
+ vIndices2 = nullptr;
+#endif
+ break;
}
if(fetchState.bForceSequentialAccessEnable)
JitManager::DumpToFile(fetch, "opt");
+
return fetch;
}
}
else if (ied.InstanceStrideEnable)
{
+ // silence unused variable warnings
+ startOffset = C(0);
+ vCurIndices = vIndices;
+
SWR_ASSERT((0), "TODO: Fill out more once driver sends this down.");
}
else
}
// load SWR_VERTEX_BUFFER_STATE::pData
- Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
+ Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_xpData});
// load SWR_VERTEX_BUFFER_STATE::pitch
Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
// only works if pixel size is <= 32bits
SWR_ASSERT(info.bpp <= 32);
- Value *pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask);
+ Value *pGather;
+ if (info.bpp == 32)
+ {
+ pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask);
+ }
+ else
+ {
+ // Can't use a 32-bit gather for items narrower than 32 bits; it could cause page faults.
+ Value *pMem = ALLOCA(mSimdInt32Ty);
+ STORE(VIMMED1(0u), pMem);
+
+ pBase = BITCAST(pBase, PointerType::get(mInt8Ty, 0));
+ Value* pDstMem = BITCAST(pMem, mInt32PtrTy);
+
+ for (uint32_t lane = 0; lane < mVWidth; ++lane)
+ {
+ // Get index
+ Value* index = VEXTRACT(pOffsets, C(lane));
+ Value* mask = VEXTRACT(pMask, C(lane));
+ switch (info.bpp)
+ {
+ case 8:
+ {
+ Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt8Ty, 0));
+ Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt8Ty, 0));
+ STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
+ break;
+ }
+
+ case 16:
+ {
+ Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0));
+ Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt16Ty, 0));
+ STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
+ break;
+ }
+ break;
+
+ case 24:
+ {
+ // First 16-bits of data
+ Value* pDst = BITCAST(GEP(pDstMem, C(lane)), PointerType::get(mInt16Ty, 0));
+ Value* pSrc = BITCAST(GEP(pBase, index), PointerType::get(mInt16Ty, 0));
+ STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
+
+ // Last 8-bits of data
+ pDst = BITCAST(GEP(pDst, C(1)), PointerType::get(mInt8Ty, 0));
+ pSrc = BITCAST(GEP(pSrc, C(1)), PointerType::get(mInt8Ty, 0));
+ STORE(LOAD(SELECT(mask, pSrc, pDst)), pDst);
+ break;
+ }
+
+ default:
+ SWR_INVALID("Shouldn't have BPP = %d now", info.bpp);
+ break;
+ }
+ }
+
+ pGather = LOAD(pMem);
+ }
for (uint32_t comp = 0; comp < 4; ++comp)
{
SWR_ASSERT((info.bpp != 0), "Unsupported format in JitGatherVertices.");
uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
- Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
+ Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_xpData});
// VGATHER* takes an *i8 src pointer
- Value *pStreamBase = BITCAST(stream, PointerType::get(mInt8Ty, 0));
+ Value *pStreamBase = INT_TO_PTR(stream, PointerType::get(mInt8Ty, 0));
Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
#if USE_SIMD16_GATHERS
// calculate byte offset to the start of the VB
Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));
pStreamBase = GEP(pStreamBase, baseOffset);
+ Value* pStreamBaseGFX = ADD(stream, baseOffset);
// if we have a start offset, subtract from max vertex. Used for OOB check
maxVertex = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
// But, we know that elements must be aligned for FETCH. :)
// Right shift the offset by a bit and then scale by 2 to remove the sign extension.
Value *shiftedOffsets16 = LSHR(vOffsets16, 1);
- pVtxSrc2[currentVertexElement++] = GATHERPS_16(gatherSrc16, pStreamBase, shiftedOffsets16, vGatherMask16, 2);
+ pVtxSrc2[currentVertexElement++] = GATHERPS_16(gatherSrc16, pStreamBaseGFX, shiftedOffsets16, vGatherMask16, 2, GFX_MEM_CLIENT_FETCH);
}
else
{
currentVertexElement = 0;
}
}
-
- // offset base to the next component in the vertex to gather
- pStreamBase = GEP(pStreamBase, C((char)4));
#else
if (isComponentEnabled(compMask, i))
{
// But, we know that elements must be aligned for FETCH. :)
// Right shift the offset by a bit and then scale by 2 to remove the sign extension.
Value *vShiftedOffsets = LSHR(vOffsets, 1);
- vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBase, vShiftedOffsets, vGatherMask, 2);
+ vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBaseGFX, vShiftedOffsets, vGatherMask, 2, GFX_MEM_CLIENT_FETCH);
}
else
{
currentVertexElement = 0;
}
}
+#endif
// offset base to the next component in the vertex to gather
pStreamBase = GEP(pStreamBase, C((char)4));
-#endif
+ pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4));
}
}
break;
// if valid, load the index. if not, load 0 from the stack
Value* pValid = SELECT(mask, pIndex, pZeroIndex);
- Value *index = LOAD(pValid, "valid index");
+ Value *index = LOAD(pValid, "valid index", GFX_MEM_CLIENT_FETCH);
// zero extended index to 32 bits and insert into the correct simd lane
index = Z_EXT(index, mInt32Ty);
// vIndexMask -1-1-1-1 0 0 0 0 : offsets < max pass
// vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0
Value* vMaxIndex = VBROADCAST(numIndicesLeft);
- Value* vIndexMask = VPCMPGTD(vMaxIndex,vIndexOffsets);
-
- // VMASKLOAD takes an *i8 src pointer
- pIndices = BITCAST(pIndices,PointerType::get(mInt8Ty,0));
+ Value* vIndexMask = ICMP_SGT(vMaxIndex, vIndexOffsets);
// Load the indices; OOB loads 0
- return MASKLOADD(pIndices,vIndexMask);
+ pIndices = BITCAST(pIndices, PointerType::get(mSimdInt32Ty, 0));
+ return MASKED_LOAD(pIndices, 4, vIndexMask, VIMMED1(0));
}
//////////////////////////////////////////////////////////////////////////
Value *pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty);
return VBROADCAST_16(pId);
}
+
+
case StoreSrc:
- default:
+ default:
SWR_INVALID("Invalid component control");
return VUNDEF_I_16();
}
{
switch (ctrl)
{
- case NoStore:
- return VUNDEF_I();
- case Store0:
- return VIMMED1(0);
- case Store1Fp:
- return VIMMED1(1.0f);
- case Store1Int:
- return VIMMED1(1);
- case StoreVertexId:
+ case NoStore:
+ return VUNDEF_I();
+ case Store0:
+ return VIMMED1(0);
+ case Store1Fp:
+ return VIMMED1(1.0f);
+ case Store1Int:
+ return VIMMED1(1);
+ case StoreVertexId:
{
#if USE_SIMD16_SHADERS
Value *pId;
#endif
return pId;
}
- case StoreInstanceId:
+ case StoreInstanceId:
{
Value *pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty);
return VBROADCAST(pId);
}
- case StoreSrc:
- default:
- SWR_INVALID("Invalid component control");
- return VUNDEF_I();
+
+
+ case StoreSrc:
+ default:
+ SWR_INVALID("Invalid component control");
+ return VUNDEF_I();
}
}
}
}
+// Don't let two threads compile the same fetch shader simultaneously;
+// doing so causes problems in the JIT cache implementation.
+// This is currently only a problem for fetch shaders.
+static std::mutex gFetchCodegenMutex;
//////////////////////////////////////////////////////////////////////////
/// @brief JITs from fetch shader IR
JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
PFN_FETCH_FUNC pfnFetch;
+ gFetchCodegenMutex.lock();
pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
// MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot add new IR to the module
pJitMgr->mIsModuleFinalized = true;
#endif
pJitMgr->DumpAsm(const_cast<llvm::Function*>(func), "final");
+ gFetchCodegenMutex.unlock();
+
+
return pfnFetch;
}