Builder::Builder(JitManager *pJitMgr)
: mpJitMgr(pJitMgr)
{
+ SWR_ASSERT(pJitMgr->mVWidth == 8);
+
mVWidth = pJitMgr->mVWidth;
-#if USE_SIMD16_BUILDER
- mVWidth2 = pJitMgr->mVWidth * 2;
-#endif
+ mVWidth16 = pJitMgr->mVWidth * 2;
mpIRBuilder = &pJitMgr->mBuilder;
- mVoidTy = Type::getVoidTy(pJitMgr->mContext);
- mFP16Ty = Type::getHalfTy(pJitMgr->mContext);
- mFP32Ty = Type::getFloatTy(pJitMgr->mContext);
- mFP32PtrTy = PointerType::get(mFP32Ty, 0);
- mDoubleTy = Type::getDoubleTy(pJitMgr->mContext);
- mInt1Ty = Type::getInt1Ty(pJitMgr->mContext);
- mInt8Ty = Type::getInt8Ty(pJitMgr->mContext);
- mInt16Ty = Type::getInt16Ty(pJitMgr->mContext);
- mInt32Ty = Type::getInt32Ty(pJitMgr->mContext);
- mInt8PtrTy = PointerType::get(mInt8Ty, 0);
+ // Built-in types: scalar
+
+ mVoidTy = Type::getVoidTy(pJitMgr->mContext);
+ mFP16Ty = Type::getHalfTy(pJitMgr->mContext);
+ mFP32Ty = Type::getFloatTy(pJitMgr->mContext);
+ mFP32PtrTy = PointerType::get(mFP32Ty, 0);
+ mDoubleTy = Type::getDoubleTy(pJitMgr->mContext);
+ mInt1Ty = Type::getInt1Ty(pJitMgr->mContext);
+ mInt8Ty = Type::getInt8Ty(pJitMgr->mContext);
+ mInt16Ty = Type::getInt16Ty(pJitMgr->mContext);
+ mInt32Ty = Type::getInt32Ty(pJitMgr->mContext);
+ mInt8PtrTy = PointerType::get(mInt8Ty, 0);
mInt16PtrTy = PointerType::get(mInt16Ty, 0);
mInt32PtrTy = PointerType::get(mInt32Ty, 0);
- mInt64Ty = Type::getInt64Ty(pJitMgr->mContext);
- mSimdInt1Ty = VectorType::get(mInt1Ty, mVWidth);
- mSimdInt16Ty = VectorType::get(mInt16Ty, mVWidth);
- mSimdInt32Ty = VectorType::get(mInt32Ty, mVWidth);
- mSimdInt64Ty = VectorType::get(mInt64Ty, mVWidth);
- mSimdFP16Ty = VectorType::get(mFP16Ty, mVWidth);
- mSimdFP32Ty = VectorType::get(mFP32Ty, mVWidth);
- mSimdVectorTy = ArrayType::get(mSimdFP32Ty, 4);
+ mInt64Ty = Type::getInt64Ty(pJitMgr->mContext);
+
+ // Built-in types: simd8
+
+ mSimdInt1Ty = VectorType::get(mInt1Ty, mVWidth);
+ mSimdInt16Ty = VectorType::get(mInt16Ty, mVWidth);
+ mSimdInt32Ty = VectorType::get(mInt32Ty, mVWidth);
+ mSimdInt64Ty = VectorType::get(mInt64Ty, mVWidth);
+ mSimdFP16Ty = VectorType::get(mFP16Ty, mVWidth);
+ mSimdFP32Ty = VectorType::get(mFP32Ty, mVWidth);
+ mSimdVectorTy = ArrayType::get(mSimdFP32Ty, 4);
mSimdVectorTRTy = ArrayType::get(mSimdFP32Ty, 5);
-#if USE_SIMD16_BUILDER
- mSimd2Int1Ty = VectorType::get(mInt1Ty, mVWidth2);
- mSimd2Int16Ty = VectorType::get(mInt16Ty, mVWidth2);
- mSimd2Int32Ty = VectorType::get(mInt32Ty, mVWidth2);
- mSimd2Int64Ty = VectorType::get(mInt64Ty, mVWidth2);
- mSimd2FP16Ty = VectorType::get(mFP16Ty, mVWidth2);
- mSimd2FP32Ty = VectorType::get(mFP32Ty, mVWidth2);
- mSimd2VectorTy = ArrayType::get(mSimd2FP32Ty, 4);
- mSimd2VectorTRTy = ArrayType::get(mSimd2FP32Ty, 5);
-#endif
+
+ // Built-in types: simd16
+
+ mSimd16Int1Ty = VectorType::get(mInt1Ty, mVWidth16);
+ mSimd16Int16Ty = VectorType::get(mInt16Ty, mVWidth16);
+ mSimd16Int32Ty = VectorType::get(mInt32Ty, mVWidth16);
+ mSimd16Int64Ty = VectorType::get(mInt64Ty, mVWidth16);
+ mSimd16FP16Ty = VectorType::get(mFP16Ty, mVWidth16);
+ mSimd16FP32Ty = VectorType::get(mFP32Ty, mVWidth16);
+ mSimd16VectorTy = ArrayType::get(mSimd16FP32Ty, 4);
+ mSimd16VectorTRTy = ArrayType::get(mSimd16FP32Ty, 5);
if (sizeof(uint32_t*) == 4)
{
mIntPtrTy = mInt32Ty;
mSimdIntPtrTy = mSimdInt32Ty;
-#if USE_SIMD16_BUILDER
- mSimd2IntPtrTy = mSimd2Int32Ty;
-#endif
+ mSimd16IntPtrTy = mSimd16Int32Ty;
}
else
{
mIntPtrTy = mInt64Ty;
mSimdIntPtrTy = mSimdInt64Ty;
-#if USE_SIMD16_BUILDER
- mSimd2IntPtrTy = mSimd2Int64Ty;
-#endif
+ mSimd16IntPtrTy = mSimd16Int64Ty;
}
}
}
return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
}
- Value *Builder::VIMMED1(uint32_t i)
+ Value *Builder::VIMMED1_16(int i)
{
- return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
+ return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i)));
}
- Value *Builder::VIMMED1(float i)
+ Value *Builder::VIMMED1(uint32_t i)
{
- return ConstantVector::getSplat(mVWidth, cast<ConstantFP>(C(i)));
+ return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
}
- Value *Builder::VIMMED1(bool i)
+ Value *Builder::VIMMED1_16(uint32_t i)
{
- return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
+ return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i)));
}
-#if USE_SIMD16_BUILDER
- Value *Builder::VIMMED2_1(int i)
+ Value *Builder::VIMMED1(float i)
{
- return ConstantVector::getSplat(mVWidth2, cast<ConstantInt>(C(i)));
+ return ConstantVector::getSplat(mVWidth, cast<ConstantFP>(C(i)));
}
- Value *Builder::VIMMED2_1(uint32_t i)
+ Value *Builder::VIMMED1_16(float i)
{
- return ConstantVector::getSplat(mVWidth2, cast<ConstantInt>(C(i)));
+ return ConstantVector::getSplat(mVWidth16, cast<ConstantFP>(C(i)));
}
- Value *Builder::VIMMED2_1(float i)
+ Value *Builder::VIMMED1(bool i)
{
- return ConstantVector::getSplat(mVWidth2, cast<ConstantFP>(C(i)));
+ return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
}
- Value *Builder::VIMMED2_1(bool i)
+ Value *Builder::VIMMED1_16(bool i)
{
- return ConstantVector::getSplat(mVWidth2, cast<ConstantInt>(C(i)));
+ return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i)));
}
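+// Editorial note: illustrative usage only, not part of the patch. Assuming a
+// Builder instance `b` on an 8-wide target (mVWidth == 8, mVWidth16 == 16):
+//
+//   Value *splat8  = b.VIMMED1(1.0f);    // <8 x float>,  every lane 1.0
+//   Value *splat16 = b.VIMMED1_16(1.0f); // <16 x float>, every lane 1.0
+//   Value *zeros16 = b.VIMMED1_16(0);    // <16 x i32>,   every lane 0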
-#endif
Value *Builder::VUNDEF_IPTR()
{
return UndefValue::get(VectorType::get(mInt32PtrTy,mVWidth));
}
+ Value *Builder::VUNDEF(Type* t)
+ {
+ return UndefValue::get(VectorType::get(t, mVWidth));
+ }
+
Value *Builder::VUNDEF_I()
{
return UndefValue::get(VectorType::get(mInt32Ty, mVWidth));
}
- Value *Builder::VUNDEF(Type *ty, uint32_t size)
+ Value *Builder::VUNDEF_I_16()
{
- return UndefValue::get(VectorType::get(ty, size));
+ return UndefValue::get(VectorType::get(mInt32Ty, mVWidth16));
}
Value *Builder::VUNDEF_F()
return UndefValue::get(VectorType::get(mFP32Ty, mVWidth));
}
-#if USE_SIMD16_BUILDER
- Value *Builder::VUNDEF2_F()
- {
- return UndefValue::get(VectorType::get(mFP32Ty, mVWidth2));
- }
-
- Value *Builder::VUNDEF2_I()
+ Value *Builder::VUNDEF_F_16()
{
- return UndefValue::get(VectorType::get(mInt32Ty, mVWidth2));
+ return UndefValue::get(VectorType::get(mFP32Ty, mVWidth16));
}
-#endif
- Value *Builder::VUNDEF(Type* t)
+ Value *Builder::VUNDEF(Type *ty, uint32_t size)
{
- return UndefValue::get(VectorType::get(t, mVWidth));
+ return UndefValue::get(VectorType::get(ty, size));
}
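+// The VUNDEF* helpers return LLVM undef vectors: VUNDEF_I/VUNDEF_F use the
+// native simd8 width, the new *_16 variants use mVWidth16, and the
+// (Type *ty, uint32_t size) overload covers any element type and lane count.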
Value *Builder::VBROADCAST(Value *src)
return VECTOR_SPLAT(mVWidth, src);
}
-#if USE_SIMD16_BUILDER
- Value *Builder::VBROADCAST2(Value *src)
+ Value *Builder::VBROADCAST_16(Value *src)
{
// check if src is already a vector
if (src->getType()->isVectorTy())
return src;
}
- return VECTOR_SPLAT(mVWidth2, src);
+ return VECTOR_SPLAT(mVWidth16, src);
}
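+// As with VBROADCAST, VBROADCAST_16 is a no-op when src is already a vector;
+// a scalar src is splatted across all mVWidth16 lanes instead.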
-#endif
uint32_t Builder::IMMED(Value* v)
{
SWR_ASSERT(isa<ConstantInt>(v));
Value *val = LOAD(validAddress);
vGather = VINSERT(vGather,val,C(i));
}
+
STACKRESTORE(pStack);
}
return vGather;
}
-#if USE_SIMD16_BUILDER
Value *Builder::GATHERPS_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale)
{
- Value *vGather = VUNDEF2_F();
+ Value *vGather = VUNDEF_F_16();
- // use avx512 gather instruction if available
+ // use AVX512F gather instruction if available
if (JM()->mArch.AVX512F())
{
// force mask to <N-bit Integer>, required by vgather2
}
else
{
- Value *src0 = EXTRACT2(vSrc, 0);
- Value *src1 = EXTRACT2(vSrc, 1);
+ Value *src0 = EXTRACT_16(vSrc, 0);
+ Value *src1 = EXTRACT_16(vSrc, 1);
- Value *indices0 = EXTRACT2(vIndices, 0);
- Value *indices1 = EXTRACT2(vIndices, 1);
+ Value *indices0 = EXTRACT_16(vIndices, 0);
+ Value *indices1 = EXTRACT_16(vIndices, 1);
- Value *mask0 = EXTRACT2(vMask, 0);
- Value *mask1 = EXTRACT2(vMask, 1);
+ Value *mask0 = EXTRACT_16(vMask, 0);
+ Value *mask1 = EXTRACT_16(vMask, 1);
Value *gather0 = GATHERPS(src0, pBase, indices0, mask0, scale);
Value *gather1 = GATHERPS(src1, pBase, indices1, mask1, scale);
- vGather = JOIN2(gather0, gather1);
+ vGather = JOIN_16(gather0, gather1);
}
return vGather;
}
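+// Without AVX512F the simd16 gather above is emulated: the source, index, and
+// mask operands are split into their low and high simd8 halves with
+// EXTRACT_16, two simd8 GATHERPS operations are issued, and the results are
+// stitched back together with JOIN_16. A rough sketch of that fallback
+// (illustrative only; lo/hi are hypothetical names):
+//
+//   Value *lo = GATHERPS(EXTRACT_16(vSrc, 0), pBase, EXTRACT_16(vIndices, 0), EXTRACT_16(vMask, 0), scale);
+//   Value *hi = GATHERPS(EXTRACT_16(vSrc, 1), pBase, EXTRACT_16(vIndices, 1), EXTRACT_16(vMask, 1), scale);
+//   vGather   = JOIN_16(lo, hi);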
-#endif
//////////////////////////////////////////////////////////////////////////
/// @brief Generate a masked gather operation in LLVM IR. If not
/// supported on the underlying platform, emulate it with loads
STACKRESTORE(pStack);
}
+
return vGather;
}
-#if USE_SIMD16_BUILDER
Value *Builder::GATHERDD_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale)
{
- Value *vGather = VUNDEF2_F();
+ Value *vGather = VUNDEF_I_16();
- // use avx512 gather instruction if available
+ // use AVX512F gather instruction if available
if (JM()->mArch.AVX512F())
{
// force mask to <N-bit Integer>, required by vgather2
}
else
{
- Value *src0 = EXTRACT2(vSrc, 0);
- Value *src1 = EXTRACT2(vSrc, 1);
+ Value *src0 = EXTRACT_16(vSrc, 0);
+ Value *src1 = EXTRACT_16(vSrc, 1);
- Value *indices0 = EXTRACT2(vIndices, 0);
- Value *indices1 = EXTRACT2(vIndices, 1);
+ Value *indices0 = EXTRACT_16(vIndices, 0);
+ Value *indices1 = EXTRACT_16(vIndices, 1);
- Value *mask0 = EXTRACT2(vMask, 0);
- Value *mask1 = EXTRACT2(vMask, 1);
+ Value *mask0 = EXTRACT_16(vMask, 0);
+ Value *mask1 = EXTRACT_16(vMask, 1);
Value *gather0 = GATHERDD(src0, pBase, indices0, mask0, scale);
Value *gather1 = GATHERDD(src1, pBase, indices1, mask1, scale);
- vGather = JOIN2(gather0, gather1);
+ vGather = JOIN_16(gather0, gather1);
}
return vGather;
}
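+// GATHERDD_16 mirrors GATHERPS_16 but gathers 32-bit integers: AVX512F targets
+// issue a single 16-wide gather, everything else falls back to two simd8
+// GATHERDD calls whose results are recombined with JOIN_16.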
-#endif
//////////////////////////////////////////////////////////////////////////
/// @brief Generate a masked gather operation in LLVM IR. If not
/// supported on the underlying platform, emulate it with loads
return vGather;
}
-#if USE_SIMD16_BUILDER
- Value *Builder::EXTRACT2(Value *x, uint32_t imm)
+ Value *Builder::EXTRACT_16(Value *x, uint32_t imm)
{
if (imm == 0)
- return VSHUFFLE(x, UndefValue::get(x->getType()), {0, 1, 2, 3, 4, 5, 6, 7});
+ {
+ return VSHUFFLE(x, UndefValue::get(x->getType()), { 0, 1, 2, 3, 4, 5, 6, 7 });
+ }
else
- return VSHUFFLE(x, UndefValue::get(x->getType()), {8, 9, 10, 11, 12, 13, 14, 15});
+ {
+ return VSHUFFLE(x, UndefValue::get(x->getType()), { 8, 9, 10, 11, 12, 13, 14, 15 });
+ }
}
- Value *Builder::JOIN2(Value *a, Value *b)
+ Value *Builder::JOIN_16(Value *a, Value *b)
{
- return VSHUFFLE(a, b,
- {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
+ return VSHUFFLE(a, b, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 });
}
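+// EXTRACT_16(x, 0) keeps lanes 0..7 of a simd16 value and EXTRACT_16(x, 1)
+// keeps lanes 8..15, each as a simd8 value; JOIN_16(a, b) is the inverse,
+// concatenating a into lanes 0..7 and b into lanes 8..15.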
-#endif
//////////////////////////////////////////////////////////////////////////
/// @brief convert x86 <N x float> mask to llvm <N x i1> mask
return ICMP_SLT(src, VIMMED1(0));
}
-#if USE_SIMD16_BUILDER
- Value *Builder::MASK2(Value *vmask)
+ Value *Builder::MASK_16(Value *vmask)
{
- Value *src = BITCAST(vmask, mSimd2Int32Ty);
- return ICMP_SLT(src, VIMMED2_1(0));
+ Value *src = BITCAST(vmask, mSimd16Int32Ty);
+ return ICMP_SLT(src, VIMMED1_16(0));
}
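+// MASK_16 converts an x86-style simd16 mask (sign bit set in each active lane)
+// to an LLVM <16 x i1> mask by bitcasting to <16 x i32> and testing each lane
+// for a negative value.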
-#endif
//////////////////////////////////////////////////////////////////////////
/// @brief convert llvm <N x i1> mask to x86 <N x i32> mask
Value *Builder::VMASK(Value *mask)
return S_EXT(mask, mSimdInt32Ty);
}
-#if USE_SIMD16_BUILDER
- Value *Builder::VMASK2(Value *mask)
+ Value *Builder::VMASK_16(Value *mask)
{
- return S_EXT(mask, mSimd2Int32Ty);
+ return S_EXT(mask, mSimd16Int32Ty);
}
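+// VMASK_16 is the reverse conversion: sign-extending the <16 x i1> mask to
+// <16 x i32> produces 0 in inactive lanes and 0xFFFFFFFF in active ones.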
-#endif
//////////////////////////////////////////////////////////////////////////
/// @brief Generate a VPSHUFB operation in LLVM IR. If not
/// supported on the underlying platform, emulate it
CONVERT_SFIXED,
};
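+// Note: with the SIMD16 fetch shader enabled, the SIMD16 gather and SIMD16
+// builder sub-paths below still default to 0 (disabled) while the widened
+// code paths are brought up.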
+#if USE_SIMD16_SHADERS
+#define USE_SIMD16_GATHERS 0
+#define USE_SIMD16_BUILDER 0
+#endif
+
//////////////////////////////////////////////////////////////////////////
/// Interface to Jitting a fetch shader
//////////////////////////////////////////////////////////////////////////
void JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
#if USE_SIMD16_SHADERS
-#define USE_SIMD16_GATHERS 0
#if USE_SIMD16_GATHERS
void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value *streams, Value *vIndices, Value *vIndices2, Value *pVtxOut, bool useVertexID2);
pVtxOut = GEP(pVtxOut, C(0));
#if USE_SIMD16_SHADERS
#if 0 // USE_SIMD16_BUILDER
- pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
+ pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
#else
pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0));
#endif
Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
#if USE_SIMD16_GATHERS
#if USE_SIMD16_BUILDER
- Value* vBaseVertex16 = VBROADCAST2(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex }));
+ Value* vBaseVertex16 = VBROADCAST_16(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex }));
#else
Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex }));
#endif
Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
#if USE_SIMD16_GATHERS
#if USE_SIMD16_BUILDER
- Value *vStride16 = VBROADCAST2(stride);
+ Value *vStride16 = VBROADCAST_16(stride);
#else
Value *vStride = VBROADCAST(stride);
#endif
Value *startOffset;
#if USE_SIMD16_GATHERS
#if USE_SIMD16_BUILDER
- Value *vInstanceStride16 = VIMMED2_1(0);
+ Value *vInstanceStride16 = VIMMED1_16(0);
#else
Value *vInstanceStride = VIMMED1(0);
#endif
#if USE_SIMD16_GATHERS
#if USE_SIMD16_BUILDER
- vCurIndices16 = VBROADCAST2(calcInstance);
+ vCurIndices16 = VBROADCAST_16(calcInstance);
#else
vCurIndices = VBROADCAST(calcInstance);
vCurIndices2 = VBROADCAST(calcInstance);
Value* stepRate = C(ied.InstanceAdvancementState);
#if USE_SIMD16_GATHERS
#if USE_SIMD16_BUILDER
- vInstanceStride16 = VBROADCAST2(MUL(curInstance, stepRate));
+ vInstanceStride16 = VBROADCAST_16(MUL(curInstance, stepRate));
#else
vInstanceStride = VBROADCAST(MUL(curInstance, stepRate));
#endif
// offset indices by baseVertex
#if USE_SIMD16_GATHERS
#if USE_SIMD16_BUILDER
- Value *vIndices16 = JOIN2(vIndices, vIndices2);
+ Value *vIndices16 = JOIN_16(vIndices, vIndices2);
vCurIndices16 = ADD(vIndices16, vBaseVertex16);
#else
// offset indices by baseVertex
#if USE_SIMD16_GATHERS
#if USE_SIMD16_BUILDER
- Value *vIndices16 = JOIN2(vIndices, vIndices2);
+ Value *vIndices16 = JOIN_16(vIndices, vIndices2);
vCurIndices16 = ADD(vIndices16, vBaseVertex16);
#else
partialInboundsSize = LOAD(partialInboundsSize);
#if USE_SIMD16_GATHERS
#if USE_SIMD16_BUILDER
- Value *vPartialVertexSize = VBROADCAST2(partialInboundsSize);
- Value *vBpp = VBROADCAST2(C(info.Bpp));
- Value *vAlignmentOffsets = VBROADCAST2(C(ied.AlignedByteOffset));
+ Value *vPartialVertexSize = VBROADCAST_16(partialInboundsSize);
+ Value *vBpp = VBROADCAST_16(C(info.Bpp));
+ Value *vAlignmentOffsets = VBROADCAST_16(C(ied.AlignedByteOffset));
#else
Value *vPartialVertexSize = VBROADCAST(partialInboundsSize);
Value *vBpp = VBROADCAST(C(info.Bpp));
#if USE_SIMD16_GATHERS
#if USE_SIMD16_BUILDER
// override cur indices with 0 if pitch is 0
- Value *pZeroPitchMask16 = ICMP_EQ(vStride16, VIMMED2_1(0));
- vCurIndices16 = SELECT(pZeroPitchMask16, VIMMED2_1(0), vCurIndices16);
+ Value *pZeroPitchMask16 = ICMP_EQ(vStride16, VIMMED1_16(0));
+ vCurIndices16 = SELECT(pZeroPitchMask16, VIMMED1_16(0), vCurIndices16);
// are vertices partially OOB?
- Value *vMaxVertex16 = VBROADCAST2(maxVertex);
+ Value *vMaxVertex16 = VBROADCAST_16(maxVertex);
Value *vPartialOOBMask = ICMP_EQ(vCurIndices16, vMaxVertex16);
// are vertices fully in bounds?
if (fetchState.bPartialVertexBuffer)
{
// are vertices below minVertex limit?
- Value *vMinVertex16 = VBROADCAST2(minVertex);
+ Value *vMinVertex16 = VBROADCAST_16(minVertex);
Value *vMinGatherMask16 = ICMP_UGE(vCurIndices16, vMinVertex16);
// only fetch lanes that pass both tests
// TODO: remove the following simd8 interop stuff once all code paths are fully widened to SIMD16..
- Value *vGatherMask = EXTRACT2(vGatherMask16, 0);
- Value *vGatherMask2 = EXTRACT2(vGatherMask16, 1);
+ Value *vGatherMask = EXTRACT_16(vGatherMask16, 0);
+ Value *vGatherMask2 = EXTRACT_16(vGatherMask16, 1);
- Value *vOffsets = EXTRACT2(vOffsets16, 0);
- Value *vOffsets2 = EXTRACT2(vOffsets16, 1);
+ Value *vOffsets = EXTRACT_16(vOffsets16, 0);
+ Value *vOffsets2 = EXTRACT_16(vOffsets16, 1);
#else
// override cur indices with 0 if pitch is 0
Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
{
#if USE_SIMD16_BUILDER
// pack adjacent pairs of SIMD8s into SIMD16s
- pVtxSrc2[currentVertexElement] = JOIN2(pResults[c], pResults2[c]);
+ pVtxSrc2[currentVertexElement] = JOIN_16(pResults[c], pResults2[c]);
#else
vVertexElements[currentVertexElement] = pResults[c];
{
#if USE_SIMD16_BUILDER
// store SIMD16s
- Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
+ Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2);
#if USE_SIMD16_GATHERS
Value *gatherSrc2 = VIMMED1(0.0f);
#if USE_SIMD16_BUILDER
- Value *gatherSrc16 = VIMMED2_1(0.0f);
+ Value *gatherSrc16 = VIMMED1_16(0.0f);
#endif
#endif
}
else
{
- gatherResult[0] = VUNDEF2_I();
+ gatherResult[0] = VUNDEF_I_16();
}
// if we have at least one component out of z or w to fetch
}
else
{
- gatherResult[1] = VUNDEF2_I();
+ gatherResult[1] = VUNDEF_I_16();
}
#else
if (compMask)
{
#if USE_SIMD16_BUILDER
- Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
+ Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
Shuffle16bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, Instruction::CastOps::FPExt, CONVERT_NONE,
currentVertexElement, outputElt, compMask, compCtrl, pVtxSrc2);
#if USE_SIMD16_BUILDER
// pack adjacent pairs of SIMD8s into SIMD16s
- pVtxSrc2[currentVertexElement] = JOIN2(vVertexElements[currentVertexElement],
+ pVtxSrc2[currentVertexElement] = JOIN_16(vVertexElements[currentVertexElement],
vVertexElements2[currentVertexElement]);
#endif
#if USE_SIMD16_BUILDER
// pack adjacent pairs of SIMD8s into SIMD16s
- pVtxSrc2[currentVertexElement] = JOIN2(vVertexElements[currentVertexElement],
+ pVtxSrc2[currentVertexElement] = JOIN_16(vVertexElements[currentVertexElement],
vVertexElements2[currentVertexElement]);
#endif
{
#if USE_SIMD16_BUILDER
// store SIMD16s
- Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
+ Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2);
#if USE_SIMD16_BUILDER
// pack adjacent pairs of SIMD8s into SIMD16s
- pVtxSrc2[currentVertexElement] = JOIN2(pGather, pGather2);
+ pVtxSrc2[currentVertexElement] = JOIN_16(pGather, pGather2);
#else
vVertexElements[currentVertexElement] = pGather;
{
#if USE_SIMD16_BUILDER
// store SIMD16s
- Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
+ Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2);
#if USE_SIMD16_GATHERS
Value* gatherSrc2 = VIMMED1(0);
#if USE_SIMD16_BUILDER
- Value *gatherSrc16 = VIMMED2_1(0);
+ Value *gatherSrc16 = VIMMED1_16(0);
#endif
#endif
#endif
#if USE_SIMD16_BUILDER
- Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
+ Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
Shuffle8bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, extendCastType, conversionType,
currentVertexElement, outputElt, compMask, compCtrl, pVtxSrc2, info.swizzle);
}
else
{
- gatherResult[0] = VUNDEF2_I();
+ gatherResult[0] = VUNDEF_I_16();
}
// if we have at least one component out of z or w to fetch
}
else
{
- gatherResult[1] = VUNDEF2_I();
+ gatherResult[1] = VUNDEF_I_16();
}
#else
if (compMask)
{
#if USE_SIMD16_BUILDER
- Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
+ Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
Shuffle16bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, extendCastType, conversionType,
currentVertexElement, outputElt, compMask, compCtrl, pVtxSrc2);
if (conversionType == CONVERT_USCALED)
{
- pGather = UI_TO_FP(pGather, mSimd2FP32Ty);
+ pGather = UI_TO_FP(pGather, mSimd16FP32Ty);
}
else if (conversionType == CONVERT_SSCALED)
{
- pGather = SI_TO_FP(pGather, mSimd2FP32Ty);
+ pGather = SI_TO_FP(pGather, mSimd16FP32Ty);
}
else if (conversionType == CONVERT_SFIXED)
{
- pGather = FMUL(SI_TO_FP(pGather, mSimd2FP32Ty), VBROADCAST2(C(1 / 65536.0f)));
+ pGather = FMUL(SI_TO_FP(pGather, mSimd16FP32Ty), VBROADCAST_16(C(1 / 65536.0f)));
}
#else
#if USE_SIMD16_GATHERS
#if USE_SIMD16_BUILDER
// store SIMD16s
- Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
+ Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2);
#if USE_SIMD16_GATHERS
#if USE_SIMD16_BUILDER
// store SIMD16s
- Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
+ Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0));
StoreVertexElements2(pVtxOut2, outputElt, currentVertexElement, pVtxSrc2);
// SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now..
- Value *vGatherResult_lo = EXTRACT2(vGatherResult, 0);
- Value *vGatherResult_hi = EXTRACT2(vGatherResult, 1);
+ Value *vGatherResult_lo = EXTRACT_16(vGatherResult, 0);
+ Value *vGatherResult_hi = EXTRACT_16(vGatherResult, 1);
Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
temp_hi = FMUL(CAST(fpCast, temp_hi, mSimdFP32Ty), conversionFactor);
}
- vVertexElements[currentVertexElement] = JOIN2(temp_lo, temp_hi);
+ vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
currentVertexElement += 1;
}
break;
}
- Value *vGatherResult_lo = EXTRACT2(vGatherResult, 0);
- Value *vGatherResult_hi = EXTRACT2(vGatherResult, 1);
+ Value *vGatherResult_lo = EXTRACT_16(vGatherResult, 0);
+ Value *vGatherResult_hi = EXTRACT_16(vGatherResult, 1);
Value *temp_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
Value *temp_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
temp_hi = FMUL(CAST(fpCast, temp_hi, mSimdFP32Ty), conversionFactor);
}
- vVertexElements[currentVertexElement] = JOIN2(temp_lo, temp_hi);
+ vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
currentVertexElement += 1;
}
{
// SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now..
- Value *vGatherResult_lo = EXTRACT2(vGatherResult[0], 0);
- Value *vGatherResult_hi = EXTRACT2(vGatherResult[0], 1);
+ Value *vGatherResult_lo = EXTRACT_16(vGatherResult[0], 0);
+ Value *vGatherResult_hi = EXTRACT_16(vGatherResult[0], 1);
Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
// 256i - 0 1 2 3 4 5 6 7
// xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
#if 0
- vi128XY = JOIN2(vi128XY_lo, vi128XY_hi);
+ vi128XY = JOIN_16(vi128XY_lo, vi128XY_hi);
#endif
}
Value *vi128ZW_hi = nullptr;
if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
{
- Value *vGatherResult_lo = EXTRACT2(vGatherResult[1], 0);
- Value *vGatherResult_hi = EXTRACT2(vGatherResult[1], 1);
+ Value *vGatherResult_lo = EXTRACT_16(vGatherResult[1], 0);
+ Value *vGatherResult_hi = EXTRACT_16(vGatherResult[1], 1);
Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
vi128ZW_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
vi128ZW_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
#if 0
- vi128ZW = JOIN2(vi128ZW_lo, vi128ZW_hi);
+ vi128ZW = JOIN_16(vi128ZW_lo, vi128ZW_hi);
#endif
}
Value *temp_lo = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
Value *temp_hi = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
- vVertexElements[currentVertexElement] = JOIN2(temp_lo, temp_hi);
+ vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
}
else
{
temp_hi = FMUL(CAST(IntToFpCast, temp_hi, mSimdFP32Ty), conversionFactor);
}
- vVertexElements[currentVertexElement] = JOIN2(temp_lo, temp_hi);
+ vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
}
currentVertexElement += 1;
// SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now..
- Value *vGatherResult_lo = EXTRACT2(vGatherResult[selectedGather], 0);
- Value *vGatherResult_hi = EXTRACT2(vGatherResult[selectedGather], 1);
+ Value *vGatherResult_lo = EXTRACT_16(vGatherResult[selectedGather], 0);
+ Value *vGatherResult_hi = EXTRACT_16(vGatherResult[selectedGather], 1);
Value *temp_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask[selectedMask]), vGatherTy);
Value *temp_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask[selectedMask]), vGatherTy);
temp_hi = FMUL(CAST(fpCast, temp_hi, mSimdFP32Ty), conversionFactor);
}
- vVertexElements[currentVertexElement] = JOIN2(temp_lo, temp_hi);
+ vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
currentVertexElement += 1;
}
#if FETCH_DUMP_VERTEX
PRINT("vVertexElements[%d]: 0x%x\n", { C(c), vVertexElements[c] });
#endif
- vVertexElements[c] = BITCAST(vVertexElements[c], mSimd2FP32Ty);
+ vVertexElements[c] = BITCAST(vVertexElements[c], mSimd16FP32Ty);
}
#if FETCH_DUMP_VERTEX
else
{
switch(ctrl)
{
- case NoStore: return VUNDEF_I();
- case Store0: return VIMMED1(0);
- case Store1Fp: return VIMMED1(1.0f);
- case Store1Int: return VIMMED1(1);
+ case NoStore:
+ return VUNDEF_I();
+ case Store0:
+ return VIMMED1(0);
+ case Store1Fp:
+ return VIMMED1(1.0f);
+ case Store1Int:
+ return VIMMED1(1);
case StoreVertexId:
{
#if USE_SIMD16_SHADERS
- Value* pId;
+ Value *pId;
if (useVertexID2)
{
pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 })), mSimdFP32Ty);
pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
}
#else
- Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
+ Value *pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
#endif
return pId;
}
case StoreInstanceId:
{
- Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty);
+ Value *pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty);
return VBROADCAST(pId);
}
case StoreSrc:
- default: SWR_INVALID("Invalid component control"); return VUNDEF_I();
+ default:
+ SWR_INVALID("Invalid component control");
+ return VUNDEF_I();
}
}
{
switch (ctrl)
{
- case NoStore: return VUNDEF2_I();
- case Store0: return VIMMED2_1(0);
- case Store1Fp: return VIMMED2_1(1.0f);
- case Store1Int: return VIMMED2_1(1);
+ case NoStore:
+ return VUNDEF_I_16();
+ case Store0:
+ return VIMMED1_16(0);
+ case Store1Fp:
+ return VIMMED1_16(1.0f);
+ case Store1Int:
+ return VIMMED1_16(1);
case StoreVertexId:
{
- Value* pId_lo = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
- Value* pId_hi = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 })), mSimdFP32Ty);
+ Value *pId_lo = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
+ Value *pId_hi = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 })), mSimdFP32Ty);
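+ // The fetch context carries vertex IDs as two simd8 registers (VertexID and
+ // VertexID2); each is reinterpreted as <8 x float> and the pair is joined
+ // into a single simd16 value below.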
- Value *pId = JOIN2(pId_lo, pId_hi);
+ Value *pId = JOIN_16(pId_lo, pId_hi);
return pId;
}
case StoreInstanceId:
{
- Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty);
- return VBROADCAST2(pId);
+ Value *pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty);
+ return VBROADCAST_16(pId);
}
case StoreSrc:
- default: SWR_INVALID("Invalid component control"); return VUNDEF2_I();
+ default:
+ SWR_INVALID("Invalid component control");
+ return VUNDEF_I_16();
}
}