#include "builder.h"
#include "common/rdtsc_buckets.h"
+#include <cstdarg>
namespace SwrJit
{
/// number of mantissa bits.
/// @param val - 32-bit float
/// @todo Maybe move this outside of this file into a header?
- static uint16_t Convert32To16Float(float val)
+ static uint16_t ConvertFloat32ToFloat16(float val)
{
uint32_t sign, exp, mant;
uint32_t roundBits;
/// float
/// @param val - 16-bit float
/// @todo Maybe move this outside of this file into a header?
- static float ConvertSmallFloatTo32(UINT val)
+ static float ConvertFloat16ToFloat32(uint32_t val)
{
- UINT result;
+ uint32_t result;
if ((val & 0x7fff) == 0)
{
result = ((uint32_t)(val & 0x8000)) << 16;
return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
}
+#if USE_SIMD16_BUILDER
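+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Splat a constant immediate across all mVWidth2 lanes of a
+ /// SIMD16 vector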
+ Value *Builder::VIMMED2_1(int i)
+ {
+ return ConstantVector::getSplat(mVWidth2, cast<ConstantInt>(C(i)));
+ }
+
+ Value *Builder::VIMMED2_1(uint32_t i)
+ {
+ return ConstantVector::getSplat(mVWidth2, cast<ConstantInt>(C(i)));
+ }
+
+ Value *Builder::VIMMED2_1(float i)
+ {
+ return ConstantVector::getSplat(mVWidth2, cast<ConstantFP>(C(i)));
+ }
+
+ Value *Builder::VIMMED2_1(bool i)
+ {
+ return ConstantVector::getSplat(mVWidth2, cast<ConstantInt>(C(i)));
+ }
+
+#endif
Value *Builder::VUNDEF_IPTR()
{
return UndefValue::get(VectorType::get(mInt32PtrTy,mVWidth));
return UndefValue::get(VectorType::get(mFP32Ty, mVWidth));
}
- Value *Builder::VUNDEF(Type* t)
+#if USE_SIMD16_BUILDER
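+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Create undefined SIMD16 float and integer vector values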
+ Value *Builder::VUNDEF2_F()
{
- return UndefValue::get(VectorType::get(t, mVWidth));
+ return UndefValue::get(VectorType::get(mFP32Ty, mVWidth2));
}
- #if HAVE_LLVM == 0x306
- Value *Builder::VINSERT(Value *vec, Value *val, uint64_t index)
+ Value *Builder::VUNDEF2_I()
{
- return VINSERT(vec, val, C((int64_t)index));
+ return UndefValue::get(VectorType::get(mInt32Ty, mVWidth2));
+ }
+
+#endif
+ Value *Builder::VUNDEF(Type* t)
+ {
+ return UndefValue::get(VectorType::get(t, mVWidth));
}
- #endif
Value *Builder::VBROADCAST(Value *src)
{
return VECTOR_SPLAT(mVWidth, src);
}
+#if USE_SIMD16_BUILDER
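+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Splat a scalar across all mVWidth2 lanes; a source that is
+ /// already a vector is assumed pre-splat and is returned unchanged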
+ Value *Builder::VBROADCAST2(Value *src)
+ {
+ // check if src is already a vector
+ if (src->getType()->isVectorTy())
+ {
+ return src;
+ }
+
+ return VECTOR_SPLAT(mVWidth2, src);
+ }
+
+#endif
uint32_t Builder::IMMED(Value* v)
{
SWR_ASSERT(isa<ConstantInt>(v));
return GEPA(ptr, indices);
}
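+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Inbounds GEP overloads taking initializer lists of Value* or
+ /// uint32_t indices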
+ Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
+ {
+ std::vector<Value*> indices;
+ for (auto i : indexList)
+ indices.push_back(i);
+ return IN_BOUNDS_GEP(ptr, indices);
+ }
+
+ Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
+ {
+ std::vector<Value*> indices;
+ for (auto i : indexList)
+ indices.push_back(C(i));
+ return IN_BOUNDS_GEP(ptr, indices);
+ }
+
LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name)
{
std::vector<Value*> valIndices;
return CALLA(Callee, args);
}
- #if HAVE_LLVM > 0x306
CallInst *Builder::CALL(Value *Callee, Value* arg)
{
std::vector<Value*> args;
args.push_back(arg3);
return CALLA(Callee, args);
}
- #endif
+
+ //////////////////////////////////////////////////////////////////////////
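+ /// @brief Emit an llvm.debugtrap intrinsic, which traps into an attached
+ /// debugger at runtime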
+ Value *Builder::DEBUGTRAP()
+ {
+ Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::debugtrap);
+ return CALL(func);
+ }
Value *Builder::VRCP(Value *va)
{
// get a pointer to the first character in the constant string array
std::vector<Constant*> geplist{C(0),C(0)};
- #if HAVE_LLVM == 0x306
- Constant *strGEP = ConstantExpr::getGetElementPtr(gvPtr,geplist,false);
- #else
Constant *strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr,geplist,false);
- #endif
// insert the pointer to the format string in the argument vector
printCallArgs[0] = strGEP;
/// @param vIndices - SIMD wide value of VB byte offsets
/// @param vMask - SIMD wide mask that controls whether to access memory or the src values
/// @param scale - value to scale indices by
- Value *Builder::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
+ Value *Builder::GATHERPS(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale)
{
- Value* vGather;
+ Value *vGather;
// use avx2 gather instruction if available
if(JM()->mArch.AVX2())
{
// force mask to <N x float>, required by vgather
- vMask = BITCAST(vMask, mSimdFP32Ty);
- vGather = VGATHERPS(vSrc,pBase,vIndices,vMask,scale);
+ Value *mask = BITCAST(VMASK(vMask), mSimdFP32Ty);
+
+ vGather = VGATHERPS(vSrc, pBase, vIndices, mask, C(scale));
}
else
{
STORE(vSrc, vSrcPtr);
vGather = VUNDEF_F();
- Value *vScaleVec = VBROADCAST(Z_EXT(scale,mInt32Ty));
+ Value *vScaleVec = VIMMED1((uint32_t)scale);
Value *vOffsets = MUL(vIndices,vScaleVec);
- Value *mask = MASK(vMask);
for(uint32_t i = 0; i < mVWidth; ++i)
{
// single component byte index
loadAddress = BITCAST(loadAddress,PointerType::get(mFP32Ty,0));
// pointer to the value to load if we're masking off a component
Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)});
- Value *selMask = VEXTRACT(mask,C(i));
+ Value *selMask = VEXTRACT(vMask,C(i));
// switch in a safe address to load if we're trying to access a vertex
Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
Value *val = LOAD(validAddress);
return vGather;
}
+#if USE_SIMD16_BUILDER
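+ //////////////////////////////////////////////////////////////////////////
+ /// @brief SIMD16 masked float gather; uses the avx512 gather instruction
+ /// when available, otherwise splits into two SIMD8 GATHERPS operations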
+ Value *Builder::GATHERPS_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale)
+ {
+ Value *vGather = VUNDEF2_F();
+
+ // use avx512 gather instruction if available
+ if (JM()->mArch.AVX512F())
+ {
+ // force the i1 mask to a 16-bit integer, as required by the avx512 gather
+ Value *mask = BITCAST(vMask, mInt16Ty);
+
+ vGather = VGATHERPS_16(vSrc, pBase, vIndices, mask, C((uint32_t)scale));
+ }
+ else
+ {
+ Value *src0 = EXTRACT2_F(vSrc, 0);
+ Value *src1 = EXTRACT2_F(vSrc, 1);
+
+ Value *indices0 = EXTRACT2_I(vIndices, 0);
+ Value *indices1 = EXTRACT2_I(vIndices, 1);
+
+ Value *vmask16 = VMASK2(vMask);
+
+ Value *mask0 = MASK(EXTRACT2_I(vmask16, 0)); // TODO: do this better..
+ Value *mask1 = MASK(EXTRACT2_I(vmask16, 1));
+
+ Value *gather0 = GATHERPS(src0, pBase, indices0, mask0, scale);
+ Value *gather1 = GATHERPS(src1, pBase, indices1, mask1, scale);
+
+ vGather = JOIN2(gather0, gather1);
+ }
+
+ return vGather;
+ }
+
+#endif
//////////////////////////////////////////////////////////////////////////
/// @brief Generate a masked gather operation in LLVM IR. If not
/// supported on the underlying platform, emulate it with loads
/// @param vIndices - SIMD wide value of VB byte offsets
/// @param vMask - SIMD wide mask that controls whether to access memory or the src values
/// @param scale - value to scale indices by
- Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
+ Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
{
Value* vGather;
// use avx2 gather instruction if available
if(JM()->mArch.AVX2())
{
- vGather = VGATHERDD(vSrc, pBase, vIndices, vMask, scale);
+ vGather = VGATHERDD(vSrc, pBase, vIndices, VMASK(vMask), C(scale));
}
else
{
STORE(vSrc, vSrcPtr);
vGather = VUNDEF_I();
- Value *vScaleVec = VBROADCAST(Z_EXT(scale, mInt32Ty));
+ Value *vScaleVec = VIMMED1((uint32_t)scale);
Value *vOffsets = MUL(vIndices, vScaleVec);
- Value *mask = MASK(vMask);
for(uint32_t i = 0; i < mVWidth; ++i)
{
// single component byte index
loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 0));
// pointer to the value to load if we're masking off a component
Value *maskLoadAddress = GEP(vSrcPtr, {C(0), C(i)});
- Value *selMask = VEXTRACT(mask, C(i));
+ Value *selMask = VEXTRACT(vMask, C(i));
// switch in a safe address to load if we're trying to access a vertex
Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
Value *val = LOAD(validAddress, C(0));
return vGather;
}
+#if USE_SIMD16_BUILDER
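+ //////////////////////////////////////////////////////////////////////////
+ /// @brief SIMD16 masked DWORD gather; uses the avx512 gather instruction
+ /// when available, otherwise splits into two SIMD8 GATHERDD operations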
+ Value *Builder::GATHERDD_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale)
+ {
+ Value *vGather = VUNDEF2_I();
+
+ // use avx512 gather instruction if available
+ if (JM()->mArch.AVX512F())
+ {
+ // force the i1 mask to a 16-bit integer, as required by the avx512 gather
+ Value *mask = BITCAST(vMask, mInt16Ty);
+
+ vGather = VGATHERDD_16(vSrc, pBase, vIndices, mask, C((uint32_t)scale));
+ }
+ else
+ {
+ Value *src0 = EXTRACT2_I(vSrc, 0);
+ Value *src1 = EXTRACT2_I(vSrc, 1);
+
+ Value *indices0 = EXTRACT2_I(vIndices, 0);
+ Value *indices1 = EXTRACT2_I(vIndices, 1);
+
+ Value *vmask16 = VMASK2(vMask);
+
+ Value *mask0 = MASK(EXTRACT2_I(vmask16, 0)); // TODO: do this better..
+ Value *mask1 = MASK(EXTRACT2_I(vmask16, 1));
+
+ Value *gather0 = GATHERDD(src0, pBase, indices0, mask0, scale);
+ Value *gather1 = GATHERDD(src1, pBase, indices1, mask1, scale);
+
+ vGather = JOIN2(gather0, gather1);
+ }
+
+ return vGather;
+ }
+
+#endif
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Generate a masked gather operation in LLVM IR. If not
+ /// supported on the underlying platform, emulate it with loads
+ /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
+ /// @param pBase - Int8* base VB address pointer value
+ /// @param vIndices - SIMD wide value of VB byte offsets
+ /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
+ /// @param scale - value to scale indices by
+ Value *Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
+ {
+ Value* vGather;
+
+ // use avx2 gather instruction if available
+ if(JM()->mArch.AVX2())
+ {
+ vMask = BITCAST(S_EXT(vMask, VectorType::get(mInt64Ty, mVWidth/2)), VectorType::get(mDoubleTy, mVWidth/2));
+ vGather = VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale));
+ }
+ else
+ {
+ Value* pStack = STACKSAVE();
+
+ // store vSrc on the stack. this way we can select between a valid load address and the vSrc address
+ Value* vSrcPtr = ALLOCA(vSrc->getType());
+ STORE(vSrc, vSrcPtr);
+
+ vGather = UndefValue::get(VectorType::get(mDoubleTy, mVWidth/2));
+ Value *vScaleVec = VECTOR_SPLAT(mVWidth/2, C((uint32_t)scale));
+ Value *vOffsets = MUL(vIndices,vScaleVec);
+ for(uint32_t i = 0; i < mVWidth/2; ++i)
+ {
+ // single component byte index
+ Value *offset = VEXTRACT(vOffsets,C(i));
+ // byte pointer to component
+ Value *loadAddress = GEP(pBase,offset);
+ loadAddress = BITCAST(loadAddress,PointerType::get(mDoubleTy,0));
+ // pointer to the value to load if we're masking off a component
+ Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)});
+ Value *selMask = VEXTRACT(vMask,C(i));
+ // switch in a safe address to load if we're trying to access a vertex
+ Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress);
+ Value *val = LOAD(validAddress);
+ vGather = VINSERT(vGather,val,C(i));
+ }
+ STACKRESTORE(pStack);
+ }
+ return vGather;
+ }
+
+#if USE_SIMD16_BUILDER
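+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Logical shift right of packed 32-bit integers by an immediate
+ /// count, in SIMD8 and SIMD16 variants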
+ Value *Builder::PSRLI(Value *a, Value *imm)
+ {
+ return VPSRLI(a, imm);
+ }
+
+ Value *Builder::PSRLI_16(Value *a, Value *imm)
+ {
+ Value *result = VUNDEF2_I();
+
+ // use avx512 shift right instruction if available
+ if (JM()->mArch.AVX512F())
+ {
+ result = VPSRLI_16(a, imm);
+ }
+ else
+ {
+ Value *a0 = EXTRACT2_I(a, 0);
+ Value *a1 = EXTRACT2_I(a, 1);
+
+ Value *result0 = PSRLI(a0, imm);
+ Value *result1 = PSRLI(a1, imm);
+
+ result = JOIN2(result0, result1);
+ }
+
+ return result;
+ }
+
+#endif
+#if USE_SIMD16_BUILDER
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Extract the low (imm == 0) or high (imm != 0) SIMD8 half of a
+ /// SIMD16 vector as <N x float>
+ Value *Builder::EXTRACT2_F(Value *a2, uint32_t imm)
+ {
+ const uint32_t i0 = (imm > 0) ? mVWidth : 0;
+
+ Value *result = VUNDEF_F();
+
+ // view the source as <2N x float> so lanes extract uniformly
+ if (!a2->getType()->getScalarType()->isFloatTy())
+ {
+ a2 = BITCAST(a2, mSimd2FP32Ty);
+ }
+
+ for (uint32_t i = 0; i < mVWidth; i += 1)
+ {
+ Value *temp = VEXTRACT(a2, C(i0 + i));
+
+ result = VINSERT(result, temp, C(i));
+ }
+
+ return result;
+ }
+
+ Value *Builder::EXTRACT2_I(Value *a2, uint32_t imm)
+ {
+ return BITCAST(EXTRACT2_F(a2, imm), mSimdInt32Ty);
+ }
+
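+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Concatenate two SIMD8 vectors into a single SIMD16 vector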
+ Value *Builder::JOIN2(Value *a, Value *b)
+ {
+ return VSHUFFLE(a, b,
+ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
+ }
+#endif
+
//////////////////////////////////////////////////////////////////////////
/// @brief convert x86 <N x float> mask to llvm <N x i1> mask
- Value* Builder::MASK(Value* vmask)
+ Value *Builder::MASK(Value *vmask)
{
- Value* src = BITCAST(vmask, mSimdInt32Ty);
+ Value *src = BITCAST(vmask, mSimdInt32Ty);
return ICMP_SLT(src, VIMMED1(0));
}
+#if USE_SIMD16_BUILDER
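+ //////////////////////////////////////////////////////////////////////////
+ /// @brief convert x86 SIMD16 mask to llvm <2N x i1> mask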
+ Value *Builder::MASK2(Value *vmask)
+ {
+ Value *src = BITCAST(vmask, mSimd2Int32Ty);
+ return ICMP_SLT(src, VIMMED2_1(0));
+ }
+
+#endif
//////////////////////////////////////////////////////////////////////////
/// @brief convert llvm <N x i1> mask to x86 <N x i32> mask
- Value* Builder::VMASK(Value* mask)
+ Value *Builder::VMASK(Value *mask)
{
return S_EXT(mask, mSimdInt32Ty);
}
+#if USE_SIMD16_BUILDER
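+ //////////////////////////////////////////////////////////////////////////
+ /// @brief convert llvm <2N x i1> mask to x86 SIMD16 <2N x i32> mask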
+ Value *Builder::VMASK2(Value *mask)
+ {
+ return S_EXT(mask, mSimd2Int32Ty);
+ }
+
+#endif
//////////////////////////////////////////////////////////////////////////
/// @brief Generate a VPSHUFB operation in LLVM IR. If not
/// supported on the underlying platform, emulate it
/// lower 8 values are used.
Value *Builder::PMOVSXBD(Value* a)
{
- // llvm-3.9 removed the pmovsxbd intrinsic
- #if HAVE_LLVM < 0x309
- // use avx2 byte sign extend instruction if available
- if(JM()->mArch.AVX2())
- {
- Function *pmovsxbd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmovsxbd);
- return CALL(pmovsxbd, std::initializer_list<Value*>{a});
- }
- else
- #endif
- {
- // VPMOVSXBD output type
- Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
- // Extract 8 values from 128bit lane and sign extend
- return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
- }
+ // VPMOVSXBD output type
+ Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
+ // Extract 8 values from 128bit lane and sign extend
+ return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
}
//////////////////////////////////////////////////////////////////////////
/// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values.
Value *Builder::PMOVSXWD(Value* a)
{
- // llvm-3.9 removed the pmovsxwd intrinsic
- #if HAVE_LLVM < 0x309
- // use avx2 word sign extend if available
- if(JM()->mArch.AVX2())
- {
- Function *pmovsxwd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmovsxwd);
- return CALL(pmovsxwd, std::initializer_list<Value*>{a});
- }
- else
- #endif
- {
- // VPMOVSXWD output type
- Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
- // Extract 8 values from 128bit lane and sign extend
- return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
- }
+ // VPMOVSXWD output type
+ Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
+ // Extract 8 values from 128bit lane and sign extend
+ return S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
}
//////////////////////////////////////////////////////////////////////////
else
{
FunctionType* pFuncTy = FunctionType::get(mFP32Ty, mInt16Ty);
- Function* pCvtPh2Ps = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertSmallFloatTo32", pFuncTy));
+ Function* pCvtPh2Ps = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat16ToFloat32", pFuncTy));
- if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertSmallFloatTo32") == nullptr)
+ if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertFloat16ToFloat32") == nullptr)
{
- sys::DynamicLibrary::AddSymbol("ConvertSmallFloatTo32", (void *)&ConvertSmallFloatTo32);
+ sys::DynamicLibrary::AddSymbol("ConvertFloat16ToFloat32", (void *)&ConvertFloat16ToFloat32);
}
Value* pResult = UndefValue::get(mSimdFP32Ty);
{
// call scalar C function for now
FunctionType* pFuncTy = FunctionType::get(mInt16Ty, mFP32Ty);
- Function* pCvtPs2Ph = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("Convert32To16Float", pFuncTy));
+ Function* pCvtPs2Ph = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat32ToFloat16", pFuncTy));
- if (sys::DynamicLibrary::SearchForAddressOfSymbol("Convert32To16Float") == nullptr)
+ if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertFloat32ToFloat16") == nullptr)
{
- sys::DynamicLibrary::AddSymbol("Convert32To16Float", (void *)&Convert32To16Float);
+ sys::DynamicLibrary::AddSymbol("ConvertFloat32ToFloat16", (void *)&ConvertFloat32ToFloat16);
}
Value* pResult = UndefValue::get(mSimdInt16Ty);
Value *Builder::PMAXSD(Value* a, Value* b)
{
- // llvm-3.9 removed the pmax intrinsics
- #if HAVE_LLVM >= 0x309
Value* cmp = ICMP_SGT(a, b);
return SELECT(cmp, a, b);
- #else
- if (JM()->mArch.AVX2())
- {
- Function* pmaxsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmaxs_d);
- return CALL(pmaxsd, {a, b});
- }
- else
- {
- // use 4-wide sse max intrinsic on lower/upper halves of 8-wide sources
- Function* pmaxsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_sse41_pmaxsd);
-
- // low 128
- Value* aLo = VEXTRACTI128(a, C((uint8_t)0));
- Value* bLo = VEXTRACTI128(b, C((uint8_t)0));
- Value* resLo = CALL(pmaxsd, {aLo, bLo});
-
- // high 128
- Value* aHi = VEXTRACTI128(a, C((uint8_t)1));
- Value* bHi = VEXTRACTI128(b, C((uint8_t)1));
- Value* resHi = CALL(pmaxsd, {aHi, bHi});
-
- // combine
- Value* result = VINSERTI128(VUNDEF_I(), resLo, C((uint8_t)0));
- result = VINSERTI128(result, resHi, C((uint8_t)1));
-
- return result;
- }
- #endif
}
Value *Builder::PMINSD(Value* a, Value* b)
{
- // llvm-3.9 removed the pmin intrinsics
- #if HAVE_LLVM >= 0x309
Value* cmp = ICMP_SLT(a, b);
return SELECT(cmp, a, b);
- #else
- if (JM()->mArch.AVX2())
- {
- Function* pminsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_pmins_d);
- return CALL(pminsd, {a, b});
- }
- else
- {
- // use 4-wide sse max intrinsic on lower/upper halves of 8-wide sources
- Function* pminsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_sse41_pminsd);
-
- // low 128
- Value* aLo = VEXTRACTI128(a, C((uint8_t)0));
- Value* bLo = VEXTRACTI128(b, C((uint8_t)0));
- Value* resLo = CALL(pminsd, {aLo, bLo});
-
- // high 128
- Value* aHi = VEXTRACTI128(a, C((uint8_t)1));
- Value* bHi = VEXTRACTI128(b, C((uint8_t)1));
- Value* resHi = CALL(pminsd, {aHi, bHi});
-
- // combine
- Value* result = VINSERTI128(VUNDEF_I(), resLo, C((uint8_t)0));
- result = VINSERTI128(result, resHi, C((uint8_t)1));
-
- return result;
- }
- #endif
}
void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
const SWR_FORMAT_INFO &info = GetFormatInfo(format);
if(info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
{
- // ensure our mask is the correct type
- mask = BITCAST(mask, mSimdFP32Ty);
GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
}
else
{
- // ensure our mask is the correct type
- mask = BITCAST(mask, mSimdInt32Ty);
GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput);
}
}
void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
- Value* mask, Value* vGatherComponents[], bool bPackedOutput)
+ Value* vMask, Value* vGatherComponents[], bool bPackedOutput)
{
switch(info.bpp / info.numComps)
{
case 16:
{
Value* vGatherResult[2];
- Value *vMask;
// TODO: vGatherMaskedVal
Value* vGatherMaskedVal = VIMMED1((float)0);
// always have at least one component out of x or y to fetch
- // save mask as it is zero'd out after each gather
- vMask = mask;
-
- vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
+ vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
// e.g. result of first 8x32bit integer gather for 16bit components
// 256i - 0 1 2 3 4 5 6 7
// xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
{
// offset base to the next components(zw) in the vertex to gather
pSrcBase = GEP(pSrcBase, C((char)4));
- vMask = mask;
- vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
+ vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
// e.g. result of second 8x32bit integer gather for 16bit components
// 256i - 0 1 2 3 4 5 6 7
// zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
{
uint32_t swizzleIndex = info.swizzle[i];
- // save mask as it is zero'd out after each gather
- Value *vMask = mask;
-
// Gather a SIMD of components
- vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1));
+ vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask);
// offset base to the next component to gather
pSrcBase = GEP(pSrcBase, C((char)4));
}
break;
default:
- SWR_ASSERT(0, "Invalid float format");
+ SWR_INVALID("Invalid float format");
break;
}
}
void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
- Value* mask, Value* vGatherComponents[], bool bPackedOutput)
+ Value* vMask, Value* vGatherComponents[], bool bPackedOutput)
{
switch (info.bpp / info.numComps)
{
case 8:
{
Value* vGatherMaskedVal = VIMMED1((int32_t)0);
- Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, mask, C((char)1));
+ Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
// e.g. result of an 8x32bit integer gather for 8bit components
// 256i - 0 1 2 3 4 5 6 7
// xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
case 16:
{
Value* vGatherResult[2];
- Value *vMask;
// TODO: vGatherMaskedVal
Value* vGatherMaskedVal = VIMMED1((int32_t)0);
// always have at least one component out of x or y to fetch
- // save mask as it is zero'd out after each gather
- vMask = mask;
-
- vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
+ vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
// e.g. result of first 8x32bit integer gather for 16bit components
// 256i - 0 1 2 3 4 5 6 7
// xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
{
// offset base to the next components(zw) in the vertex to gather
pSrcBase = GEP(pSrcBase, C((char)4));
- vMask = mask;
- vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
+ vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
// e.g. result of second 8x32bit integer gather for 16bit components
// 256i - 0 1 2 3 4 5 6 7
// zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
{
uint32_t swizzleIndex = info.swizzle[i];
- // save mask as it is zero'd out after each gather
- Value *vMask = mask;
-
// Gather a SIMD of components
- vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1));
+ vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask);
// offset base to the next component to gather
pSrcBase = GEP(pSrcBase, C((char)4));
}
break;
default:
- SWR_ASSERT(0, "unsupported format");
+ SWR_INVALID("unsupported format");
break;
}
}
IRB()->SetInsertPoint(&pFunc->getEntryBlock(),
pFunc->getEntryBlock().begin());
Value* pAlloca = ALLOCA(pType);
- IRB()->restoreIP(saveIP);
+ if (saveIP.isSet()) IRB()->restoreIP(saveIP);
+ return pAlloca;
+ }
+
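+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Create an array alloca of pArraySize elements in the entry
+ /// block of pFunc, restoring the saved insert point afterward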
+ Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType, Value* pArraySize)
+ {
+ auto saveIP = IRB()->saveIP();
+ IRB()->SetInsertPoint(&pFunc->getEntryBlock(),
+ pFunc->getEntryBlock().begin());
+ Value* pAlloca = ALLOCA(pType, pArraySize);
+ if (saveIP.isSet()) IRB()->restoreIP(saveIP);
return pAlloca;
}
Value* Builder::STACKSAVE()
{
Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave);
- #if HAVE_LLVM == 0x306
- return CALL(pfnStackSave);
- #else
return CALLA(pfnStackSave);
- #endif
}
void Builder::STACKRESTORE(Value* pSaved)
#if defined( _WIN32 )
char strBuf[1024];
vsnprintf_s(strBuf, _TRUNCATE, fmt, args);
- OutputDebugString(strBuf);
+ OutputDebugStringA(strBuf);
#endif
va_end(args);
Value *Builder::VEXTRACTI128(Value* a, Constant* imm8)
{
- #if HAVE_LLVM == 0x306
- Function *func =
- Intrinsic::getDeclaration(JM()->mpCurrentModule,
- Intrinsic::x86_avx_vextractf128_si_256);
- return CALL(func, {a, imm8});
- #else
bool flag = !imm8->isZeroValue();
SmallVector<Constant*,8> idx;
for (unsigned i = 0; i < mVWidth / 2; i++) {
idx.push_back(C(flag ? i + mVWidth / 2 : i));
}
return VSHUFFLE(a, VUNDEF_I(), ConstantVector::get(idx));
- #endif
}
Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8)
{
- #if HAVE_LLVM == 0x306
- Function *func =
- Intrinsic::getDeclaration(JM()->mpCurrentModule,
- Intrinsic::x86_avx_vinsertf128_si_256);
- return CALL(func, {a, b, imm8});
- #else
bool flag = !imm8->isZeroValue();
SmallVector<Constant*,8> idx;
for (unsigned i = 0; i < mVWidth; i++) {
idx2.push_back(C(flag ? i + mVWidth / 2 : i));
}
return VSHUFFLE(a, inter, ConstantVector::get(idx2));
- #endif
}
// rdtsc buckets macros
}
}
-}
\ No newline at end of file
+
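+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Compute the size of pType in bytes; note this assumes packed,
+ /// homogeneous struct members (the first element's size is used for all)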
+ uint32_t Builder::GetTypeSize(Type* pType)
+ {
+ if (pType->isStructTy())
+ {
+ uint32_t numElems = pType->getStructNumElements();
+ Type* pElemTy = pType->getStructElementType(0);
+ return numElems * GetTypeSize(pElemTy);
+ }
+
+ if (pType->isArrayTy())
+ {
+ uint32_t numElems = pType->getArrayNumElements();
+ Type* pElemTy = pType->getArrayElementType();
+ return numElems * GetTypeSize(pElemTy);
+ }
+
+ if (pType->isIntegerTy())
+ {
+ uint32_t bitSize = pType->getIntegerBitWidth();
+ return bitSize / 8;
+ }
+
+ if (pType->isFloatTy())
+ {
+ return 4;
+ }
+
+ if (pType->isHalfTy())
+ {
+ return 2;
+ }
+
+ if (pType->isDoubleTy())
+ {
+ return 8;
+ }
+
+ SWR_ASSERT(false, "Unimplemented type.");
+ return 0;
+ }
+}