/// @param vIndices - SIMD wide value of VB byte offsets
/// @param vMask - SIMD wide mask that controls whether to access memory or the src values
/// @param scale - value to scale indices by
- Value *Builder::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
+ Value *Builder::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
{
Value* vGather;
{
// force mask to <N x float>, required by vgather
vMask = BITCAST(vMask, mSimdFP32Ty);
- vGather = VGATHERPS(vSrc,pBase,vIndices,vMask,scale);
+ vGather = VGATHERPS(vSrc,pBase,vIndices,vMask,C(scale));
}
else
{
STORE(vSrc, vSrcPtr);
vGather = VUNDEF_F();
- Value *vScaleVec = VBROADCAST(Z_EXT(scale,mInt32Ty));
+ Value *vScaleVec = VIMMED1((uint32_t)scale);
Value *vOffsets = MUL(vIndices,vScaleVec);
Value *mask = MASK(vMask);
for(uint32_t i = 0; i < mVWidth; ++i)
/// @param vIndices - SIMD wide value of VB byte offsets
/// @param vMask - SIMD wide mask that controls whether to access memory or the src values
/// @param scale - value to scale indices by
- Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
+ Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
{
Value* vGather;
// use avx2 gather instruction if available
if(JM()->mArch.AVX2())
{
- vGather = VGATHERDD(vSrc, pBase, vIndices, vMask, scale);
+ vGather = VGATHERDD(vSrc, pBase, vIndices, vMask, C(scale));
}
else
{
STORE(vSrc, vSrcPtr);
vGather = VUNDEF_I();
- Value *vScaleVec = VBROADCAST(Z_EXT(scale, mInt32Ty));
+ Value *vScaleVec = VIMMED1((uint32_t)scale);
Value *vOffsets = MUL(vIndices, vScaleVec);
Value *mask = MASK(vMask);
for(uint32_t i = 0; i < mVWidth; ++i)
/// @param vIndices - SIMD wide value of VB byte offsets
/// @param vMask - SIMD wide mask that controls whether to access memory or the src values
/// @param scale - value to scale indices by
- Value *Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale)
+ Value *Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
{
Value* vGather;
// use avx2 gather instruction if available
if(JM()->mArch.AVX2())
{
- vGather = VGATHERPD(vSrc, pBase, vIndices, vMask, scale);
+ vGather = VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale));
}
else
{
STORE(vSrc, vSrcPtr);
vGather = UndefValue::get(VectorType::get(mDoubleTy, 4));
- Value *vScaleVec = VECTOR_SPLAT(4, Z_EXT(scale,mInt32Ty));
+ Value *vScaleVec = VECTOR_SPLAT(4, C((uint32_t)scale));
Value *vOffsets = MUL(vIndices,vScaleVec);
Value *mask = MASK(vMask);
for(uint32_t i = 0; i < mVWidth/2; ++i)
// save mask as it is zero'd out after each gather
vMask = mask;
- vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
+ vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
// e.g. result of first 8x32bit integer gather for 16bit components
// 256i - 0 1 2 3 4 5 6 7
// xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
pSrcBase = GEP(pSrcBase, C((char)4));
vMask = mask;
- vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
+ vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
// e.g. result of second 8x32bit integer gather for 16bit components
// 256i - 0 1 2 3 4 5 6 7
// zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
Value *vMask = mask;
// Gather a SIMD of components
- vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1));
+ vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask);
// offset base to the next component to gather
pSrcBase = GEP(pSrcBase, C((char)4));
case 8:
{
Value* vGatherMaskedVal = VIMMED1((int32_t)0);
- Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, mask, C((char)1));
+ Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, mask);
// e.g. result of an 8x32bit integer gather for 8bit components
// 256i - 0 1 2 3 4 5 6 7
// xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
// save mask as it is zero'd out after each gather
vMask = mask;
- vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
+ vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
// e.g. result of first 8x32bit integer gather for 16bit components
// 256i - 0 1 2 3 4 5 6 7
// xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
pSrcBase = GEP(pSrcBase, C((char)4));
vMask = mask;
- vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1));
+ vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask);
// e.g. result of second 8x32bit integer gather for 16bit components
// 256i - 0 1 2 3 4 5 6 7
// zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
Value *vMask = mask;
// Gather a SIMD of components
- vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1));
+ vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask);
// offset base to the next component to gather
pSrcBase = GEP(pSrcBase, C((char)4));
// only works if pixel size is <= 32bits
SWR_ASSERT(info.bpp <= 32);
- Value* pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask, C((char)1));
+ Value* pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask);
for (uint32_t comp = 0; comp < 4; ++comp)
{
vMask = vGatherMask;
vMask2 = vGatherMask2;
- vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
- vGatherResult2[0] = GATHERPS(gatherSrc2, pStreamBase, vOffsets2, vMask2, C((char)1));
+ vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask);
+ vGatherResult2[0] = GATHERPS(gatherSrc2, pStreamBase, vOffsets2, vMask2);
// e.g. result of first 8x32bit integer gather for 16bit components
// 256i - 0 1 2 3 4 5 6 7
// xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
vMask = vGatherMask;
vMask2 = vGatherMask2;
- vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
- vGatherResult2[1] = GATHERPS(gatherSrc2, pStreamBase, vOffsets2, vMask2, C((char)1));
+ vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask);
+ vGatherResult2[1] = GATHERPS(gatherSrc2, pStreamBase, vOffsets2, vMask2);
// e.g. result of second 8x32bit integer gather for 16bit components
// 256i - 0 1 2 3 4 5 6 7
// zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
// save mask as it is zero'd out after each gather
vMask = vGatherMask;
- vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
+ vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask);
// e.g. result of first 8x32bit integer gather for 16bit components
// 256i - 0 1 2 3 4 5 6 7
// xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
pStreamBase = GEP(pStreamBase, C((char)4));
vMask = vGatherMask;
- vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
+ vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask);
// e.g. result of second 8x32bit integer gather for 16bit components
// 256i - 0 1 2 3 4 5 6 7
// zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
// Right shift the offset by a bit and then scale by 2 to remove the sign extension.
Value *vShiftedOffsets = VPSRLI(vOffsets, C(1));
Value *vShiftedOffsets2 = VPSRLI(vOffsets2, C(1));
- vVertexElements[currentVertexElement] = GATHERPS(gatherSrc, pStreamBase, vShiftedOffsets, vMask, C((char)2));
- vVertexElements2[currentVertexElement] = GATHERPS(gatherSrc2, pStreamBase, vShiftedOffsets2, vMask2, C((char)2));
+ vVertexElements[currentVertexElement] = GATHERPS(gatherSrc, pStreamBase, vShiftedOffsets, vMask, 2);
+ vVertexElements2[currentVertexElement] = GATHERPS(gatherSrc2, pStreamBase, vShiftedOffsets2, vMask2, 2);
currentVertexElement += 1;
}
// But, we know that elements must be aligned for FETCH. :)
// Right shift the offset by a bit and then scale by 2 to remove the sign extension.
Value* vShiftedOffsets = VPSRLI(vOffsets, C(1));
- vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBase, vShiftedOffsets, vMask, C((char)2));
+ vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBase, vShiftedOffsets, vMask, 2);
}
else
{
Value *vZeroDouble = VECTOR_SPLAT(4, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
- Value* pGatherLo = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo, vMaskLo, C((char)1));
- Value* pGatherLo2 = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo2, vMaskLo2, C((char)1));
- Value* pGatherHi = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi, vMaskHi, C((char)1));
- Value* pGatherHi2 = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi2, vMaskHi2, C((char)1));
+ Value* pGatherLo = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo, vMaskLo);
+ Value* pGatherLo2 = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo2, vMaskLo2);
+ Value* pGatherHi = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi, vMaskHi);
+ Value* pGatherHi2 = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi2, vMaskHi2);
pGatherLo = VCVTPD2PS(pGatherLo);
pGatherLo2 = VCVTPD2PS(pGatherLo2);
Value *vZeroDouble = VECTOR_SPLAT(4, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
Value* pGatherLo = GATHERPD(vZeroDouble,
- pStreamBase, vOffsetsLo, vMaskLo, C((char)1));
+ pStreamBase, vOffsetsLo, vMaskLo);
Value* pGatherHi = GATHERPD(vZeroDouble,
- pStreamBase, vOffsetsHi, vMaskHi, C((char)1));
+ pStreamBase, vOffsetsHi, vMaskHi);
pGatherLo = VCVTPD2PS(pGatherLo);
pGatherHi = VCVTPD2PS(pGatherHi);
if (compMask)
{
#if USE_SIMD16_GATHERS
- Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask, C((char)1));
- Value* vGatherResult2 = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2, C((char)1));
+ Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
+ Value* vGatherResult2 = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2);
// e.g. result of an 8x32bit integer gather for 8bit components
// 256i - 0 1 2 3 4 5 6 7
// xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
Shuffle8bpcGatherd(args, false); // outputs to vVertexElements ref
Shuffle8bpcGatherd(args2, true); // outputs to vVertexElements ref
#else
- Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask, C((char)1));
+ Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
// e.g. result of an 8x32bit integer gather for 8bit components
// 256i - 0 1 2 3 4 5 6 7
// xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
vMask = vGatherMask;
vMask2 = vGatherMask2;
- vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
- vGatherResult2[0] = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vMask2, C((char)1));
+ vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask);
+ vGatherResult2[0] = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vMask2);
// e.g. result of first 8x32bit integer gather for 16bit components
// 256i - 0 1 2 3 4 5 6 7
// xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
vMask = vGatherMask;
vMask2 = vGatherMask2;
- vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
- vGatherResult2[1] = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vMask2, C((char)1));
+ vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask);
+ vGatherResult2[1] = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vMask2);
// e.g. result of second 8x32bit integer gather for 16bit components
// 256i - 0 1 2 3 4 5 6 7
// zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
// save mask as it is zero'd out after each gather
vMask = vGatherMask;
- vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
+ vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask);
// e.g. result of first 8x32bit integer gather for 16bit components
// 256i - 0 1 2 3 4 5 6 7
// xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
pStreamBase = GEP(pStreamBase, C((char)4));
vMask = vGatherMask;
- vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
+ vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask);
// e.g. result of second 8x32bit integer gather for 16bit components
// 256i - 0 1 2 3 4 5 6 7
// zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
Value *vMask = vGatherMask;
Value *vMask2 = vGatherMask2;
- Value *pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
- Value *pGather2 = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vMask2, C((char)1));
+ Value *pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask);
+ Value *pGather2 = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vMask2);
if (conversionType == CONVERT_USCALED)
{
// save mask as it is zero'd out after each gather
Value *vMask = vGatherMask;
- Value* pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
+ Value* pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask);
if (conversionType == CONVERT_USCALED)
{