#else
void Shuffle8bpcGatherd(Shuffle8bpcArgs &args);
#endif
+#if USE_SIMD16_BUILDER
+ void Shuffle8bpcGatherd2(Shuffle8bpcArgs &args);
+#endif
typedef std::tuple<Value*(&)[2], Value*, const Instruction::CastOps, const ConversionType,
uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4]> Shuffle16bpcArgs;
#else
void Shuffle16bpcGather(Shuffle16bpcArgs &args);
#endif
+#if USE_SIMD16_BUILDER
+ void Shuffle16bpcGather2(Shuffle16bpcArgs &args);
+#endif
void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
#if USE_SIMD16_BUILDER
// only works if pixel size is <= 32bits
SWR_ASSERT(info.bpp <= 32);
- Value* pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask);
+ Value *pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask);
for (uint32_t comp = 0; comp < 4; ++comp)
{
Value* vVertexElements[4];
#if USE_SIMD16_GATHERS
Value* vVertexElements2[4];
+#if USE_SIMD16_BUILDER
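+ // SIMD16 staging for the four vertex components when building with the SIMD16 builder:
+ // each element packs a pair of SIMD8 results (lanes 0-7 in the low half, 8-15 in the high half)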
+ Value *pVtxSrc2[4];
+#endif
#endif
Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
#if USE_SIMD16_GATHERS
// override cur indices with 0 if pitch is 0
Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
+ vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);
vCurIndices2 = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices2);
// are vertices partially OOB?
// only fetch lanes that pass both tests
vGatherMask = AND(vMaxGatherMask, vMinGatherMask);
- vGatherMask2 = AND(vMaxGatherMask, vMinGatherMask2);
+ vGatherMask2 = AND(vMaxGatherMask2, vMinGatherMask2);
}
else
{
{
if (isComponentEnabled(compMask, c))
{
- vVertexElements[currentVertexElement] = pResults[c];
+#if USE_SIMD16_BUILDER
+ // pack adjacent pairs of SIMD8s into SIMD16s
+ pVtxSrc2[currentVertexElement] = VUNDEF2_F();
+ pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], pResults[c], 0);
+ pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], pResults2[c], 1);
+
+#else
+ vVertexElements[currentVertexElement] = pResults[c];
vVertexElements2[currentVertexElement] = pResults2[c];
- currentVertexElement++;
+
+#endif
+ currentVertexElement += 1;
if (currentVertexElement > 3)
{
+#if USE_SIMD16_BUILDER
+ // store SIMD16s
+ Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
+
+ StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2);
+
+#else
StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements);
StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2);
+#endif
outputElt += 1;
// reset to the next vVertexElement to output
else if(info.type[0] == SWR_TYPE_FLOAT)
{
///@todo: support 64 bit vb accesses
- Value* gatherSrc = VIMMED1(0.0f);
+ Value *gatherSrc = VIMMED1(0.0f);
#if USE_SIMD16_GATHERS
- Value* gatherSrc2 = VIMMED1(0.0f);
+ Value *gatherSrc2 = VIMMED1(0.0f);
+#if USE_SIMD16_BUILDER
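+ // SIMD16-wide gather source of zeros for the combined (2x SIMD8) gather path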
+ Value *gatherSrc16 = VIMMED2_1(0.0f);
+#endif
#endif
SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
case 16:
{
#if USE_SIMD16_GATHERS
- Value* vGatherResult[2];
- Value* vGatherResult2[2];
+ Value *vGatherResult[2];
+ Value *vGatherResult2[2];
// if we have at least one component out of x or y to fetch
if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
// xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
//
}
+ else
+ {
+ vGatherResult[0] = VUNDEF_I();
+ vGatherResult2[0] = VUNDEF_I();
+ }
// if we have at least one component out of z or w to fetch
if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
// zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
//
}
-
+ else
+ {
+ vGatherResult[1] = VUNDEF_I();
+ vGatherResult2[1] = VUNDEF_I();
+ }
// if we have at least one component to shuffle into place
if (compMask)
{
+#if USE_SIMD16_BUILDER
+ Value *gatherResult[2];
+
+ gatherResult[0] = VUNDEF2_I();
+ gatherResult[1] = VUNDEF2_I();
+
+ gatherResult[0] = INSERT2_I(gatherResult[0], vGatherResult[0], 0);
+ gatherResult[0] = INSERT2_I(gatherResult[0], vGatherResult2[0], 1);
+
+ gatherResult[1] = INSERT2_I(gatherResult[1], vGatherResult[1], 0);
+ gatherResult[1] = INSERT2_I(gatherResult[1], vGatherResult2[1], 1);
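+
+ // gatherResult[0] now holds the xyxy data for all 16 lanes (low half from the first SIMD8
+ // gather, high half from the second); gatherResult[1] holds the corresponding zwzw data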
+
+ Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
+
+ Shuffle16bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, Instruction::CastOps::FPExt, CONVERT_NONE,
+ currentVertexElement, outputElt, compMask, compCtrl, pVtxSrc2);
+
+ // Shuffle gathered components into place in simdvertex struct
+ Shuffle16bpcGather2(args); // outputs to vVertexElements ref
+#else
Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE,
currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
Shuffle16bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), Instruction::CastOps::FPExt, CONVERT_NONE,
// Shuffle gathered components into place in simdvertex struct
Shuffle16bpcGather(args, false); // outputs to vVertexElements ref
Shuffle16bpcGather(args2, true); // outputs to vVertexElements ref
+#endif
}
#else
Value* vGatherResult[2];
break;
case 32:
{
-#if USE_SIMD16_GATHERS
-#if USE_SIMD16_BUILDER
- Value *pVtxSrc2[4];
-
-#endif
-#endif
for (uint32_t i = 0; i < 4; i += 1)
{
#if USE_SIMD16_GATHERS
Value *vShiftedOffsets = VPSRLI(vOffsets, C(1));
Value *vShiftedOffsets2 = VPSRLI(vOffsets2, C(1));
#if USE_SIMD16_BUILDER
- Value *src = VUNDEF2_F();
- src = INSERT2_F(src, gatherSrc, 0);
- src = INSERT2_F(src, gatherSrc2, 1);
-
Value *indices = VUNDEF2_I();
indices = INSERT2_I(indices, vShiftedOffsets, 0);
indices = INSERT2_I(indices, vShiftedOffsets2, 1);
mask = INSERT2_I(mask, vGatherMask, 0);
mask = INSERT2_I(mask, vGatherMask2, 1);
- pVtxSrc2[currentVertexElement] = GATHERPS2(src, pStreamBase, indices, mask, 2);
-#if 1
-
- vVertexElements[currentVertexElement] = EXTRACT2_F(pVtxSrc2[currentVertexElement], 0);
- vVertexElements2[currentVertexElement] = EXTRACT2_F(pVtxSrc2[currentVertexElement], 1);
-#endif
+ pVtxSrc2[currentVertexElement] = GATHERPS2(gatherSrc16, pStreamBase, indices, mask, 2);
#else
vVertexElements[currentVertexElement] = GATHERPS(gatherSrc, pStreamBase, vShiftedOffsets, vGatherMask, 2);
vVertexElements2[currentVertexElement] = GATHERPS(gatherSrc2, pStreamBase, vShiftedOffsets2, vGatherMask2, 2);
Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, C({ 0, 1, 2, 3, 4, 5, 6, 7 }));
Value *pGather2 = VSHUFFLE(pGatherLo2, pGatherHi2, C({ 0, 1, 2, 3, 4, 5, 6, 7 }));
- vVertexElements[currentVertexElement] = pGather;
+#if USE_SIMD16_BUILDER
+ // pack adjacent pairs of SIMD8s into SIMD16s
+ pVtxSrc2[currentVertexElement] = VUNDEF2_F();
+ pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], pGather, 0);
+ pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], pGather2, 1);
+
+#else
+ vVertexElements[currentVertexElement] = pGather;
vVertexElements2[currentVertexElement] = pGather2;
+#endif
currentVertexElement += 1;
}
else
{
+#if USE_SIMD16_BUILDER
+ pVtxSrc2[currentVertexElement] = GenerateCompCtrlVector2(compCtrl[i]);
+
+#else
vVertexElements[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], false);
vVertexElements2[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], true);
+#endif
currentVertexElement += 1;
}
if (currentVertexElement > 3)
{
+#if USE_SIMD16_BUILDER
+ // store SIMD16s
+ Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
+
+ StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2);
+
+#else
StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements);
StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2);
+#endif
outputElt += 1;
// reset to the next vVertexElement to output
#if USE_SIMD16_GATHERS
Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
Value* vGatherResult2 = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2);
+
// e.g. result of an 8x32bit integer gather for 8bit components
// 256i - 0 1 2 3 4 5 6 7
// xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
+#if USE_SIMD16_BUILDER
+ Value *gatherResult = VUNDEF2_I();
+
+ gatherResult = INSERT2_I(gatherResult, vGatherResult, 0);
+ gatherResult = INSERT2_I(gatherResult, vGatherResult2, 1);
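+
+ // gatherResult packs both SIMD8 gathers into one SIMD16 value:
+ // lanes 0-7 from vGatherResult, lanes 8-15 from vGatherResult2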
+
+ Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
+
+ Shuffle8bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, extendCastType, conversionType,
+ currentVertexElement, outputElt, compMask, compCtrl, pVtxSrc2, info.swizzle);
+
+ // Shuffle gathered components into place in simdvertex struct
+ Shuffle8bpcGatherd2(args); // outputs to vVertexElements ref
+#else
Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle);
Shuffle8bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), extendCastType, conversionType,
// Shuffle gathered components into place in simdvertex struct
Shuffle8bpcGatherd(args, false); // outputs to vVertexElements ref
Shuffle8bpcGatherd(args2, true); // outputs to vVertexElements ref
+#endif
#else
Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
// e.g. result of an 8x32bit integer gather for 8bit components
// xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
//
}
+ else
+ {
+ vGatherResult[0] = VUNDEF_I();
+ vGatherResult2[0] = VUNDEF_I();
+ }
// if we have at least one component out of z or w to fetch
if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
// zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
//
}
+ else
+ {
+ vGatherResult[1] = VUNDEF_I();
+ vGatherResult2[1] = VUNDEF_I();
+ }
// if we have at least one component to shuffle into place
if (compMask)
{
+#if USE_SIMD16_BUILDER
+ Value *gatherResult[2];
+
+ gatherResult[0] = VUNDEF2_I();
+ gatherResult[1] = VUNDEF2_I();
+
+ gatherResult[0] = INSERT2_I(gatherResult[0], vGatherResult[0], 0);
+ gatherResult[0] = INSERT2_I(gatherResult[0], vGatherResult2[0], 1);
+
+ gatherResult[1] = INSERT2_I(gatherResult[1], vGatherResult[1], 0);
+ gatherResult[1] = INSERT2_I(gatherResult[1], vGatherResult2[1], 1);
+
+ Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
+
+ Shuffle16bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, extendCastType, conversionType,
+ currentVertexElement, outputElt, compMask, compCtrl, pVtxSrc2);
+
+ // Shuffle gathered components into place in simdvertex struct
+ Shuffle16bpcGather2(args); // outputs to vVertexElements ref
+#else
Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
Shuffle16bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), extendCastType, conversionType,
// Shuffle gathered components into place in simdvertex struct
Shuffle16bpcGather(args, false); // outputs to vVertexElements ref
Shuffle16bpcGather(args2, true); // outputs to vVertexElements ref
+#endif
}
#else
Value* vGatherResult[2];
pGather2 = FMUL(SI_TO_FP(pGather2, mSimdFP32Ty), VBROADCAST(C(1 / 65536.0f)));
}
+#if USE_SIMD16_BUILDER
+ // pack adjacent pairs of SIMD8s into SIMD16s
+ pVtxSrc2[currentVertexElement] = VUNDEF2_F();
+ pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], pGather, 0);
+ pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], pGather2, 1);
+
+#else
vVertexElements[currentVertexElement] = pGather;
vVertexElements2[currentVertexElement] = pGather2;
+
+#endif
+
// e.g. result of a single 8x32bit integer gather for 32bit components
// 256i - 0 1 2 3 4 5 6 7
// xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
{
#if USE_SIMD16_SHADERS
#if USE_SIMD16_GATHERS
+#if USE_SIMD16_BUILDER
+ pVtxSrc2[currentVertexElement] = GenerateCompCtrlVector2(compCtrl[i]);
+
+#else
vVertexElements[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], false);
vVertexElements2[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], true);
+#endif
currentVertexElement += 1;
#else
vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2);
if (currentVertexElement > 3)
{
#if USE_SIMD16_GATHERS
+#if USE_SIMD16_BUILDER
+ // store SIMD16s
+ Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
+
+ StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2);
+
+#else
StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements);
StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2);
+#endif
outputElt += 1;
#else
StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
if (currentVertexElement > 0)
{
#if USE_SIMD16_GATHERS
+#if USE_SIMD16_BUILDER
+ // store SIMD16s
+ Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
+
+ StoreVertexElements2(pVtxOut2, outputElt, currentVertexElement, pVtxSrc2);
+
+#else
StoreVertexElements(pVtxOut, outputElt, currentVertexElement, vVertexElements);
StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, currentVertexElement, vVertexElements2);
+#endif
outputElt += 1;
#else
StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
}
}
+#if USE_SIMD16_BUILDER
+void FetchJit::Shuffle8bpcGatherd2(Shuffle8bpcArgs &args)
+{
+ // Unpack tuple args
+ Value*& vGatherResult = std::get<0>(args);
+ Value* pVtxOut = std::get<1>(args);
+ const Instruction::CastOps extendType = std::get<2>(args);
+ const ConversionType conversionType = std::get<3>(args);
+ uint32_t &currentVertexElement = std::get<4>(args);
+ uint32_t &outputElt = std::get<5>(args);
+ const ComponentEnable compMask = std::get<6>(args);
+ const ComponentControl(&compCtrl)[4] = std::get<7>(args);
+ Value* (&vVertexElements)[4] = std::get<8>(args);
+ const uint32_t(&swizzle)[4] = std::get<9>(args);
+
+ // cast types
+ Type *vGatherTy = mSimdInt32Ty;
+ Type *v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
+
+ // have to do extra work for sign extending
+ if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP))
+ {
+ Type *v16x8Ty = VectorType::get(mInt8Ty, mVWidth * 2); // 16x8bit ints in a 128bit lane
+ Type *v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
+
+ // shuffle mask, including any swizzling
+ const char x = (char)swizzle[0]; const char y = (char)swizzle[1];
+ const char z = (char)swizzle[2]; const char w = (char)swizzle[3];
+ Value *vConstMask = C<char>({ char(x), char(x + 4), char(x + 8), char(x + 12),
+ char(y), char(y + 4), char(y + 8), char(y + 12),
+ char(z), char(z + 4), char(z + 8), char(z + 12),
+ char(w), char(w + 4), char(w + 8), char(w + 12),
+ char(x), char(x + 4), char(x + 8), char(x + 12),
+ char(y), char(y + 4), char(y + 8), char(y + 12),
+ char(z), char(z + 4), char(z + 8), char(z + 12),
+ char(w), char(w + 4), char(w + 8), char(w + 12) });
+
+ // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now..
+
+ Value *vGatherResult_lo = EXTRACT2_I(vGatherResult, 0);
+ Value *vGatherResult_hi = EXTRACT2_I(vGatherResult, 1);
+
+ Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
+ Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
+
+ // after pshufb: group components together in each 128bit lane
+ // 256i - 0 1 2 3 4 5 6 7
+ // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
+
+ Value *vi128XY_lo = nullptr;
+ Value *vi128XY_hi = nullptr;
+ if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
+ {
+ vi128XY_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
+ vi128XY_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
+
+ // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
+ // 256i - 0 1 2 3 4 5 6 7
+ // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
+ }
+
+ // do the same for zw components
+ Value *vi128ZW_lo = nullptr;
+ Value *vi128ZW_hi = nullptr;
+ if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
+ {
+ vi128ZW_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
+ vi128ZW_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
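+
+ // after PERMD: move and pack z and w components in low 64 bits of each 128bit lane
+ // 256i - 0 1 2 3 4 5 6 7
+ // zzzz zzzz dcdc dcdc wwww wwww dcdc dcdc (dc - don't care)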
+ }
+
+ // init denormalize variables if needed
+ Instruction::CastOps fpCast;
+ Value *conversionFactor;
+
+ switch (conversionType)
+ {
+ case CONVERT_NORMALIZED:
+ fpCast = Instruction::CastOps::SIToFP;
+ conversionFactor = VIMMED1((float)(1.0 / 127.0));
+ break;
+ case CONVERT_SSCALED:
+ fpCast = Instruction::CastOps::SIToFP;
+ conversionFactor = VIMMED1((float)(1.0));
+ break;
+ case CONVERT_USCALED:
+ SWR_INVALID("Type should not be sign extended!");
+ conversionFactor = nullptr;
+ break;
+ default:
+ SWR_ASSERT(conversionType == CONVERT_NONE);
+ conversionFactor = nullptr;
+ break;
+ }
+
+ // sign extend all enabled components; if we have a full vVertexElements, output to the current simdvertex
+ for (uint32_t i = 0; i < 4; i++)
+ {
+ if (isComponentEnabled(compMask, i))
+ {
+ if (compCtrl[i] == ComponentControl::StoreSrc)
+ {
+ // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
+ uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
+ // if x or y, use vi128XY permute result, else use vi128ZW
+ Value *selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
+ Value *selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;
+
+ // sign extend
+ Value *temp_lo = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v16x8Ty));
+ Value *temp_hi = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v16x8Ty));
+
+ // denormalize if needed
+ if (conversionType != CONVERT_NONE)
+ {
+ temp_lo = FMUL(CAST(fpCast, temp_lo, mSimdFP32Ty), conversionFactor);
+ temp_hi = FMUL(CAST(fpCast, temp_hi, mSimdFP32Ty), conversionFactor);
+ }
+
+ vVertexElements[currentVertexElement] = VUNDEF2_F();
+ vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_lo, 0);
+ vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_hi, 1);
+
+ currentVertexElement += 1;
+ }
+ else
+ {
+ vVertexElements[currentVertexElement++] = GenerateCompCtrlVector2(compCtrl[i]);
+ }
+
+ if (currentVertexElement > 3)
+ {
+ StoreVertexElements2(pVtxOut, outputElt++, 4, vVertexElements);
+ // reset to the next vVertexElement to output
+ currentVertexElement = 0;
+ }
+ }
+ }
+ }
+ // else zero extend
+ else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
+ {
+ // init denormalize variables if needed
+ Instruction::CastOps fpCast;
+ Value *conversionFactor;
+
+ switch (conversionType)
+ {
+ case CONVERT_NORMALIZED:
+ fpCast = Instruction::CastOps::UIToFP;
+ conversionFactor = VIMMED1((float)(1.0 / 255.0));
+ break;
+ case CONVERT_USCALED:
+ fpCast = Instruction::CastOps::UIToFP;
+ conversionFactor = VIMMED1((float)(1.0));
+ break;
+ case CONVERT_SSCALED:
+ SWR_INVALID("Type should not be zero extended!");
+ conversionFactor = nullptr;
+ break;
+ default:
+ SWR_ASSERT(conversionType == CONVERT_NONE);
+ conversionFactor = nullptr;
+ break;
+ }
+
+ // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
+ for (uint32_t i = 0; i < 4; i++)
+ {
+ if (isComponentEnabled(compMask, i))
+ {
+ if (compCtrl[i] == ComponentControl::StoreSrc)
+ {
+ // pshufb masks for each component
+ Value *vConstMask;
+ switch (swizzle[i])
+ {
+ case 0:
+ // x shuffle mask
+ vConstMask = C<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
+ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 });
+ break;
+ case 1:
+ // y shuffle mask
+ vConstMask = C<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
+ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 });
+ break;
+ case 2:
+ // z shuffle mask
+ vConstMask = C<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
+ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 });
+ break;
+ case 3:
+ // w shuffle mask
+ vConstMask = C<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
+ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 });
+ break;
+ default:
+ vConstMask = nullptr;
+ break;
+ }
+
+ Value *vGatherResult_lo = EXTRACT2_I(vGatherResult, 0);
+ Value *vGatherResult_hi = EXTRACT2_I(vGatherResult, 1);
+
+ Value *temp_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
+ Value *temp_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
+
+ // after pshufb for x channel
+ // 256i - 0 1 2 3 4 5 6 7
+ // x000 x000 x000 x000 x000 x000 x000 x000
+
+ // denormalize if needed
+ if (conversionType != CONVERT_NONE)
+ {
+ temp_lo = FMUL(CAST(fpCast, temp_lo, mSimdFP32Ty), conversionFactor);
+ temp_hi = FMUL(CAST(fpCast, temp_hi, mSimdFP32Ty), conversionFactor);
+ }
+
+ vVertexElements[currentVertexElement] = VUNDEF2_F();
+ vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_lo, 0);
+ vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_hi, 1);
+
+ currentVertexElement += 1;
+ }
+ else
+ {
+ vVertexElements[currentVertexElement++] = GenerateCompCtrlVector2(compCtrl[i]);
+ }
+
+ if (currentVertexElement > 3)
+ {
+ StoreVertexElements2(pVtxOut, outputElt++, 4, vVertexElements);
+ // reset to the next vVertexElement to output
+ currentVertexElement = 0;
+ }
+ }
+ }
+ }
+ else
+ {
+ SWR_INVALID("Unsupported conversion type");
+ }
+}
+
+#endif
//////////////////////////////////////////////////////////////////////////
/// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
/// denormalizes if needed, converts to F32 if needed, and positions in
}
}
+#if USE_SIMD16_BUILDER
+void FetchJit::Shuffle16bpcGather2(Shuffle16bpcArgs &args)
+{
+ // Unpack tuple args
+ Value* (&vGatherResult)[2] = std::get<0>(args);
+ Value* pVtxOut = std::get<1>(args);
+ const Instruction::CastOps extendType = std::get<2>(args);
+ const ConversionType conversionType = std::get<3>(args);
+ uint32_t &currentVertexElement = std::get<4>(args);
+ uint32_t &outputElt = std::get<5>(args);
+ const ComponentEnable compMask = std::get<6>(args);
+ const ComponentControl(&compCtrl)[4] = std::get<7>(args);
+ Value* (&vVertexElements)[4] = std::get<8>(args);
+
+ // cast types
+ Type *vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
+ Type *v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
+
+ // have to do extra work for sign extending
+ if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt))
+ {
+ // is this a partial-precision (16-bit) float being extended to full precision?
+ bool bFP = (extendType == Instruction::CastOps::FPExt);
+
+ Type *v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
+ Type *v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
+
+ // shuffle mask
+ Value *vConstMask = C<char>({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
+ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 });
+ Value *vi128XY = nullptr;
+ Value *vi128XY_lo = nullptr;
+ Value *vi128XY_hi = nullptr;
+ if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
+ {
+ // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now..
+
+ Value *vGatherResult_lo = EXTRACT2_I(vGatherResult[0], 0);
+ Value *vGatherResult_hi = EXTRACT2_I(vGatherResult[0], 1);
+
+ Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
+ Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
+
+ // after pshufb: group components together in each 128bit lane
+ // 256i - 0 1 2 3 4 5 6 7
+ // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
+
+ vi128XY_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+ vi128XY_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+
+ // after PERMD: move and pack xy components into each 128bit lane
+ // 256i - 0 1 2 3 4 5 6 7
+ // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
+#if 0
+
+ vi128XY = VUNDEF2_I();
+ vi128XY = INSERT2_I(vi128XY, vi128XY_lo, 0);
+ vi128XY = INSERT2_I(vi128XY, vi128XY_hi, 1);
+#endif
+ }
+
+ // do the same for zw components
+ Value *vi128ZW = nullptr;
+ Value *vi128ZW_lo = nullptr;
+ Value *vi128ZW_hi = nullptr;
+ if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
+ {
+ Value *vGatherResult_lo = EXTRACT2_I(vGatherResult[1], 0);
+ Value *vGatherResult_hi = EXTRACT2_I(vGatherResult[1], 1);
+
+ Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
+ Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
+
+ vi128ZW_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+ vi128ZW_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
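+
+ // after PERMD: move and pack zw components into each 128bit lane
+ // 256i - 0 1 2 3 4 5 6 7
+ // zzzz zzzz zzzz zzzz wwww wwww wwww wwww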
+#if 0
+
+ vi128ZW = VUNDEF2_I();
+ vi128ZW = INSERT2_I(vi128ZW, vi128ZW_lo, 0);
+ vi128ZW = INSERT2_I(vi128ZW, vi128ZW_hi, 1);
+#endif
+ }
+
+ // init denormalize variables if needed
+ Instruction::CastOps IntToFpCast;
+ Value *conversionFactor;
+
+ switch (conversionType)
+ {
+ case CONVERT_NORMALIZED:
+ IntToFpCast = Instruction::CastOps::SIToFP;
+ conversionFactor = VIMMED1((float)(1.0 / 32767.0));
+ break;
+ case CONVERT_SSCALED:
+ IntToFpCast = Instruction::CastOps::SIToFP;
+ conversionFactor = VIMMED1((float)(1.0));
+ break;
+ case CONVERT_USCALED:
+ SWR_INVALID("Type should not be sign extended!");
+ conversionFactor = nullptr;
+ break;
+ default:
+ SWR_ASSERT(conversionType == CONVERT_NONE);
+ conversionFactor = nullptr;
+ break;
+ }
+
+ // sign extend all enabled components; if we have a full vVertexElements, output to the current simdvertex
+ for (uint32_t i = 0; i < 4; i++)
+ {
+ if (isComponentEnabled(compMask, i))
+ {
+ if (compCtrl[i] == ComponentControl::StoreSrc)
+ {
+ // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
+ uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
+ // if x or y, use vi128XY permute result, else use vi128ZW
+ Value *selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
+ Value *selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;
+
+ if (bFP)
+ {
+ // extract 128 bit lanes and convert each component from half precision to full precision float
+ Value *temp_lo = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
+ Value *temp_hi = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
+
+ vVertexElements[currentVertexElement] = VUNDEF2_F();
+ vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_lo, 0);
+ vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_hi, 1);
+ }
+ else
+ {
+ // extract 128 bit lanes to sign extend each component
+ Value *temp_lo = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
+ Value *temp_hi = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
+
+ // denormalize if needed
+ if (conversionType != CONVERT_NONE)
+ {
+ temp_lo = FMUL(CAST(IntToFpCast, temp_lo, mSimdFP32Ty), conversionFactor);
+ temp_hi = FMUL(CAST(IntToFpCast, temp_hi, mSimdFP32Ty), conversionFactor);
+ }
+
+ vVertexElements[currentVertexElement] = VUNDEF2_F();
+ vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_lo, 0);
+ vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_hi, 1);
+ }
+
+ currentVertexElement += 1;
+ }
+ else
+ {
+ vVertexElements[currentVertexElement++] = GenerateCompCtrlVector2(compCtrl[i]);
+ }
+
+ if (currentVertexElement > 3)
+ {
+ StoreVertexElements2(pVtxOut, outputElt++, 4, vVertexElements);
+ // reset to the next vVertexElement to output
+ currentVertexElement = 0;
+ }
+ }
+ }
+ }
+ // else zero extend
+ else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
+ {
+ // pshufb masks for each component
+ Value *vConstMask[2];
+
+ if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2))
+ {
+ // x/z shuffle mask
+ vConstMask[0] = C<char>({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
+ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
+ }
+
+ if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3))
+ {
+ // y/w shuffle mask
+ vConstMask[1] = C<char>({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
+ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 });
+ }
+
+ // init denormalize variables if needed
+ Instruction::CastOps fpCast;
+ Value* conversionFactor;
+
+ switch (conversionType)
+ {
+ case CONVERT_NORMALIZED:
+ fpCast = Instruction::CastOps::UIToFP;
+ conversionFactor = VIMMED1((float)(1.0 / 65535.0));
+ break;
+ case CONVERT_USCALED:
+ fpCast = Instruction::CastOps::UIToFP;
+ conversionFactor = VIMMED1((float)(1.0f));
+ break;
+ case CONVERT_SSCALED:
+ SWR_INVALID("Type should not be zero extended!");
+ conversionFactor = nullptr;
+ break;
+ default:
+ SWR_ASSERT(conversionType == CONVERT_NONE);
+ conversionFactor = nullptr;
+ break;
+ }
+
+ // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
+ for (uint32_t i = 0; i < 4; i++)
+ {
+ if (isComponentEnabled(compMask, i))
+ {
+ if (compCtrl[i] == ComponentControl::StoreSrc)
+ {
+ // select correct constMask for x/z or y/w pshufb
+ uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
+ // if x or y, use vi128XY permute result, else use vi128ZW
+ uint32_t selectedGather = (i < 2) ? 0 : 1;
+
+ // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of KNL, for now..
+
+ Value *vGatherResult_lo = EXTRACT2_I(vGatherResult[selectedGather], 0);
+ Value *vGatherResult_hi = EXTRACT2_I(vGatherResult[selectedGather], 1);
+
+ Value *temp_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask[selectedMask]), vGatherTy);
+ Value *temp_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask[selectedMask]), vGatherTy);
+
+ // after pshufb mask for x channel; z uses the same shuffle from the second gather
+ // 256i - 0 1 2 3 4 5 6 7
+ // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
+
+ // denormalize if needed
+ if (conversionType != CONVERT_NONE)
+ {
+ temp_lo = FMUL(CAST(fpCast, temp_lo, mSimdFP32Ty), conversionFactor);
+ temp_hi = FMUL(CAST(fpCast, temp_hi, mSimdFP32Ty), conversionFactor);
+ }
+
+ vVertexElements[currentVertexElement] = VUNDEF2_F();
+ vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_lo, 0);
+ vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_hi, 1);
+
+ currentVertexElement += 1;
+ }
+ else
+ {
+ vVertexElements[currentVertexElement++] = GenerateCompCtrlVector2(compCtrl[i]);
+ }
+
+ if (currentVertexElement > 3)
+ {
+ StoreVertexElements2(pVtxOut, outputElt++, 4, vVertexElements);
+ // reset to the next vVertexElement to output
+ currentVertexElement = 0;
+ }
+ }
+ }
+ }
+ else
+ {
+ SWR_INVALID("Unsupported conversion type");
+ }
+}
+
+#endif
//////////////////////////////////////////////////////////////////////////
/// @brief Output a simdvertex worth of elements to the current outputElt
/// @param pVtxOut - base address of VIN output struct
case Store1Int: return VIMMED2_1(1);
case StoreVertexId:
{
- Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimd2FP32Ty);
+ Value* pId = VUNDEF2_F();
+
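+ // assemble the SIMD16 vertex id: low half from VertexID, high half from VertexID2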
+ Value* pId_lo = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
+ Value* pId_hi = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 })), mSimdFP32Ty);
+
+ pId = INSERT2_F(pId, pId_lo, 0);
+ pId = INSERT2_F(pId, pId_hi, 1);
+
return VBROADCAST2(pId);
}
case StoreInstanceId: