From: Tim Rowley Date: Mon, 4 Dec 2017 00:49:29 +0000 (-0600) Subject: swr/rast: WIP - Widen fetch shader to SIMD16 X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=36e276b6b03da852c78e314640b3822be263def2;p=mesa.git swr/rast: WIP - Widen fetch shader to SIMD16 Widen vertex gather/storage to SIMD16 for all component types. Reviewed-by: Bruce Cherniak --- diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp index 337bb7f6604..6c0e658e68f 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp @@ -70,6 +70,9 @@ struct FetchJit : public Builder #else void Shuffle8bpcGatherd(Shuffle8bpcArgs &args); #endif +#if USE_SIMD16_BUILDER + void Shuffle8bpcGatherd2(Shuffle8bpcArgs &args); +#endif typedef std::tuple Shuffle16bpcArgs; @@ -78,6 +81,9 @@ struct FetchJit : public Builder #else void Shuffle16bpcGather(Shuffle16bpcArgs &args); #endif +#if USE_SIMD16_BUILDER + void Shuffle16bpcGather2(Shuffle16bpcArgs &args); +#endif void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]); #if USE_SIMD16_BUILDER @@ -726,7 +732,7 @@ void FetchJit::CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pB // only works if pixel size is <= 32bits SWR_ASSERT(info.bpp <= 32); - Value* pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask); + Value *pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask); for (uint32_t comp = 0; comp < 4; ++comp) { @@ -825,6 +831,9 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* vVertexElements[4]; #if USE_SIMD16_GATHERS Value* vVertexElements2[4]; +#if USE_SIMD16_BUILDER + Value *pVtxSrc2[4]; +#endif #endif Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex}); @@ -961,6 +970,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, #if 
USE_SIMD16_GATHERS // override cur indices with 0 if pitch is 0 Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0)); + vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices); vCurIndices2 = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices2); // are vertices partially OOB? @@ -983,7 +993,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, // only fetch lanes that pass both tests vGatherMask = AND(vMaxGatherMask, vMinGatherMask); - vGatherMask2 = AND(vMaxGatherMask, vMinGatherMask2); + vGatherMask2 = AND(vMaxGatherMask2, vMinGatherMask2); } else { @@ -1074,15 +1084,32 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, { if (isComponentEnabled(compMask, c)) { - vVertexElements[currentVertexElement] = pResults[c]; +#if USE_SIMD16_BUILDER + // pack adjacent pairs of SIMD8s into SIMD16s + pVtxSrc2[currentVertexElement] = VUNDEF2_F(); + pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], pResults[c], 0); + pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], pResults2[c], 1); + +#else + vVertexElements[currentVertexElement] = pResults[c]; vVertexElements2[currentVertexElement] = pResults2[c]; - currentVertexElement++; + +#endif + currentVertexElement += 1; if (currentVertexElement > 3) { +#if USE_SIMD16_BUILDER + // store SIMD16s + Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0)); + + StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2); + +#else StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements); StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2); +#endif outputElt += 1; // reset to the next vVertexElement to output @@ -1113,9 +1140,12 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, else if(info.type[0] == SWR_TYPE_FLOAT) { ///@todo: support 64 bit vb accesses - Value* gatherSrc = VIMMED1(0.0f); + Value *gatherSrc = VIMMED1(0.0f); #if USE_SIMD16_GATHERS - Value* gatherSrc2 
= VIMMED1(0.0f); + Value *gatherSrc2 = VIMMED1(0.0f); +#if USE_SIMD16_BUILDER + Value *gatherSrc16 = VIMMED2_1(0.0f); +#endif #endif SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format), @@ -1127,8 +1157,8 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, case 16: { #if USE_SIMD16_GATHERS - Value* vGatherResult[2]; - Value* vGatherResult2[2]; + Value *vGatherResult[2]; + Value *vGatherResult2[2]; // if we have at least one component out of x or y to fetch if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)) @@ -1140,6 +1170,11 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy // } + else + { + vGatherResult[0] = VUNDEF_I(); + vGatherResult2[0] = VUNDEF_I(); + } // if we have at least one component out of z or w to fetch if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) @@ -1154,11 +1189,35 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw // } - + else + { + vGatherResult[1] = VUNDEF_I(); + vGatherResult2[1] = VUNDEF_I(); + } // if we have at least one component to shuffle into place if (compMask) { +#if USE_SIMD16_BUILDER + Value *gatherResult[2]; + + gatherResult[0] = VUNDEF2_I(); + gatherResult[1] = VUNDEF2_I(); + + gatherResult[0] = INSERT2_I(gatherResult[0], vGatherResult[0], 0); + gatherResult[0] = INSERT2_I(gatherResult[0], vGatherResult2[0], 1); + + gatherResult[1] = INSERT2_I(gatherResult[1], vGatherResult[1], 0); + gatherResult[1] = INSERT2_I(gatherResult[1], vGatherResult2[1], 1); + + Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0)); + + Shuffle16bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, Instruction::CastOps::FPExt, CONVERT_NONE, + currentVertexElement, outputElt, compMask, compCtrl, pVtxSrc2); + + // Shuffle gathered components into place in simdvertex struct + Shuffle16bpcGather2(args); // 
outputs to vVertexElements ref +#else Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE, currentVertexElement, outputElt, compMask, compCtrl, vVertexElements); Shuffle16bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), Instruction::CastOps::FPExt, CONVERT_NONE, @@ -1167,6 +1226,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, // Shuffle gathered components into place in simdvertex struct Shuffle16bpcGather(args, false); // outputs to vVertexElements ref Shuffle16bpcGather(args2, true); // outputs to vVertexElements ref +#endif } #else Value* vGatherResult[2]; @@ -1209,12 +1269,6 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, break; case 32: { -#if USE_SIMD16_GATHERS -#if USE_SIMD16_BUILDER - Value *pVtxSrc2[4]; - -#endif -#endif for (uint32_t i = 0; i < 4; i += 1) { #if USE_SIMD16_GATHERS @@ -1231,10 +1285,6 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value *vShiftedOffsets = VPSRLI(vOffsets, C(1)); Value *vShiftedOffsets2 = VPSRLI(vOffsets2, C(1)); #if USE_SIMD16_BUILDER - Value *src = VUNDEF2_F(); - src = INSERT2_F(src, gatherSrc, 0); - src = INSERT2_F(src, gatherSrc2, 1); - Value *indices = VUNDEF2_I(); indices = INSERT2_I(indices, vShiftedOffsets, 0); indices = INSERT2_I(indices, vShiftedOffsets2, 1); @@ -1243,12 +1293,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, mask = INSERT2_I(mask, vGatherMask, 0); mask = INSERT2_I(mask, vGatherMask2, 1); - pVtxSrc2[currentVertexElement] = GATHERPS2(src, pStreamBase, indices, mask, 2); -#if 1 - - vVertexElements[currentVertexElement] = EXTRACT2_F(pVtxSrc2[currentVertexElement], 0); - vVertexElements2[currentVertexElement] = EXTRACT2_F(pVtxSrc2[currentVertexElement], 1); -#endif + pVtxSrc2[currentVertexElement] = GATHERPS2(gatherSrc16, pStreamBase, indices, mask, 2); #else vVertexElements[currentVertexElement] = 
GATHERPS(gatherSrc, pStreamBase, vShiftedOffsets, vGatherMask, 2); vVertexElements2[currentVertexElement] = GATHERPS(gatherSrc2, pStreamBase, vShiftedOffsets2, vGatherMask2, 2); @@ -1384,24 +1429,45 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, C({ 0, 1, 2, 3, 4, 5, 6, 7 })); Value *pGather2 = VSHUFFLE(pGatherLo2, pGatherHi2, C({ 0, 1, 2, 3, 4, 5, 6, 7 })); - vVertexElements[currentVertexElement] = pGather; +#if USE_SIMD16_BUILDER + // pack adjacent pairs of SIMD8s into SIMD16s + pVtxSrc2[currentVertexElement] = VUNDEF2_F(); + pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], pGather, 0); + pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], pGather2, 1); + +#else + vVertexElements[currentVertexElement] = pGather; vVertexElements2[currentVertexElement] = pGather2; +#endif currentVertexElement += 1; } else { +#if USE_SIMD16_BUILDER + pVtxSrc2[currentVertexElement] = GenerateCompCtrlVector2(compCtrl[i]); + +#else vVertexElements[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], false); vVertexElements2[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], true); +#endif currentVertexElement += 1; } if (currentVertexElement > 3) { +#if USE_SIMD16_BUILDER + // store SIMD16s + Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0)); + + StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2); + +#else StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements); StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2); +#endif outputElt += 1; // reset to the next vVertexElement to output @@ -1522,10 +1588,25 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, #if USE_SIMD16_GATHERS Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask); Value* vGatherResult2 = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2); + // e.g. 
result of an 8x32bit integer gather for 8bit components // 256i - 0 1 2 3 4 5 6 7 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw +#if USE_SIMD16_BUILDER + Value *gatherResult = VUNDEF2_I(); + + gatherResult = INSERT2_I(gatherResult, vGatherResult, 0); + gatherResult = INSERT2_I(gatherResult, vGatherResult2, 1); + + Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0)); + + Shuffle8bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, extendCastType, conversionType, + currentVertexElement, outputElt, compMask, compCtrl, pVtxSrc2, info.swizzle); + + // Shuffle gathered components into place in simdvertex struct + Shuffle8bpcGatherd2(args); // outputs to vVertexElements ref +#else Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType, currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle); Shuffle8bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), extendCastType, conversionType, @@ -1534,6 +1615,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, // Shuffle gathered components into place in simdvertex struct Shuffle8bpcGatherd(args, false); // outputs to vVertexElements ref Shuffle8bpcGatherd(args2, true); // outputs to vVertexElements ref +#endif #else Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask); // e.g. 
result of an 8x32bit integer gather for 8bit components @@ -1569,6 +1651,11 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy // } + else + { + vGatherResult[0] = VUNDEF_I(); + vGatherResult2[0] = VUNDEF_I(); + } // if we have at least one component out of z or w to fetch if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) @@ -1583,10 +1670,35 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw // } + else + { + vGatherResult[1] = VUNDEF_I(); + vGatherResult2[1] = VUNDEF_I(); + } // if we have at least one component to shuffle into place if (compMask) { +#if USE_SIMD16_BUILDER + Value *gatherResult[2]; + + gatherResult[0] = VUNDEF2_I(); + gatherResult[1] = VUNDEF2_I(); + + gatherResult[0] = INSERT2_I(gatherResult[0], vGatherResult[0], 0); + gatherResult[0] = INSERT2_I(gatherResult[0], vGatherResult2[0], 1); + + gatherResult[1] = INSERT2_I(gatherResult[1], vGatherResult[1], 0); + gatherResult[1] = INSERT2_I(gatherResult[1], vGatherResult2[1], 1); + + Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0)); + + Shuffle16bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, extendCastType, conversionType, + currentVertexElement, outputElt, compMask, compCtrl, pVtxSrc2); + + // Shuffle gathered components into place in simdvertex struct + Shuffle16bpcGather2(args); // outputs to vVertexElements ref +#else Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType, currentVertexElement, outputElt, compMask, compCtrl, vVertexElements); Shuffle16bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), extendCastType, conversionType, @@ -1595,6 +1707,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, // Shuffle gathered components into place in simdvertex struct Shuffle16bpcGather(args, false); 
// outputs to vVertexElements ref Shuffle16bpcGather(args2, true); // outputs to vVertexElements ref +#endif } #else Value* vGatherResult[2]; @@ -1665,8 +1778,18 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, pGather2 = FMUL(SI_TO_FP(pGather2, mSimdFP32Ty), VBROADCAST(C(1 / 65536.0f))); } +#if USE_SIMD16_BUILDER + // pack adjacent pairs of SIMD8s into SIMD16s + pVtxSrc2[currentVertexElement] = VUNDEF2_F(); + pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], pGather, 0); + pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], pGather2, 1); + +#else vVertexElements[currentVertexElement] = pGather; vVertexElements2[currentVertexElement] = pGather2; + +#endif + // e.g. result of a single 8x32bit integer gather for 32bit components // 256i - 0 1 2 3 4 5 6 7 // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx @@ -1698,9 +1821,14 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, { #if USE_SIMD16_SHADERS #if USE_SIMD16_GATHERS +#if USE_SIMD16_BUILDER + pVtxSrc2[currentVertexElement] = GenerateCompCtrlVector2(compCtrl[i]); + +#else vVertexElements[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], false); vVertexElements2[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], true); +#endif currentVertexElement += 1; #else vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2); @@ -1713,9 +1841,17 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, if (currentVertexElement > 3) { #if USE_SIMD16_GATHERS +#if USE_SIMD16_BUILDER + // store SIMD16s + Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0)); + + StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2); + +#else StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements); StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2); +#endif outputElt += 1; #else StoreVertexElements(pVtxOut, outputElt++, 4, 
vVertexElements); @@ -1740,9 +1876,17 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, if (currentVertexElement > 0) { #if USE_SIMD16_GATHERS +#if USE_SIMD16_BUILDER + // store SIMD16s + Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0)); + + StoreVertexElements2(pVtxOut2, outputElt, currentVertexElement, pVtxSrc2); + +#else StoreVertexElements(pVtxOut, outputElt, currentVertexElement, vVertexElements); StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, currentVertexElement, vVertexElements2); +#endif outputElt += 1; #else StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements); @@ -2092,6 +2236,251 @@ void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args) } } +#if USE_SIMD16_BUILDER +void FetchJit::Shuffle8bpcGatherd2(Shuffle8bpcArgs &args) +{ + // Unpack tuple args + Value*& vGatherResult = std::get<0>(args); + Value* pVtxOut = std::get<1>(args); + const Instruction::CastOps extendType = std::get<2>(args); + const ConversionType conversionType = std::get<3>(args); + uint32_t &currentVertexElement = std::get<4>(args); + uint32_t &outputElt = std::get<5>(args); + const ComponentEnable compMask = std::get<6>(args); + const ComponentControl(&compCtrl)[4] = std::get<7>(args); + Value* (&vVertexElements)[4] = std::get<8>(args); + const uint32_t(&swizzle)[4] = std::get<9>(args); + + // cast types + Type *vGatherTy = mSimdInt32Ty; + Type *v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits + + // have to do extra work for sign extending + if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)) + { + Type *v16x8Ty = VectorType::get(mInt8Ty, mVWidth * 2); // 8x16bit ints in a 128bit lane + Type *v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits + + // shuffle mask, including any swizzling + const char x = (char)swizzle[0]; const char y = (char)swizzle[1]; +
const char z = (char)swizzle[2]; const char w = (char)swizzle[3]; + Value *vConstMask = C({ char(x), char(x + 4), char(x + 8), char(x + 12), + char(y), char(y + 4), char(y + 8), char(y + 12), + char(z), char(z + 4), char(z + 8), char(z + 12), + char(w), char(w + 4), char(w + 8), char(w + 12), + char(x), char(x + 4), char(x + 8), char(x + 12), + char(y), char(y + 4), char(y + 8), char(y + 12), + char(z), char(z + 4), char(z + 8), char(z + 12), + char(w), char(w + 4), char(w + 8), char(w + 12) }); + + // SIMD16 PSHUFB isnt part of AVX-512F, so split into SIMD8 for the sake of KNL, for now.. + + Value *vGatherResult_lo = EXTRACT2_I(vGatherResult, 0); + Value *vGatherResult_hi = EXTRACT2_I(vGatherResult, 1); + + Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy); + Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy); + + // after pshufb: group components together in each 128bit lane + // 256i - 0 1 2 3 4 5 6 7 + // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww + + Value *vi128XY_lo = nullptr; + Value *vi128XY_hi = nullptr; + if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)) + { + vi128XY_lo = BITCAST(PERMD(vShufResult_lo, C({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty); + vi128XY_hi = BITCAST(PERMD(vShufResult_hi, C({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty); + + // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane + // 256i - 0 1 2 3 4 5 6 7 + // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care) + } + + // do the same for zw components + Value *vi128ZW_lo = nullptr; + Value *vi128ZW_hi = nullptr; + if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) + { + vi128ZW_lo = BITCAST(PERMD(vShufResult_lo, C({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty); + vi128ZW_hi = BITCAST(PERMD(vShufResult_hi, C({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty); + } + + // init denormalize variables if needed + Instruction::CastOps fpCast; + Value 
*conversionFactor; + + switch (conversionType) + { + case CONVERT_NORMALIZED: + fpCast = Instruction::CastOps::SIToFP; + conversionFactor = VIMMED1((float)(1.0 / 127.0)); + break; + case CONVERT_SSCALED: + fpCast = Instruction::CastOps::SIToFP; + conversionFactor = VIMMED1((float)(1.0)); + break; + case CONVERT_USCALED: + SWR_INVALID("Type should not be sign extended!"); + conversionFactor = nullptr; + break; + default: + SWR_ASSERT(conversionType == CONVERT_NONE); + conversionFactor = nullptr; + break; + } + + // sign extend all enabled components. If we have a fill vVertexElements, output to current simdvertex + for (uint32_t i = 0; i < 4; i++) + { + if (isComponentEnabled(compMask, i)) + { + if (compCtrl[i] == ComponentControl::StoreSrc) + { + // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1 + uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1; + // if x or y, use vi128XY permute result, else use vi128ZW + Value *selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo; + Value *selectedPermute_hi = (i < 2) ? 
vi128XY_hi : vi128ZW_hi; + + // sign extend + Value *temp_lo = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v16x8Ty)); + Value *temp_hi = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v16x8Ty)); + + // denormalize if needed + if (conversionType != CONVERT_NONE) + { + temp_lo = FMUL(CAST(fpCast, temp_lo, mSimdFP32Ty), conversionFactor); + temp_hi = FMUL(CAST(fpCast, temp_hi, mSimdFP32Ty), conversionFactor); + } + + vVertexElements[currentVertexElement] = VUNDEF2_F(); + vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_lo, 0); + vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_hi, 1); + + currentVertexElement += 1; + } + else + { + vVertexElements[currentVertexElement++] = GenerateCompCtrlVector2(compCtrl[i]); + } + + if (currentVertexElement > 3) + { + StoreVertexElements2(pVtxOut, outputElt++, 4, vVertexElements); + // reset to the next vVertexElement to output + currentVertexElement = 0; + } + } + } + } + // else zero extend + else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP)) + { + // init denormalize variables if needed + Instruction::CastOps fpCast; + Value *conversionFactor; + + switch (conversionType) + { + case CONVERT_NORMALIZED: + fpCast = Instruction::CastOps::UIToFP; + conversionFactor = VIMMED1((float)(1.0 / 255.0)); + break; + case CONVERT_USCALED: + fpCast = Instruction::CastOps::UIToFP; + conversionFactor = VIMMED1((float)(1.0)); + break; + case CONVERT_SSCALED: + SWR_INVALID("Type should not be zero extended!"); + conversionFactor = nullptr; + break; + default: + SWR_ASSERT(conversionType == CONVERT_NONE); + conversionFactor = nullptr; + break; + } + + // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits + for (uint32_t i = 0; i < 4; i++) + { + if (isComponentEnabled(compMask, i)) + { + if (compCtrl[i] == ComponentControl::StoreSrc) + { + // pshufb masks 
for each component + Value *vConstMask; + switch (swizzle[i]) + { + case 0: + // x shuffle mask + vConstMask = C({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1, + 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 }); + break; + case 1: + // y shuffle mask + vConstMask = C({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1, + 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 }); + break; + case 2: + // z shuffle mask + vConstMask = C({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1, + 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 }); + break; + case 3: + // w shuffle mask + vConstMask = C({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1, + 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 }); + break; + default: + vConstMask = nullptr; + break; + } + + Value *vGatherResult_lo = EXTRACT2_I(vGatherResult, 0); + Value *vGatherResult_hi = EXTRACT2_I(vGatherResult, 1); + + Value *temp_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy); + Value *temp_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy); + + // after pshufb for x channel + // 256i - 0 1 2 3 4 5 6 7 + // x000 x000 x000 x000 x000 x000 x000 x000 + + // denormalize if needed + if (conversionType != CONVERT_NONE) + { + temp_lo = FMUL(CAST(fpCast, temp_lo, mSimdFP32Ty), conversionFactor); + temp_hi = FMUL(CAST(fpCast, temp_hi, mSimdFP32Ty), conversionFactor); + } + + vVertexElements[currentVertexElement] = VUNDEF2_F(); + vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_lo, 0); + vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_hi, 1); + + currentVertexElement += 1; + } + else + { + vVertexElements[currentVertexElement++] = GenerateCompCtrlVector2(compCtrl[i]); + } + + if (currentVertexElement > 3) + { + StoreVertexElements2(pVtxOut, outputElt++, 4, vVertexElements); + // 
reset to the next vVertexElement to output + currentVertexElement = 0; + } + } + } + } + else + { + SWR_INVALID("Unsupported conversion type"); + } +} + +#endif ////////////////////////////////////////////////////////////////////////// /// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends, /// denormalizes if needed, converts to F32 if needed, and positions in @@ -2318,6 +2707,272 @@ void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args) } } +#if USE_SIMD16_BUILDER +void FetchJit::Shuffle16bpcGather2(Shuffle16bpcArgs &args) +{ + // Unpack tuple args + Value* (&vGatherResult)[2] = std::get<0>(args); + Value* pVtxOut = std::get<1>(args); + const Instruction::CastOps extendType = std::get<2>(args); + const ConversionType conversionType = std::get<3>(args); + uint32_t &currentVertexElement = std::get<4>(args); + uint32_t &outputElt = std::get<5>(args); + const ComponentEnable compMask = std::get<6>(args); + const ComponentControl(&compCtrl)[4] = std::get<7>(args); + Value* (&vVertexElements)[4] = std::get<8>(args); + + // cast types + Type *vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth); + Type *v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits + + // have to do extra work for sign extending + if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt)) + { + // is this PP float? + bool bFP = (extendType == Instruction::CastOps::FPExt) ?
true : false; + + Type *v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane + Type *v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits + + // shuffle mask + Value *vConstMask = C({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 }); + Value *vi128XY = nullptr; + Value *vi128XY_lo = nullptr; + Value *vi128XY_hi = nullptr; + if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)) + { + // SIMD16 PSHUFB isnt part of AVX-512F, so split into SIMD8 for the sake of KNL, for now.. + + Value *vGatherResult_lo = EXTRACT2_I(vGatherResult[0], 0); + Value *vGatherResult_hi = EXTRACT2_I(vGatherResult[0], 1); + + Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy); + Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy); + + // after pshufb: group components together in each 128bit lane + // 256i - 0 1 2 3 4 5 6 7 + // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy + + vi128XY_lo = BITCAST(PERMD(vShufResult_lo, C({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy); + vi128XY_hi = BITCAST(PERMD(vShufResult_hi, C({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy); + + // after PERMD: move and pack xy components into each 128bit lane + // 256i - 0 1 2 3 4 5 6 7 + // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy +#if 0 + + vi128XY = VUNDEF2_I(); + vi128XY = INSERT2_I(vi128XY, vi128XY_lo, 0); + vi128XY = INSERT2_I(vi128XY, vi128XY_hi, 1); +#endif + } + + // do the same for zw components + Value *vi128ZW = nullptr; + Value *vi128ZW_lo = nullptr; + Value *vi128ZW_hi = nullptr; + if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) + { + Value *vGatherResult_lo = EXTRACT2_I(vGatherResult[1], 0); + Value *vGatherResult_hi = EXTRACT2_I(vGatherResult[1], 1); + + Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), 
vGatherTy); + Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy); + + vi128ZW_lo = BITCAST(PERMD(vShufResult_lo, C({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy); + vi128ZW_hi = BITCAST(PERMD(vShufResult_hi, C({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy); +#if 0 + + vi128ZW = VUNDEF2_I(); + vi128ZW = INSERT2_I(vi128ZW, vi128ZW_lo, 0); + vi128ZW = INSERT2_I(vi128ZW, vi128ZW_hi, 1); +#endif + } + + // init denormalize variables if needed + Instruction::CastOps IntToFpCast; + Value *conversionFactor; + + switch (conversionType) + { + case CONVERT_NORMALIZED: + IntToFpCast = Instruction::CastOps::SIToFP; + conversionFactor = VIMMED1((float)(1.0 / 32767.0)); + break; + case CONVERT_SSCALED: + IntToFpCast = Instruction::CastOps::SIToFP; + conversionFactor = VIMMED1((float)(1.0)); + break; + case CONVERT_USCALED: + SWR_INVALID("Type should not be sign extended!"); + conversionFactor = nullptr; + break; + default: + SWR_ASSERT(conversionType == CONVERT_NONE); + conversionFactor = nullptr; + break; + } + + // sign extend all enabled components. If we have a fill vVertexElements, output to current simdvertex + for (uint32_t i = 0; i < 4; i++) + { + if (isComponentEnabled(compMask, i)) + { + if (compCtrl[i] == ComponentControl::StoreSrc) + { + // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1 + uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1; + // if x or y, use vi128XY permute result, else use vi128ZW + Value *selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo; + Value *selectedPermute_hi = (i < 2) ? 
vi128XY_hi : vi128ZW_hi; + + if (bFP) + { + // extract 128 bit lanes to sign extend each component + Value *temp_lo = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty)); + Value *temp_hi = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty)); + + vVertexElements[currentVertexElement] = VUNDEF2_F(); + vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_lo, 0); + vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_hi, 1); + } + else + { + // extract 128 bit lanes to sign extend each component + Value *temp_lo = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty)); + Value *temp_hi = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty)); + + // denormalize if needed + if (conversionType != CONVERT_NONE) + { + temp_lo = FMUL(CAST(IntToFpCast, temp_lo, mSimdFP32Ty), conversionFactor); + temp_hi = FMUL(CAST(IntToFpCast, temp_hi, mSimdFP32Ty), conversionFactor); + } + + vVertexElements[currentVertexElement] = VUNDEF2_F(); + vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_lo, 0); + vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_hi, 1); + } + + currentVertexElement += 1; + } + else + { + vVertexElements[currentVertexElement++] = GenerateCompCtrlVector2(compCtrl[i]); + } + + if (currentVertexElement > 3) + { + StoreVertexElements2(pVtxOut, outputElt++, 4, vVertexElements); + // reset to the next vVertexElement to output + currentVertexElement = 0; + } + } + } + } + // else zero extend + else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP)) + { + // pshufb masks for each component + Value *vConstMask[2]; + + if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)) + { + // x/z shuffle mask + vConstMask[0] = C({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, + 0, 1, -1, -1, 4, 5, 
-1, -1, 8, 9, -1, -1, 12, 13, -1, -1, }); + } + + if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)) + { + // y/w shuffle mask + vConstMask[1] = C({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1, + 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 }); + } + + // init denormalize variables if needed + Instruction::CastOps fpCast; + Value* conversionFactor; + + switch (conversionType) + { + case CONVERT_NORMALIZED: + fpCast = Instruction::CastOps::UIToFP; + conversionFactor = VIMMED1((float)(1.0 / 65535.0)); + break; + case CONVERT_USCALED: + fpCast = Instruction::CastOps::UIToFP; + conversionFactor = VIMMED1((float)(1.0f)); + break; + case CONVERT_SSCALED: + SWR_INVALID("Type should not be zero extended!"); + conversionFactor = nullptr; + break; + default: + SWR_ASSERT(conversionType == CONVERT_NONE); + conversionFactor = nullptr; + break; + } + + // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits + for (uint32_t i = 0; i < 4; i++) + { + if (isComponentEnabled(compMask, i)) + { + if (compCtrl[i] == ComponentControl::StoreSrc) + { + // select correct constMask for x/z or y/w pshufb + uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1; + // if x or y, use vi128XY permute result, else use vi128ZW + uint32_t selectedGather = (i < 2) ? 0 : 1; + + // SIMD16 PSHUFB isnt part of AVX-512F, so split into SIMD8 for the sake of KNL, for now.. 
+ + Value *vGatherResult_lo = EXTRACT2_I(vGatherResult[selectedGather], 0); + Value *vGatherResult_hi = EXTRACT2_I(vGatherResult[selectedGather], 1); + + Value *temp_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask[selectedMask]), vGatherTy); + Value *temp_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask[selectedMask]), vGatherTy); + + // after pshufb mask for x channel; z uses the same shuffle from the second gather + // 256i - 0 1 2 3 4 5 6 7 + // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00 + + // denormalize if needed + if (conversionType != CONVERT_NONE) + { + temp_lo = FMUL(CAST(fpCast, temp_lo, mSimdFP32Ty), conversionFactor); + temp_hi = FMUL(CAST(fpCast, temp_hi, mSimdFP32Ty), conversionFactor); + } + + vVertexElements[currentVertexElement] = VUNDEF2_F(); + vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_lo, 0); + vVertexElements[currentVertexElement] = INSERT2_F(vVertexElements[currentVertexElement], temp_hi, 1); + + currentVertexElement += 1; + } + else + { + vVertexElements[currentVertexElement++] = GenerateCompCtrlVector2(compCtrl[i]); + } + + if (currentVertexElement > 3) + { + StoreVertexElements2(pVtxOut, outputElt++, 4, vVertexElements); + // reset to the next vVertexElement to output + currentVertexElement = 0; + } + } + } + } + else + { + SWR_INVALID("Unsupported conversion type"); + } +} + +#endif ////////////////////////////////////////////////////////////////////////// /// @brief Output a simdvertex worth of elements to the current outputElt /// @param pVtxOut - base address of VIN output struct @@ -2438,7 +3093,14 @@ Value* FetchJit::GenerateCompCtrlVector2(const ComponentControl ctrl) case Store1Int: return VIMMED2_1(1); case StoreVertexId: { - Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimd2FP32Ty); + Value* pId = VUNDEF2_F(); + + Value* pId_lo = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty); + 
Value* pId_hi = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 })), mSimdFP32Ty); + + pId = INSERT2_F(pId, pId_lo, 0); + pId = INSERT2_F(pId, pId_hi, 1); + return VBROADCAST2(pId); } case StoreInstanceId: