From 2559f2b93edc74d943fa1441433288a92263f854 Mon Sep 17 00:00:00 2001 From: Tim Rowley Date: Mon, 16 Oct 2017 18:39:41 -0500 Subject: [PATCH] swr/rast: Widen fetch shader to SIMD16 (disabled for now) Refactored the gather operation to process 16 elements at a time via paired SIMD8 operations. Reviewed-by: Bruce Cherniak --- .../swr/rasterizer/jitter/fetch_jit.cpp | 441 +++++++++++++++++- 1 file changed, 428 insertions(+), 13 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp index 1e3db902bb6..30dbcfc8ce1 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp @@ -89,7 +89,13 @@ struct FetchJit : public Builder void JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut); #if USE_SIMD16_SHADERS +#define USE_SIMD16_GATHERS 0 + +#if USE_SIMD16_GATHERS + void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value *streams, Value *vIndices, Value *vIndices2, Value *pVtxOut, bool useVertexID2); +#else void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut, bool useVertexID2); +#endif #else void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut); #endif @@ -279,8 +285,12 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState) } else { +#if USE_SIMD16_GATHERS + JitGatherVertices(fetchState, streams, vIndices, vIndices2, pVtxOut, false); +#else JitGatherVertices(fetchState, streams, vIndices, pVtxOut, false); JitGatherVertices(fetchState, streams, vIndices2, GEP(pVtxOut, C(1)), true); +#endif } #else (fetchState.bDisableVGATHER) ? 
JitLoadVertices(fetchState, streams, vIndices, pVtxOut) @@ -792,8 +802,13 @@ void FetchJit::ConvertFormat(SWR_FORMAT format, Value *texels[4]) /// @param vIndices - vector value of indices to gather /// @param pVtxOut - value pointer to output simdvertex struct #if USE_SIMD16_SHADERS +#if USE_SIMD16_GATHERS +void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, + Value *streams, Value *vIndices, Value *vIndices2, Value *pVtxOut, bool useVertexID2) +#else void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut, bool useVertexID2) +#endif #else void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut) @@ -802,6 +817,9 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, uint32_t currentVertexElement = 0; uint32_t outputElt = 0; Value* vVertexElements[4]; +#if USE_SIMD16_GATHERS + Value* vVertexElements2[4]; +#endif Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex}); Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance}); @@ -809,7 +827,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex})); curInstance->setName("curInstance"); - for(uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; ++nInputElt) + for (uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; nInputElt += 1) { const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt]; @@ -836,7 +854,8 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, maxVertex = LOAD(maxVertex); Value *minVertex = NULL; - if (fetchState.bPartialVertexBuffer) { + if (fetchState.bPartialVertexBuffer) + { // min vertex index for low bounds OOB checking minVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_minVertex)}); minVertex = LOAD(minVertex); @@ -849,10 +868,13 
@@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, } Value *vCurIndices; +#if USE_SIMD16_GATHERS + Value *vCurIndices2; +#endif Value *startOffset; Value *vInstanceStride = VIMMED1(0); - if(ied.InstanceEnable) + if (ied.InstanceEnable) { Value* stepRate = C(ied.InstanceAdvancementState); @@ -867,6 +889,9 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, calcInstance = SELECT(isNonZeroStep, calcInstance, C(0)); vCurIndices = VBROADCAST(calcInstance); +#if USE_SIMD16_GATHERS + vCurIndices2 = VBROADCAST(calcInstance); +#endif startOffset = startInstance; } @@ -878,6 +903,9 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, // offset indices by baseVertex vCurIndices = ADD(vIndices, vBaseVertex); +#if USE_SIMD16_GATHERS + vCurIndices2 = ADD(vIndices2, vBaseVertex); +#endif startOffset = startVertex; SWR_ASSERT((0), "TODO: Fill out more once driver sends this down."); @@ -886,6 +914,9 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, { // offset indices by baseVertex vCurIndices = ADD(vIndices, vBaseVertex); +#if USE_SIMD16_GATHERS + vCurIndices2 = ADD(vIndices2, vBaseVertex); +#endif startOffset = startVertex; } @@ -903,7 +934,8 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, // if we have a negative value, we're already OOB. clamp at 0. 
maxVertex = SELECT(maxNeg, C(0), TRUNC(maxVertex, mInt32Ty)); - if (fetchState.bPartialVertexBuffer) { + if (fetchState.bPartialVertexBuffer) + { // similary for min vertex minVertex = SUB(Z_EXT(minVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty)); Value *minNeg = ICMP_SLT(minVertex, C((int64_t)0)); @@ -920,6 +952,61 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, // is the element is <= the partially valid size Value* vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets)); +#if USE_SIMD16_GATHERS + // override cur indices with 0 if pitch is 0; NOTE(review): only vCurIndices2 is overridden here, while the non-SIMD16 #else path overrides vCurIndices - confirm whether vCurIndices needs the same override + Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0)); + vCurIndices2 = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices2); + + // are vertices partially OOB? + Value* vMaxVertex = VBROADCAST(maxVertex); + Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex); + Value* vPartialOOBMask2 = ICMP_EQ(vCurIndices2, vMaxVertex); + + // are vertices fully in bounds? + Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex); + Value* vMaxGatherMask2 = ICMP_ULT(vCurIndices2, vMaxVertex); + + Value *vGatherMask; + Value *vGatherMask2; + if (fetchState.bPartialVertexBuffer) + { + // are vertices below minVertex limit?
+ Value *vMinVertex = VBROADCAST(minVertex); + Value *vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex); + Value *vMinGatherMask2 = ICMP_UGE(vCurIndices2, vMinVertex); + + // only fetch lanes that pass both tests; NOTE(review): vGatherMask2 below ANDs vMaxGatherMask rather than vMaxGatherMask2 - looks like a copy/paste slip, confirm + vGatherMask = AND(vMaxGatherMask, vMinGatherMask); + vGatherMask2 = AND(vMaxGatherMask, vMinGatherMask2); + } + else + { + vGatherMask = vMaxGatherMask; + vGatherMask2 = vMaxGatherMask2; + } + + // blend in any partially OOB indices that have valid elements + vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask); + vGatherMask2 = SELECT(vPartialOOBMask2, vElementInBoundsMask, vGatherMask2); + Value *pMask = vGatherMask; + Value *pMask2 = vGatherMask2; + vGatherMask = VMASK(vGatherMask); + vGatherMask2 = VMASK(vGatherMask2); + + // calculate the actual offsets into the VB + Value* vOffsets = MUL(vCurIndices, vStride); + vOffsets = ADD(vOffsets, vAlignmentOffsets); + + Value* vOffsets2 = MUL(vCurIndices2, vStride); + vOffsets2 = ADD(vOffsets2, vAlignmentOffsets); + + // if instance stride enable is: + // true - add product of the instanceID and advancement state to the offset into the VB + // false - value of vInstanceStride has been initialized to zero + vOffsets = ADD(vOffsets, vInstanceStride); + vOffsets2 = ADD(vOffsets2, vInstanceStride); + +#else // override cur indices with 0 if pitch is 0 Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0)); vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices); @@ -932,14 +1019,17 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex); Value *vGatherMask; - if (fetchState.bPartialVertexBuffer) { + if (fetchState.bPartialVertexBuffer) + { // are vertices below minVertex limit?
Value *vMinVertex = VBROADCAST(minVertex); Value *vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex); // only fetch lanes that pass both tests vGatherMask = AND(vMaxGatherMask, vMinGatherMask); - } else { + } + else + { vGatherMask = vMaxGatherMask; } @@ -957,6 +1047,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, // false - value of vInstanceStride has been initialialized to zero vOffsets = ADD(vOffsets, vInstanceStride); +#endif // Packing and component control ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking; const ComponentControl compCtrl[4] { (ComponentControl)ied.ComponentControl0, (ComponentControl)ied.ComponentControl1, @@ -965,6 +1056,35 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, // Special gather/conversion for formats without equal component sizes if (IsOddFormat((SWR_FORMAT)ied.Format)) { +#if USE_SIMD16_GATHERS + Value *pResults[4]; + Value *pResults2[4]; + CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults); + CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask2, pStreamBase, vOffsets2, pResults2); + ConvertFormat((SWR_FORMAT)ied.Format, pResults); + ConvertFormat((SWR_FORMAT)ied.Format, pResults2); + + for (uint32_t c = 0; c < 4; c += 1) + { + if (isComponentEnabled(compMask, c)) + { + vVertexElements[currentVertexElement] = pResults[c]; + vVertexElements2[currentVertexElement] = pResults2[c]; + currentVertexElement++; + + if (currentVertexElement > 3) + { + StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements); + StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2); + + outputElt += 1; + + // reset to the next vVertexElement to output + currentVertexElement = 0; + } + } + } +#else Value* pResults[4]; CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults); ConvertFormat((SWR_FORMAT)ied.Format, pResults); @@ -982,20 +1102,75 @@ void FetchJit::JitGatherVertices(const 
FETCH_COMPILE_STATE &fetchState, } } } +#endif } else if(info.type[0] == SWR_TYPE_FLOAT) { ///@todo: support 64 bit vb accesses Value* gatherSrc = VIMMED1(0.0f); +#if USE_SIMD16_GATHERS + Value* gatherSrc2 = VIMMED1(0.0f); +#endif SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format), "Unsupported format for standard gather fetch."); // Gather components from memory to store in a simdvertex structure - switch(bpc) + switch (bpc) { case 16: { +#if USE_SIMD16_GATHERS + Value* vGatherResult[2]; + Value* vGatherResult2[2]; + Value *vMask; + Value *vMask2; + + // if we have at least one component out of x or y to fetch + if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)) + { + // save mask as it is zero'd out after each gather + vMask = vGatherMask; + vMask2 = vGatherMask2; + + vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1)); + vGatherResult2[0] = GATHERPS(gatherSrc2, pStreamBase, vOffsets2, vMask2, C((char)1)); + // e.g. result of first 8x32bit integer gather for 16bit components + // 256i - 0 1 2 3 4 5 6 7 + // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy + // + } + + // if we have at least one component out of z or w to fetch + if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) + { + // offset base to the next components(zw) in the vertex to gather + pStreamBase = GEP(pStreamBase, C((char)4)); + vMask = vGatherMask; + vMask2 = vGatherMask2; + + vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1)); + vGatherResult2[1] = GATHERPS(gatherSrc2, pStreamBase, vOffsets2, vMask2, C((char)1)); + // e.g. 
result of second 8x32bit integer gather for 16bit components + // 256i - 0 1 2 3 4 5 6 7 + // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw + // + } + + + // if we have at least one component to shuffle into place + if (compMask) + { + Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE, + currentVertexElement, outputElt, compMask, compCtrl, vVertexElements); + Shuffle16bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), Instruction::CastOps::FPExt, CONVERT_NONE, + currentVertexElement, outputElt, compMask, compCtrl, vVertexElements2); + + // Shuffle gathered components into place in simdvertex struct + Shuffle16bpcGather(args, false); // outputs to vVertexElements ref + Shuffle16bpcGather(args2, true); // outputs to vVertexElements2 ref + } +#else Value* vGatherResult[2]; Value *vMask; @@ -1036,12 +1211,58 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Shuffle16bpcGather(args); // outputs to vVertexElements ref #endif } +#endif } break; case 32: { - for (uint32_t i = 0; i < 4; i++) + for (uint32_t i = 0; i < 4; i += 1) { +#if USE_SIMD16_GATHERS + if (isComponentEnabled(compMask, i)) + { + // if we need to gather the component + if (compCtrl[i] == StoreSrc) + { + // save mask as it is zero'd out after each gather + Value *vMask = vGatherMask; + Value *vMask2 = vGatherMask2; + + // Gather a SIMD of vertices + // APIs allow a 4GB range for offsets + // However, GATHERPS uses signed 32-bit offsets, so only a 2GB range :( + // But, we know that elements must be aligned for FETCH. :) + // Right shift the offset by a bit and then scale by 2 to remove the sign extension.
+ Value *vShiftedOffsets = VPSRLI(vOffsets, C(1)); + Value *vShiftedOffsets2 = VPSRLI(vOffsets2, C(1)); + vVertexElements[currentVertexElement] = GATHERPS(gatherSrc, pStreamBase, vShiftedOffsets, vMask, C((char)2)); + vVertexElements2[currentVertexElement] = GATHERPS(gatherSrc2, pStreamBase, vShiftedOffsets2, vMask2, C((char)2)); + + currentVertexElement += 1; + } + else + { + vVertexElements[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], false); + vVertexElements2[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], true); + + currentVertexElement += 1; + } + + if (currentVertexElement > 3) + { + StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements); + StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2); + + outputElt += 1; + + // reset to the next vVertexElement to output + currentVertexElement = 0; + } + } + + // offset base to the next component in the vertex to gather + pStreamBase = GEP(pStreamBase, C((char)4)); +#else if (isComponentEnabled(compMask, i)) { // if we need to gather the component @@ -1073,18 +1294,85 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, // reset to the next vVertexElement to output currentVertexElement = 0; } - } // offset base to the next component in the vertex to gather pStreamBase = GEP(pStreamBase, C((char)4)); +#endif } } break; case 64: { - for (uint32_t i = 0; i < 4; i++) + for (uint32_t i = 0; i < 4; i += 1) { +#if USE_SIMD16_GATHERS + if (isComponentEnabled(compMask, i)) + { + // if we need to gather the component + if (compCtrl[i] == StoreSrc) + { + Value *vMaskLo = VSHUFFLE(pMask, VUNDEF(mInt1Ty, 8), C({ 0, 1, 2, 3 })); + Value *vMaskLo2 = VSHUFFLE(pMask2, VUNDEF(mInt1Ty, 8), C({ 0, 1, 2, 3 })); + Value *vMaskHi = VSHUFFLE(pMask, VUNDEF(mInt1Ty, 8), C({ 4, 5, 6, 7 })); + Value *vMaskHi2 = VSHUFFLE(pMask2, VUNDEF(mInt1Ty, 8), C({ 4, 5, 6, 7 })); + vMaskLo = S_EXT(vMaskLo, VectorType::get(mInt64Ty, 4)); + vMaskLo2 = S_EXT(vMaskLo2, 
VectorType::get(mInt64Ty, 4)); + vMaskHi = S_EXT(vMaskHi, VectorType::get(mInt64Ty, 4)); + vMaskHi2 = S_EXT(vMaskHi2, VectorType::get(mInt64Ty, 4)); + vMaskLo = BITCAST(vMaskLo, VectorType::get(mDoubleTy, 4)); + vMaskLo2 = BITCAST(vMaskLo2, VectorType::get(mDoubleTy, 4)); + vMaskHi = BITCAST(vMaskHi, VectorType::get(mDoubleTy, 4)); + vMaskHi2 = BITCAST(vMaskHi2, VectorType::get(mDoubleTy, 4)); + + Value *vOffsetsLo = VEXTRACTI128(vOffsets, C(0)); + Value *vOffsetsLo2 = VEXTRACTI128(vOffsets2, C(0)); + Value *vOffsetsHi = VEXTRACTI128(vOffsets, C(1)); + Value *vOffsetsHi2 = VEXTRACTI128(vOffsets2, C(1)); + + Value *vZeroDouble = VECTOR_SPLAT(4, ConstantFP::get(IRB()->getDoubleTy(), 0.0f)); + + Value* pGatherLo = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo, vMaskLo, C((char)1)); + Value* pGatherLo2 = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo2, vMaskLo2, C((char)1)); + Value* pGatherHi = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi, vMaskHi, C((char)1)); + Value* pGatherHi2 = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi2, vMaskHi2, C((char)1)); + + pGatherLo = VCVTPD2PS(pGatherLo); + pGatherLo2 = VCVTPD2PS(pGatherLo2); + pGatherHi = VCVTPD2PS(pGatherHi); + pGatherHi2 = VCVTPD2PS(pGatherHi2); + + Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, C({ 0, 1, 2, 3, 4, 5, 6, 7 })); + Value *pGather2 = VSHUFFLE(pGatherLo2, pGatherHi2, C({ 0, 1, 2, 3, 4, 5, 6, 7 })); + + vVertexElements[currentVertexElement] = pGather; + vVertexElements2[currentVertexElement] = pGather2; + + currentVertexElement += 1; + } + else + { + vVertexElements[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], false); + vVertexElements2[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], true); + + currentVertexElement += 1; + } + + if (currentVertexElement > 3) + { + StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements); + StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2); + + outputElt += 1; + + // reset to the next vVertexElement to output + 
currentVertexElement = 0; + } + } + + // offset base to the next component in the vertex to gather + pStreamBase = GEP(pStreamBase, C((char)8)); +#else if (isComponentEnabled(compMask, i)) { // if we need to gather the component @@ -1129,11 +1417,11 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, // reset to the next vVertexElement to output currentVertexElement = 0; } - } // offset base to the next component in the vertex to gather pStreamBase = GEP(pStreamBase, C((char)8)); +#endif } } break; @@ -1180,6 +1468,9 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, // value substituted when component of gather is masked Value* gatherSrc = VIMMED1(0); +#if USE_SIMD16_GATHERS + Value* gatherSrc2 = VIMMED1(0); +#endif // Gather components from memory to store in a simdvertex structure switch (bpc) @@ -1187,8 +1478,24 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, case 8: { // if we have at least one component to fetch - if(compMask) + if (compMask) { +#if USE_SIMD16_GATHERS + Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask, C((char)1)); + Value* vGatherResult2 = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2, C((char)1)); + // e.g. 
result of an 8x32bit integer gather for 8bit components + // 256i - 0 1 2 3 4 5 6 7 + // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw + + Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType, + currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle); + Shuffle8bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), extendCastType, conversionType, + currentVertexElement, outputElt, compMask, compCtrl, vVertexElements2, info.swizzle); + + // Shuffle gathered components into place in simdvertex struct + Shuffle8bpcGatherd(args, false); // outputs to vVertexElements ref + Shuffle8bpcGatherd(args2, true); // outputs to vVertexElements2 ref +#else Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask, C((char)1)); // e.g. result of an 8x32bit integer gather for 8bit components // 256i - 0 1 2 3 4 5 6 7 @@ -1202,12 +1509,63 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Shuffle8bpcGatherd(args, useVertexID2); // outputs to vVertexElements ref #else Shuffle8bpcGatherd(args); // outputs to vVertexElements ref +#endif #endif } } break; case 16: { +#if USE_SIMD16_GATHERS + Value* vGatherResult[2]; + Value *vMask; + Value* vGatherResult2[2]; + Value *vMask2; + + // if we have at least one component out of x or y to fetch + if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)) + { + // save mask as it is zero'd out after each gather + vMask = vGatherMask; + vMask2 = vGatherMask2; + + vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1)); + vGatherResult2[0] = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vMask2, C((char)1)); + // e.g.
result of first 8x32bit integer gather for 16bit components + // 256i - 0 1 2 3 4 5 6 7 + // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy + // + } + + // if we have at least one component out of z or w to fetch + if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) + { + // offset base to the next components(zw) in the vertex to gather + pStreamBase = GEP(pStreamBase, C((char)4)); + vMask = vGatherMask; + vMask2 = vGatherMask2; + + vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1)); + vGatherResult2[1] = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vMask2, C((char)1)); + // e.g. result of second 8x32bit integer gather for 16bit components + // 256i - 0 1 2 3 4 5 6 7 + // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw + // + } + + // if we have at least one component to shuffle into place + if (compMask) + { + Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType, + currentVertexElement, outputElt, compMask, compCtrl, vVertexElements); + Shuffle16bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), extendCastType, conversionType, + currentVertexElement, outputElt, compMask, compCtrl, vVertexElements2); + + // Shuffle gathered components into place in simdvertex struct + Shuffle16bpcGather(args, false); // outputs to vVertexElements ref + Shuffle16bpcGather(args2, true); // outputs to vVertexElements2 ref + } +#else Value* vGatherResult[2]; Value *vMask; @@ -1248,6 +1606,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Shuffle16bpcGather(args); // outputs to vVertexElements ref #endif } +#endif } break; case 32: @@ -1260,6 +1619,38 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, // if we need to gather the component if (compCtrl[i] == StoreSrc) { +#if USE_SIMD16_GATHERS + // save mask as it is zero'd out after each gather + Value *vMask = vGatherMask; + Value *vMask2 = vGatherMask2; + + Value *pGather =
GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1)); + Value *pGather2 = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vMask2, C((char)1)); + + if (conversionType == CONVERT_USCALED) + { + pGather = UI_TO_FP(pGather, mSimdFP32Ty); + pGather2 = UI_TO_FP(pGather2, mSimdFP32Ty); + } + else if (conversionType == CONVERT_SSCALED) + { + pGather = SI_TO_FP(pGather, mSimdFP32Ty); + pGather2 = SI_TO_FP(pGather2, mSimdFP32Ty); + } + else if (conversionType == CONVERT_SFIXED) + { + pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty), VBROADCAST(C(1 / 65536.0f))); + pGather2 = FMUL(SI_TO_FP(pGather2, mSimdFP32Ty), VBROADCAST(C(1 / 65536.0f))); + } + + vVertexElements[currentVertexElement] = pGather; + vVertexElements2[currentVertexElement] = pGather2; + // e.g. result of a single 8x32bit integer gather for 32bit components + // 256i - 0 1 2 3 4 5 6 7 + // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx + + currentVertexElement += 1; +#else // save mask as it is zero'd out after each gather Value *vMask = vGatherMask; @@ -1282,11 +1673,19 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, // e.g. 
result of a single 8x32bit integer gather for 32bit components // 256i - 0 1 2 3 4 5 6 7 // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx +#endif } else { #if USE_SIMD16_SHADERS +#if USE_SIMD16_GATHERS + vVertexElements[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], false); + vVertexElements2[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], true); + + currentVertexElement += 1; +#else vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2); +#endif #else vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); #endif @@ -1294,7 +1693,15 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, if (currentVertexElement > 3) { +#if USE_SIMD16_GATHERS + StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements); + StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2); + + outputElt += 1; +#else StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); +#endif + // reset to the next vVertexElement to output currentVertexElement = 0; } @@ -1311,8 +1718,16 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, } // if we have a partially filled vVertexElement struct, output it - if(currentVertexElement > 0){ + if (currentVertexElement > 0) + { +#if USE_SIMD16_GATHERS + StoreVertexElements(pVtxOut, outputElt, currentVertexElement, vVertexElements); + StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, currentVertexElement, vVertexElements2); + + outputElt += 1; +#else StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements); +#endif } } -- 2.30.2