From: Tim Rowley Date: Wed, 8 Nov 2017 20:07:33 +0000 (-0600) Subject: swr/rast: Simplify GATHER* jit builder api X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=2e244c7168a1130a18c8d8a901161db9b6cbaac3;p=mesa.git swr/rast: Simplify GATHER* jit builder api General cleanup, and prep work for possibly moving to llvm masked gather intrinsic. Reviewed-by: Bruce Cherniak --- diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp index daa9cb1ec11..bd3a52566d6 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp @@ -554,7 +554,7 @@ namespace SwrJit /// @param vIndices - SIMD wide value of VB byte offsets /// @param vMask - SIMD wide mask that controls whether to access memory or the src values /// @param scale - value to scale indices by - Value *Builder::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale) + Value *Builder::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale) { Value* vGather; @@ -563,7 +563,7 @@ namespace SwrJit { // force mask to , required by vgather vMask = BITCAST(vMask, mSimdFP32Ty); - vGather = VGATHERPS(vSrc,pBase,vIndices,vMask,scale); + vGather = VGATHERPS(vSrc,pBase,vIndices,vMask,C(scale)); } else { @@ -574,7 +574,7 @@ namespace SwrJit STORE(vSrc, vSrcPtr); vGather = VUNDEF_F(); - Value *vScaleVec = VBROADCAST(Z_EXT(scale,mInt32Ty)); + Value *vScaleVec = VIMMED1((uint32_t)scale); Value *vOffsets = MUL(vIndices,vScaleVec); Value *mask = MASK(vMask); for(uint32_t i = 0; i < mVWidth; ++i) @@ -606,14 +606,14 @@ namespace SwrJit /// @param vIndices - SIMD wide value of VB byte offsets /// @param vMask - SIMD wide mask that controls whether to access memory or the src values /// @param scale - value to scale indices by - Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale) + Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale) { Value* vGather; // use avx2 gather instruction if available if(JM()->mArch.AVX2()) { - vGather = VGATHERDD(vSrc, pBase, vIndices, vMask, scale); + vGather = VGATHERDD(vSrc, pBase, vIndices, vMask, C(scale)); } else { @@ -624,7 +624,7 @@ namespace SwrJit STORE(vSrc, vSrcPtr); vGather = VUNDEF_I(); - Value *vScaleVec = VBROADCAST(Z_EXT(scale, mInt32Ty)); + Value *vScaleVec = VIMMED1((uint32_t)scale); Value *vOffsets = MUL(vIndices, vScaleVec); Value *mask = MASK(vMask); for(uint32_t i = 0; i < mVWidth; ++i) @@ -656,14 +656,14 @@ namespace SwrJit /// @param vIndices - SIMD wide value of VB byte offsets /// @param vMask - SIMD wide mask that controls whether to access memory or the src values /// @param scale - value to scale indices by - Value *Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale) + Value *Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale) { Value* vGather; // use avx2 gather instruction if available if(JM()->mArch.AVX2()) { - vGather = VGATHERPD(vSrc, pBase, vIndices, vMask, scale); + vGather = VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale)); } else { @@ -674,7 +674,7 @@ namespace SwrJit STORE(vSrc, vSrcPtr); vGather = UndefValue::get(VectorType::get(mDoubleTy, 4)); - Value *vScaleVec = VECTOR_SPLAT(4, Z_EXT(scale,mInt32Ty)); + Value *vScaleVec = VECTOR_SPLAT(4, C((uint32_t)scale)); Value *vOffsets = MUL(vIndices,vScaleVec); Value *mask = MASK(vMask); for(uint32_t i = 0; i < mVWidth/2; ++i) @@ -1016,7 +1016,7 @@ namespace SwrJit // save mask as it is zero'd out after each gather vMask = mask; - vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1)); + vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask); // e.g. result of first 8x32bit integer gather for 16bit components // 256i - 0 1 2 3 4 5 6 7 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy @@ -1029,7 +1029,7 @@ namespace SwrJit pSrcBase = GEP(pSrcBase, C((char)4)); vMask = mask; - vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1)); + vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask); // e.g. result of second 8x32bit integer gather for 16bit components // 256i - 0 1 2 3 4 5 6 7 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw @@ -1060,7 +1060,7 @@ namespace SwrJit Value *vMask = mask; // Gather a SIMD of components - vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1)); + vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask); // offset base to the next component to gather pSrcBase = GEP(pSrcBase, C((char)4)); @@ -1081,7 +1081,7 @@ namespace SwrJit case 8: { Value* vGatherMaskedVal = VIMMED1((int32_t)0); - Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, mask, C((char)1)); + Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, mask); // e.g. result of an 8x32bit integer gather for 8bit components // 256i - 0 1 2 3 4 5 6 7 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw @@ -1102,7 +1102,7 @@ namespace SwrJit // save mask as it is zero'd out after each gather vMask = mask; - vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1)); + vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask); // e.g. result of first 8x32bit integer gather for 16bit components // 256i - 0 1 2 3 4 5 6 7 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy @@ -1115,7 +1115,7 @@ namespace SwrJit pSrcBase = GEP(pSrcBase, C((char)4)); vMask = mask; - vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1)); + vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask); // e.g. result of second 8x32bit integer gather for 16bit components // 256i - 0 1 2 3 4 5 6 7 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw @@ -1147,7 +1147,7 @@ namespace SwrJit Value *vMask = mask; // Gather a SIMD of components - vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1)); + vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask); // offset base to the next component to gather pSrcBase = GEP(pSrcBase, C((char)4)); diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h index d9ff4a21567..9aa24148d38 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h @@ -112,15 +112,15 @@ Value *MASKLOADD(Value* src, Value* mask); void Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets, Value* mask, Value* vGatherComponents[], bool bPackedOutput); -Value *GATHERPS(Value* src, Value* pBase, Value* indices, Value* mask, Value* scale); +Value *GATHERPS(Value* src, Value* pBase, Value* indices, Value* mask, uint8_t scale = 1); void GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets, Value* mask, Value* vGatherComponents[], bool bPackedOutput); -Value *GATHERDD(Value* src, Value* pBase, Value* indices, Value* mask, Value* scale); +Value *GATHERDD(Value* src, Value* pBase, Value* indices, Value* mask, uint8_t scale = 1); void GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets, Value* mask, Value* vGatherComponents[], bool bPackedOutput); -Value *GATHERPD(Value* src, Value* pBase, Value* indices, Value* mask, Value* scale); +Value *GATHERPD(Value* src, Value* pBase, Value* indices, Value* mask, uint8_t scale = 1); void SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask); diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp index 062852e2d22..aa3fca4c358 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp @@ -723,7 +723,7 @@ void FetchJit::CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pB // only works if pixel size is <= 32bits SWR_ASSERT(info.bpp <= 32); - Value* pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask, C((char)1)); + Value* pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask); for (uint32_t comp = 0; comp < 4; ++comp) { @@ -1136,8 +1136,8 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, vMask = vGatherMask; vMask2 = vGatherMask2; - vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1)); - vGatherResult2[0] = GATHERPS(gatherSrc2, pStreamBase, vOffsets2, vMask2, C((char)1)); + vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask); + vGatherResult2[0] = GATHERPS(gatherSrc2, pStreamBase, vOffsets2, vMask2); // e.g. result of first 8x32bit integer gather for 16bit components // 256i - 0 1 2 3 4 5 6 7 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy @@ -1152,8 +1152,8 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, vMask = vGatherMask; vMask2 = vGatherMask2; - vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1)); - vGatherResult2[1] = GATHERPS(gatherSrc2, pStreamBase, vOffsets2, vMask2, C((char)1)); + vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask); + vGatherResult2[1] = GATHERPS(gatherSrc2, pStreamBase, vOffsets2, vMask2); // e.g. result of second 8x32bit integer gather for 16bit components // 256i - 0 1 2 3 4 5 6 7 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw @@ -1182,7 +1182,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, // save mask as it is zero'd out after each gather vMask = vGatherMask; - vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1)); + vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask); // e.g. result of first 8x32bit integer gather for 16bit components // 256i - 0 1 2 3 4 5 6 7 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy @@ -1195,7 +1195,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, pStreamBase = GEP(pStreamBase, C((char)4)); vMask = vGatherMask; - vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1)); + vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask); // e.g. result of second 8x32bit integer gather for 16bit components // 256i - 0 1 2 3 4 5 6 7 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw @@ -1238,8 +1238,8 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, // Right shift the offset by a bit and then scale by 2 to remove the sign extension. Value *vShiftedOffsets = VPSRLI(vOffsets, C(1)); Value *vShiftedOffsets2 = VPSRLI(vOffsets2, C(1)); - vVertexElements[currentVertexElement] = GATHERPS(gatherSrc, pStreamBase, vShiftedOffsets, vMask, C((char)2)); - vVertexElements2[currentVertexElement] = GATHERPS(gatherSrc2, pStreamBase, vShiftedOffsets2, vMask2, C((char)2)); + vVertexElements[currentVertexElement] = GATHERPS(gatherSrc, pStreamBase, vShiftedOffsets, vMask, 2); + vVertexElements2[currentVertexElement] = GATHERPS(gatherSrc2, pStreamBase, vShiftedOffsets2, vMask2, 2); currentVertexElement += 1; } @@ -1298,7 +1298,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, // But, we know that elements must be aligned for FETCH. :) // Right shift the offset by a bit and then scale by 2 to remove the sign extension. Value* vShiftedOffsets = VPSRLI(vOffsets, C(1)); - vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBase, vShiftedOffsets, vMask, C((char)2)); + vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBase, vShiftedOffsets, vMask, 2); } else { @@ -1353,10 +1353,10 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value *vZeroDouble = VECTOR_SPLAT(4, ConstantFP::get(IRB()->getDoubleTy(), 0.0f)); - Value* pGatherLo = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo, vMaskLo, C((char)1)); - Value* pGatherLo2 = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo2, vMaskLo2, C((char)1)); - Value* pGatherHi = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi, vMaskHi, C((char)1)); - Value* pGatherHi2 = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi2, vMaskHi2, C((char)1)); + Value* pGatherLo = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo, vMaskLo); + Value* pGatherLo2 = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo2, vMaskLo2); + Value* pGatherHi = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi, vMaskHi); + Value* pGatherHi2 = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi2, vMaskHi2); pGatherLo = VCVTPD2PS(pGatherLo); pGatherLo2 = VCVTPD2PS(pGatherLo2); @@ -1412,9 +1412,9 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value *vZeroDouble = VECTOR_SPLAT(4, ConstantFP::get(IRB()->getDoubleTy(), 0.0f)); Value* pGatherLo = GATHERPD(vZeroDouble, - pStreamBase, vOffsetsLo, vMaskLo, C((char)1)); + pStreamBase, vOffsetsLo, vMaskLo); Value* pGatherHi = GATHERPD(vZeroDouble, - pStreamBase, vOffsetsHi, vMaskHi, C((char)1)); + pStreamBase, vOffsetsHi, vMaskHi); pGatherLo = VCVTPD2PS(pGatherLo); pGatherHi = VCVTPD2PS(pGatherHi); @@ -1502,8 +1502,8 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, if (compMask) { #if USE_SIMD16_GATHERS - Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask, C((char)1)); - Value* vGatherResult2 = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2, C((char)1)); + Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask); + Value* vGatherResult2 = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2); // e.g. result of an 8x32bit integer gather for 8bit components // 256i - 0 1 2 3 4 5 6 7 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw @@ -1517,7 +1517,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Shuffle8bpcGatherd(args, false); // outputs to vVertexElements ref Shuffle8bpcGatherd(args2, true); // outputs to vVertexElements ref #else - Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask, C((char)1)); + Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask); // e.g. result of an 8x32bit integer gather for 8bit components // 256i - 0 1 2 3 4 5 6 7 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw @@ -1550,8 +1550,8 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, vMask = vGatherMask; vMask2 = vGatherMask2; - vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1)); - vGatherResult2[0] = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vMask2, C((char)1)); + vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask); + vGatherResult2[0] = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vMask2); // e.g. result of first 8x32bit integer gather for 16bit components // 256i - 0 1 2 3 4 5 6 7 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy @@ -1566,8 +1566,8 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, vMask = vGatherMask; vMask2 = vGatherMask2; - vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1)); - vGatherResult2[1] = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vMask2, C((char)1)); + vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask); + vGatherResult2[1] = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vMask2); // e.g. result of second 8x32bit integer gather for 16bit components // 256i - 0 1 2 3 4 5 6 7 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw @@ -1595,7 +1595,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, // save mask as it is zero'd out after each gather vMask = vGatherMask; - vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1)); + vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask); // e.g. result of first 8x32bit integer gather for 16bit components // 256i - 0 1 2 3 4 5 6 7 // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy @@ -1608,7 +1608,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, pStreamBase = GEP(pStreamBase, C((char)4)); vMask = vGatherMask; - vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1)); + vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask); // e.g. result of second 8x32bit integer gather for 16bit components // 256i - 0 1 2 3 4 5 6 7 // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw @@ -1645,8 +1645,8 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value *vMask = vGatherMask; Value *vMask2 = vGatherMask2; - Value *pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1)); - Value *pGather2 = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vMask2, C((char)1)); + Value *pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask); + Value *pGather2 = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vMask2); if (conversionType == CONVERT_USCALED) { @@ -1675,7 +1675,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, // save mask as it is zero'd out after each gather Value *vMask = vGatherMask; - Value* pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1)); + Value* pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask); if (conversionType == CONVERT_USCALED) { diff --git a/src/gallium/drivers/swr/swr_shader.cpp b/src/gallium/drivers/swr/swr_shader.cpp index 732e08dae7b..599dc43698a 100644 --- a/src/gallium/drivers/swr/swr_shader.cpp +++ b/src/gallium/drivers/swr/swr_shader.cpp @@ -1238,7 +1238,7 @@ BuilderSWR::CompileFS(struct swr_context *ctx, swr_jit_fs_key &key) // peform a gather to grab stipple words for each lane Value *vStipple = GATHERDD(VUNDEF_I(), stipplePtr, vYstipple, - VIMMED1(0xffffffff), C((char)4)); + VIMMED1(0xffffffff), 4); // create a mask with one bit corresponding to the x stipple // and AND it with the pattern, to see if we have a bit