From: Tim Rowley
Date: Mon, 11 Dec 2017 05:54:30 +0000 (-0600)
Subject: swr/rast: SIMD16 Fetch - Fully widen 32-bit integer vertex components
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=01a57c11cb7fe85196b9cb4b5a1555e6eb239297;p=mesa.git

swr/rast: SIMD16 Fetch - Fully widen 32-bit integer vertex components

Also widen the 16-bit and 8-bit integer vertex component gathers to SIMD16.

Reviewed-by: Bruce Cherniak
---

diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
index ac8b3badf6d..8bbf36d9b83 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
+++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
@@ -46,6 +46,7 @@ intrinsics = [
     ['VGATHERPS', 'x86_avx2_gather_d_ps_256', ['src', 'pBase', 'indices', 'mask', 'scale']],
     ['VGATHERPS_16', 'x86_avx512_gather_dps_512', ['src', 'pBase', 'indices', 'mask', 'scale']],
     ['VGATHERDD', 'x86_avx2_gather_d_d_256', ['src', 'pBase', 'indices', 'mask', 'scale']],
+    ['VGATHERDD_16', 'x86_avx512_gather_dpi_512', ['src', 'pBase', 'indices', 'mask', 'scale']],
     ['VPSRLI', 'x86_avx2_psrli_d', ['src', 'imm']],
     ['VPSRLI_16', 'x86_avx512_psrli_d_512', ['src', 'imm']],
     ['VSQRTPS', 'x86_avx_sqrt_ps_256', ['a']],
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
index 3a486e4c1ea..684c9fac549 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -723,6 +723,42 @@ namespace SwrJit
         return vGather;
     }
 
+#if USE_SIMD16_BUILDER
+    Value *Builder::GATHERDD_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale)
+    {
+        Value *vGather = VUNDEF2_F();
+
+        // use avx512 gather instruction if available
+        if (JM()->mArch.AVX512F())
+        {
+            // force mask to <N x i1>, required by vgather2
+            Value *mask = BITCAST(vMask, mInt16Ty);
+
+            vGather = VGATHERDD_16(vSrc, pBase, vIndices, mask, C((uint32_t)scale));
+        }
+        else
+        {
+            Value *src0 = EXTRACT2_F(vSrc, 0);
+            Value *src1 = EXTRACT2_F(vSrc, 1);
+
+            Value *indices0 = EXTRACT2_I(vIndices, 0);
+            Value *indices1 = EXTRACT2_I(vIndices, 1);
+
+            Value *vmask16 = VMASK2(vMask);
+
+            Value *mask0 = MASK(EXTRACT2_I(vmask16, 0));  // TODO: do this better..
+            Value *mask1 = MASK(EXTRACT2_I(vmask16, 1));
+
+            Value *gather0 = GATHERDD(src0, pBase, indices0, mask0, scale);
+            Value *gather1 = GATHERDD(src1, pBase, indices1, mask1, scale);
+
+            vGather = JOIN2(gather0, gather1);
+        }
+
+        return vGather;
+    }
+
+#endif
     //////////////////////////////////////////////////////////////////////////
     /// @brief Generate a masked gather operation in LLVM IR.  If not
     /// supported on the underlying platform, emulate it with loads
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
index 231bd6ad857..6c883d8f52b 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
@@ -135,6 +135,9 @@ void GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
                Value* mask, Value* vGatherComponents[], bool bPackedOutput);
 
 Value *GATHERDD(Value* src, Value* pBase, Value* indices, Value* mask, uint8_t scale = 1);
+#if USE_SIMD16_BUILDER
+Value *GATHERDD_16(Value *src, Value *pBase, Value *indices, Value *mask, uint8_t scale = 1);
+#endif
 
 void GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
                Value* mask, Value* vGatherComponents[], bool bPackedOutput);
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index e0a0770560c..ec3b5eafccd 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -1349,14 +1349,6 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
             if (compMask)
             {
 #if USE_SIMD16_BUILDER
-#if USE_SIMD16_BUILDER
-#else
-                Value *gatherResult[2];
-
-                gatherResult[0] = JOIN2(vGatherResult[0], vGatherResult2[0]);
-                gatherResult[1] = JOIN2(vGatherResult[1], vGatherResult2[1]);
-
-#endif
                 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
 
                 Shuffle16bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, Instruction::CastOps::FPExt, CONVERT_NONE,
@@ -1701,6 +1693,9 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
         Value* gatherSrc = VIMMED1(0);
 #if USE_SIMD16_GATHERS
         Value* gatherSrc2 = VIMMED1(0);
+#if USE_SIMD16_BUILDER
+        Value *gatherSrc16 = VIMMED2_1(0);
+#endif
 #endif
 
         // Gather components from memory to store in a simdvertex structure
@@ -1712,6 +1707,14 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
             if (compMask)
             {
 #if USE_SIMD16_GATHERS
+#if USE_SIMD16_BUILDER
+                Value *gatherResult = GATHERDD_16(gatherSrc16, pStreamBase, vOffsets16, vGatherMask16);
+
+                // e.g. result of an 8x32bit integer gather for 8bit components
+                // 256i - 0    1    2    3    4    5    6    7
+                //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
+
+#else
                 Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
                 Value* vGatherResult2 = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2);
 
@@ -1719,9 +1722,8 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
                 // 256i - 0    1    2    3    4    5    6    7
                 //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
 
+#endif
 #if USE_SIMD16_BUILDER
-                Value *gatherResult = JOIN2(vGatherResult, vGatherResult2);
-
                 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
 
                 Shuffle8bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, extendCastType, conversionType,
@@ -1761,6 +1763,43 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
         case 16:
         {
 #if USE_SIMD16_GATHERS
+#if USE_SIMD16_BUILDER
+            Value* gatherResult[2];
+
+            // if we have at least one component out of x or y to fetch
+            if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
+            {
+                gatherResult[0] = GATHERDD_16(gatherSrc16, pStreamBase, vOffsets16, vGatherMask16);
+
+                // e.g. result of first 8x32bit integer gather for 16bit components
+                // 256i - 0    1    2    3    4    5    6    7
+                //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
+                //
+            }
+            else
+            {
+                gatherResult[0] = VUNDEF2_I();
+            }
+
+            // if we have at least one component out of z or w to fetch
+            if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
+            {
+                // offset base to the next components(zw) in the vertex to gather
+                pStreamBase = GEP(pStreamBase, C((char)4));
+
+                gatherResult[1] = GATHERDD_16(gatherSrc16, pStreamBase, vOffsets16, vGatherMask16);
+
+                // e.g. result of second 8x32bit integer gather for 16bit components
+                // 256i - 0    1    2    3    4    5    6    7
+                //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
+                //
+            }
+            else
+            {
+                gatherResult[1] = VUNDEF2_I();
+            }
+
+#else
             Value* vGatherResult[2];
             Value* vGatherResult2[2];
 
@@ -1799,15 +1838,11 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
                 vGatherResult2[1] = VUNDEF_I();
             }
 
+#endif
             // if we have at least one component to shuffle into place
             if (compMask)
             {
 #if USE_SIMD16_BUILDER
-                Value *gatherResult[2];
-
-                gatherResult[0] = JOIN2(vGatherResult[0], vGatherResult2[0]);
-                gatherResult[1] = JOIN2(vGatherResult[1], vGatherResult2[1]);
-
                 Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
 
                 Shuffle16bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, extendCastType, conversionType,
@@ -1876,6 +1911,23 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
             if (compCtrl[i] == StoreSrc)
             {
 #if USE_SIMD16_GATHERS
+#if USE_SIMD16_BUILDER
+                Value *pGather = GATHERDD_16(gatherSrc16, pStreamBase, vOffsets16, vGatherMask16);
+
+                if (conversionType == CONVERT_USCALED)
+                {
+                    pGather = UI_TO_FP(pGather, mSimd2FP32Ty);
+                }
+                else if (conversionType == CONVERT_SSCALED)
+                {
+                    pGather = SI_TO_FP(pGather, mSimd2FP32Ty);
+                }
+                else if (conversionType == CONVERT_SFIXED)
+                {
+                    pGather = FMUL(SI_TO_FP(pGather, mSimd2FP32Ty), VBROADCAST2(C(1 / 65536.0f)));
+                }
+
+#else
                 Value *pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
                 Value *pGather2 = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2);
 
@@ -1895,9 +1947,9 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
                     pGather2 = FMUL(SI_TO_FP(pGather2, mSimdFP32Ty), VBROADCAST(C(1 / 65536.0f)));
                 }
 
+#endif
 #if USE_SIMD16_BUILDER
-                // pack adjacent pairs of SIMD8s into SIMD16s
-                pVtxSrc2[currentVertexElement] = JOIN2(pGather, pGather2);
+                pVtxSrc2[currentVertexElement] = pGather;
 #else
                 vVertexElements[currentVertexElement] = pGather;
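
Note (illustrative, not part of the commit): GATHERDD_16 gathers one 32-bit integer per SIMD16 lane from pBase + index * scale, and keeps the corresponding lane of the source operand for lanes whose mask bit is clear. The standalone scalar sketch below models that per-lane behavior under those assumptions; gatherdd_16_model and simd16_i32 are made-up names for the sketch, not SWR or LLVM API.

#include <array>
#include <cstddef>
#include <cstdint>
#include <cstring>

// One 16-wide vector of 32-bit integers, modeled as a plain array.
using simd16_i32 = std::array<int32_t, 16>;

// Scalar model of the gather: for each lane, load a dword from
// pBase + indices[lane] * scale when the lane's mask bit is set,
// otherwise keep that lane of the source operand (merge semantics).
static simd16_i32 gatherdd_16_model(const simd16_i32 &src,
                                    const uint8_t    *pBase,
                                    const simd16_i32 &indices,
                                    uint16_t          mask,   // one bit per lane, like the i16 mask of the AVX512 intrinsic
                                    uint8_t           scale)
{
    simd16_i32 result = src;
    for (std::size_t lane = 0; lane < result.size(); ++lane)
    {
        if (mask & (1u << lane))
        {
            int32_t value;
            std::memcpy(&value, pBase + int64_t(indices[lane]) * scale, sizeof(value));  // unaligned-safe load
            result[lane] = value;
        }
    }
    return result;
}

On AVX512F hardware the whole operation maps to a single 512-bit gather via the new VGATHERDD_16 intrinsic wrapper; otherwise the builder splits the SIMD16 operands into two SIMD8 halves, issues two GATHERDD calls, and joins the results, as the #else path added to builder_misc.cpp above shows. The CONVERT_SFIXED case in fetch_jit.cpp then treats the gathered integers as 16.16 fixed point by converting to float and multiplying by 1/65536.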