From 08512c52de783233fd2292951095e2456da843a4 Mon Sep 17 00:00:00 2001 From: Tim Rowley Date: Thu, 19 Oct 2017 17:33:37 -0500 Subject: [PATCH] swr/rast: Widen fetch shader to SIMD16 Widen fetch shader to SIMD16, enable SIMD16 types in the jitter, and provide EXTRACT/INSERT SIMD8 <-> SIMD16 utility functions. Reviewed-by: Bruce Cherniak --- .../drivers/swr/rasterizer/jitter/builder.cpp | 20 +++++++ .../drivers/swr/rasterizer/jitter/builder.h | 16 ++++++ .../swr/rasterizer/jitter/builder_misc.cpp | 52 +++++++++++++++++ .../swr/rasterizer/jitter/builder_misc.h | 9 +++ .../swr/rasterizer/jitter/fetch_jit.cpp | 57 ++++++++++++++++++- 5 files changed, 151 insertions(+), 3 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp index 6a33ec265fc..4b83a3204cf 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp @@ -41,6 +41,9 @@ namespace SwrJit : mpJitMgr(pJitMgr) { mVWidth = pJitMgr->mVWidth; +#if USE_SIMD16_BUILDER + mVWidth2 = pJitMgr->mVWidth * 2; +#endif mpIRBuilder = &pJitMgr->mBuilder; @@ -65,17 +68,34 @@ namespace SwrJit mSimdFP32Ty = VectorType::get(mFP32Ty, mVWidth); mSimdVectorTy = ArrayType::get(mSimdFP32Ty, 4); mSimdVectorTRTy = ArrayType::get(mSimdFP32Ty, 5); +#if USE_SIMD16_BUILDER + mSimd2Int1Ty = VectorType::get(mInt1Ty, mVWidth2); + mSimd2Int16Ty = VectorType::get(mInt16Ty, mVWidth2); + mSimd2Int32Ty = VectorType::get(mInt32Ty, mVWidth2); + mSimd2Int64Ty = VectorType::get(mInt64Ty, mVWidth2); + mSimd2FP16Ty = VectorType::get(mFP16Ty, mVWidth2); + mSimd2FP32Ty = VectorType::get(mFP32Ty, mVWidth2); + mSimd2VectorTy = ArrayType::get(mSimd2FP32Ty, 4); + mSimd2VectorTRTy = ArrayType::get(mSimd2FP32Ty, 5); +#endif if (sizeof(uint32_t*) == 4) { mIntPtrTy = mInt32Ty; mSimdIntPtrTy = mSimdInt32Ty; +#if USE_SIMD16_BUILDER + mSimd2IntPtrTy = mSimd2Int32Ty; +#endif } else { SWR_ASSERT(sizeof(uint32_t*) == 
8); + mIntPtrTy = mInt64Ty; mSimdIntPtrTy = mSimdInt64Ty; +#if USE_SIMD16_BUILDER + mSimd2IntPtrTy = mSimd2Int64Ty; +#endif } } } diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.h b/src/gallium/drivers/swr/rasterizer/jitter/builder.h index 8210e49b185..c6ab64e06e8 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder.h +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.h @@ -32,6 +32,8 @@ #include "JitManager.h" #include "common/formats.h" +#define USE_SIMD16_BUILDER 0 + namespace SwrJit { using namespace llvm; @@ -45,6 +47,9 @@ namespace SwrJit IRBuilder<>* mpIRBuilder; uint32_t mVWidth; +#if USE_SIMD16_BUILDER + uint32_t mVWidth2; +#endif // Built in types. Type* mVoidTy; @@ -70,6 +75,17 @@ namespace SwrJit Type* mSimdIntPtrTy; Type* mSimdVectorTy; Type* mSimdVectorTRTy; +#if USE_SIMD16_BUILDER + Type* mSimd2FP16Ty; + Type* mSimd2FP32Ty; + Type* mSimd2Int1Ty; + Type* mSimd2Int16Ty; + Type* mSimd2Int32Ty; + Type* mSimd2Int64Ty; + Type* mSimd2IntPtrTy; + Type* mSimd2VectorTy; + Type* mSimd2VectorTRTy; +#endif #include "gen_builder.hpp" #include "gen_builder_x86.hpp" diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp index 9ca36b24671..daa9cb1ec11 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp @@ -231,6 +231,13 @@ namespace SwrJit return UndefValue::get(VectorType::get(mFP32Ty, mVWidth)); } +#if USE_SIMD16_BUILDER + Value *Builder::VUNDEF2_F() + { + return UndefValue::get(VectorType::get(mFP32Ty, mVWidth2)); + } + +#endif Value *Builder::VUNDEF(Type* t) { return UndefValue::get(VectorType::get(t, mVWidth)); @@ -690,6 +697,51 @@ namespace SwrJit return vGather; } +#if USE_SIMD16_BUILDER + ////////////////////////////////////////////////////////////////////////// + /// @brief + Value *Builder::EXTRACT(Value *a2, uint32_t imm) + { + const uint32_t i0 = (imm > 0) ? 
mVWidth : 0; + + Value *result = VUNDEF_F(); + + for (uint32_t i = 0; i < mVWidth; i += 1) + { + Value *temp = VEXTRACT(a2, C(i0 + i)); + + result = VINSERT(result, temp, C(i)); + } + + return result; + } + + ////////////////////////////////////////////////////////////////////////// + /// @brief + Value *Builder::INSERT(Value *a2, Value * b, uint32_t imm) + { + const uint32_t i0 = (imm > 0) ? mVWidth : 0; + + Value *result = BITCAST(a2, mSimd2FP32Ty); + + for (uint32_t i = 0; i < mVWidth; i += 1) + { +#if 1 + if (!b->getType()->getScalarType()->isFloatTy()) + { + b = BITCAST(b, mSimdFP32Ty); + } + +#endif + Value *temp = VEXTRACT(b, C(i)); + + result = VINSERT(result, temp, C(i0 + i)); + } + + return result; + } + +#endif ////////////////////////////////////////////////////////////////////////// /// @brief convert x86 mask to llvm mask Value* Builder::MASK(Value* vmask) diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h index 662574d638f..d9ff4a21567 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h @@ -56,6 +56,9 @@ Value *VIMMED1(float i); Value *VIMMED1(bool i); Value *VUNDEF(Type* t); Value *VUNDEF_F(); +#if USE_SIMD16_BUILDER +Value *VUNDEF2_F(); +#endif Value *VUNDEF_I(); Value *VUNDEF(Type* ty, uint32_t size); Value *VUNDEF_IPTR(); @@ -98,6 +101,12 @@ Value *VMASK(Value* mask); /// @brief functions that build IR to call x86 intrinsics directly, or /// emulate them with other instructions if not available on the host ////////////////////////////////////////////////////////////////////////// + +#if USE_SIMD16_BUILDER +Value *EXTRACT(Value *a, uint32_t imm); +Value *INSERT(Value *a, Value *b, uint32_t imm); + +#endif Value *MASKLOADD(Value* src, Value* mask); void Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets, diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp 
b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp index 30dbcfc8ce1..062852e2d22 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp @@ -80,6 +80,9 @@ struct FetchJit : public Builder #endif void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]); +#if USE_SIMD16_BUILDER + void StoreVertexElements2(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]); +#endif #if USE_SIMD16_SHADERS Value* GenerateCompCtrlVector(const ComponentControl ctrl, bool useVertexID2); @@ -137,8 +140,8 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState) // GEP pVtxOut = GEP(pVtxOut, C(0)); #if USE_SIMD16_SHADERS -#if 0 - pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth * 2), 0)); +#if 0// USE_SIMD16_BUILDER + pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0)); #else pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0)); #endif @@ -1250,9 +1253,27 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, if (currentVertexElement > 3) { +#if USE_SIMD16_BUILDER + Value *pVtxSrc2[4]; + + // pack adjacent pairs of SIMD8s into SIMD16s + for (uint32_t i = 0; i < 4; i += 1) + { + pVtxSrc2[i] = VUNDEF2_F(); + + pVtxSrc2[i] = INSERT(pVtxSrc2[i], vVertexElements[i], 0); + pVtxSrc2[i] = INSERT(pVtxSrc2[i], vVertexElements2[i], 1); + } + + // store SIMD16s + Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0)); + StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2); + +#else StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements); StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2); +#endif outputElt += 1; // reset to the next vVertexElement to output @@ -2312,7 +2333,8 @@ void FetchJit::StoreVertexElements(Value* 
pVtxOut, const uint32_t outputElt, con for(uint32_t c = 0; c < numEltsToStore; ++c) { // STORE expects FP32 x vWidth type, just bitcast if needed - if(!vVertexElements[c]->getType()->getScalarType()->isFloatTy()){ + if(!vVertexElements[c]->getType()->getScalarType()->isFloatTy()) + { #if FETCH_DUMP_VERTEX PRINT("vVertexElements[%d]: 0x%x\n", {C(c), vVertexElements[c]}); #endif @@ -2335,6 +2357,35 @@ void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, con } } +#if USE_SIMD16_BUILDER +void FetchJit::StoreVertexElements2(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]) +{ + SWR_ASSERT(numEltsToStore <= 4, "Invalid element count."); + + for (uint32_t c = 0; c < numEltsToStore; ++c) + { + // STORE expects FP32 x vWidth type, just bitcast if needed + if (!vVertexElements[c]->getType()->getScalarType()->isFloatTy()) + { +#if FETCH_DUMP_VERTEX + PRINT("vVertexElements[%d]: 0x%x\n", { C(c), vVertexElements[c] }); +#endif + vVertexElements[c] = BITCAST(vVertexElements[c], mSimd2FP32Ty); + } +#if FETCH_DUMP_VERTEX + else + { + PRINT("vVertexElements[%d]: %f\n", { C(c), vVertexElements[c] }); + } +#endif + // outputElt * 4 = offsetting by the size of a simdvertex + // + c offsets to a 32bit x vWidth row within the current vertex + Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP"); + STORE(vVertexElements[c], dest); + } +} + +#endif ////////////////////////////////////////////////////////////////////////// /// @brief Generates a constant vector of values based on the /// ComponentControl value -- 2.30.2