X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fdrivers%2Fswr%2Frasterizer%2Fjitter%2Fbuilder_misc.cpp;h=530752850c611802978fa37942136c7ef14e6b4f;hb=639605e5ba947bb947313a6584ef7fbb8619e9c2;hp=3a486e4c1eae81e794fcdf59cff0206c1a0237e6;hpb=fa3105cdb54415d7b93be932351966d3108511e4;p=mesa.git diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp index 3a486e4c1ea..530752850c6 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp @@ -1,41 +1,42 @@ /**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file builder_misc.cpp -* -* @brief Implementation for miscellaneous builder functions -* -* Notes: -* -******************************************************************************/ + * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ * + * @file builder_misc.cpp + * + * @brief Implementation for miscellaneous builder functions + * + * Notes: + * + ******************************************************************************/ +#include "jit_pch.hpp" #include "builder.h" #include "common/rdtsc_buckets.h" #include +extern "C" void CallPrint(const char* fmt, ...); + namespace SwrJit { - void __cdecl CallPrint(const char* fmt, ...); - ////////////////////////////////////////////////////////////////////////// /// @brief Convert an IEEE 754 32-bit single precision float to an /// 16 bit float with 5 exponent bits and a variable @@ -49,25 +50,25 @@ namespace SwrJit // Extract the sign, exponent, and mantissa uint32_t uf = *(uint32_t*)&val; - sign = (uf & 0x80000000) >> 31; - exp = (uf & 0x7F800000) >> 23; - mant = uf & 0x007FFFFF; + sign = (uf & 0x80000000) >> 31; + exp = (uf & 0x7F800000) >> 23; + mant = uf & 0x007FFFFF; // Check for out of range if (std::isnan(val)) { - exp = 0x1F; + exp = 0x1F; mant = 0x200; - sign = 1; // set the sign bit for NANs + sign = 1; // set the sign bit for NANs } else if (std::isinf(val)) { - exp = 0x1f; + exp = 0x1f; mant = 0x0; } else if (exp > (0x70 + 0x1E)) // Too big to represent -> max representable value { - exp = 0x1E; + exp = 0x1E; mant = 0x3FF; } else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm @@ -75,12 +76,12 @@ namespace SwrJit mant |= 0x00800000; for (; exp <= 0x70; mant >>= 1, exp++) ; - exp = 0; + exp = 0; mant = mant >> 13; } else if (exp < 0x66) // Too small to represent -> Zero { - exp = 0; + exp = 0; mant = 0; } else @@ -88,7 +89,7 @@ namespace SwrJit // Saves bits that will be shifted off for rounding roundBits = mant & 0x1FFFu; // convert exponent and mantissa to 16 bit format - exp = exp - 0x70; + exp = exp - 0x70; mant = mant >> 13; // Essentially RTZ, but round up if off by only 1 lsb @@ -107,170 +108,137 @@ namespace SwrJit return (uint16_t)tmpVal; } - ////////////////////////////////////////////////////////////////////////// - /// @brief Convert an IEEE 754 16-bit float to an 32-bit single precision - /// float - /// @param val - 16-bit float - /// @todo Maybe move this outside of this file into a header? - static float ConvertFloat16ToFloat32(uint32_t val) - { - uint32_t result; - if ((val & 0x7fff) == 0) - { - result = ((uint32_t)(val & 0x8000)) << 16; - } - else if ((val & 0x7c00) == 0x7c00) - { - result = ((val & 0x3ff) == 0) ? 0x7f800000 : 0x7fc00000; - result |= ((uint32_t)val & 0x8000) << 16; - } - else - { - uint32_t sign = (val & 0x8000) << 16; - uint32_t mant = (val & 0x3ff) << 13; - uint32_t exp = (val >> 10) & 0x1f; - if ((exp == 0) && (mant != 0)) // Adjust exponent and mantissa for denormals - { - mant <<= 1; - while (mant < (0x400 << 13)) - { - exp--; - mant <<= 1; - } - mant &= (0x3ff << 13); - } - exp = ((exp - 15 + 127) & 0xff) << 23; - result = sign | exp | mant; - } + Constant* Builder::C(bool i) { return ConstantInt::get(IRB()->getInt1Ty(), (i ? 1 : 0)); } - return *(float*)&result; - } + Constant* Builder::C(char i) { return ConstantInt::get(IRB()->getInt8Ty(), i); } - Constant *Builder::C(bool i) - { - return ConstantInt::get(IRB()->getInt1Ty(), (i ? 
1 : 0));
-    }
+    Constant* Builder::C(char i) { return ConstantInt::get(IRB()->getInt8Ty(), i); }
 
-    Constant *Builder::C(char i)
-    {
-        return ConstantInt::get(IRB()->getInt8Ty(), i);
-    }
+    Constant* Builder::C(uint8_t i) { return ConstantInt::get(IRB()->getInt8Ty(), i); }
 
-    Constant *Builder::C(uint8_t i)
-    {
-        return ConstantInt::get(IRB()->getInt8Ty(), i);
-    }
+    Constant* Builder::C(int i) { return ConstantInt::get(IRB()->getInt32Ty(), i); }
 
-    Constant *Builder::C(int i)
-    {
-        return ConstantInt::get(IRB()->getInt32Ty(), i);
-    }
+    Constant* Builder::C(int64_t i) { return ConstantInt::get(IRB()->getInt64Ty(), i); }
 
-    Constant *Builder::C(int64_t i)
-    {
-        return ConstantInt::get(IRB()->getInt64Ty(), i);
-    }
+    Constant* Builder::C(uint16_t i) { return ConstantInt::get(mInt16Ty, i); }
 
-    Constant *Builder::C(uint16_t i)
-    {
-        return ConstantInt::get(mInt16Ty,i);
-    }
+    Constant* Builder::C(uint32_t i) { return ConstantInt::get(IRB()->getInt32Ty(), i); }
 
-    Constant *Builder::C(uint32_t i)
-    {
-        return ConstantInt::get(IRB()->getInt32Ty(), i);
-    }
+    Constant* Builder::C(uint64_t i) { return ConstantInt::get(IRB()->getInt64Ty(), i); }
 
-    Constant *Builder::C(float i)
+    Constant* Builder::C(float i) { return ConstantFP::get(IRB()->getFloatTy(), i); }
+
+    Constant* Builder::PRED(bool pred)
     {
-        return ConstantFP::get(IRB()->getFloatTy(), i);
+        return ConstantInt::get(IRB()->getInt1Ty(), (pred ? 1 : 0));
     }
 
-    Constant *Builder::PRED(bool pred)
+    Value* Builder::VIMMED1(uint64_t i)
     {
-        return ConstantInt::get(IRB()->getInt1Ty(), (pred ? 1 : 0));
+#if LLVM_VERSION_MAJOR > 10
+        return ConstantVector::getSplat(ElementCount::get(mVWidth, false), cast<Constant>(C(i)));
+#else
+        return ConstantVector::getSplat(mVWidth, cast<Constant>(C(i)));
+#endif
     }
 
-    Value *Builder::VIMMED1(int i)
+    Value* Builder::VIMMED1_16(uint64_t i)
     {
-        return ConstantVector::getSplat(mVWidth, cast<Constant>(C(i)));
+#if LLVM_VERSION_MAJOR > 10
+        return ConstantVector::getSplat(ElementCount::get(mVWidth16, false), cast<Constant>(C(i)));
+#else
+        return ConstantVector::getSplat(mVWidth16, cast<Constant>(C(i)));
+#endif
     }
 
-    Value *Builder::VIMMED1(uint32_t i)
+    Value* Builder::VIMMED1(int i)
     {
+#if LLVM_VERSION_MAJOR > 10
+        return ConstantVector::getSplat(ElementCount::get(mVWidth, false), cast<Constant>(C(i)));
+#else
         return ConstantVector::getSplat(mVWidth, cast<Constant>(C(i)));
+#endif
     }
 
-    Value *Builder::VIMMED1(float i)
+    Value* Builder::VIMMED1_16(int i)
     {
-        return ConstantVector::getSplat(mVWidth, cast<Constant>(C(i)));
+#if LLVM_VERSION_MAJOR > 10
+        return ConstantVector::getSplat(ElementCount::get(mVWidth16, false), cast<Constant>(C(i)));
+#else
+        return ConstantVector::getSplat(mVWidth16, cast<Constant>(C(i)));
+#endif
     }
 
-    Value *Builder::VIMMED1(bool i)
+    Value* Builder::VIMMED1(uint32_t i)
     {
+#if LLVM_VERSION_MAJOR > 10
+        return ConstantVector::getSplat(ElementCount::get(mVWidth, false), cast<Constant>(C(i)));
+#else
         return ConstantVector::getSplat(mVWidth, cast<Constant>(C(i)));
+#endif
     }
 
-#if USE_SIMD16_BUILDER
-    Value *Builder::VIMMED2_1(int i)
+    Value* Builder::VIMMED1_16(uint32_t i)
     {
-        return ConstantVector::getSplat(mVWidth2, cast<Constant>(C(i)));
+#if LLVM_VERSION_MAJOR > 10
+        return ConstantVector::getSplat(ElementCount::get(mVWidth16, false), cast<Constant>(C(i)));
+#else
+        return ConstantVector::getSplat(mVWidth16, cast<Constant>(C(i)));
+#endif
     }
 
-    Value *Builder::VIMMED2_1(uint32_t i)
+    Value* Builder::VIMMED1(float i)
     {
-        return ConstantVector::getSplat(mVWidth2, cast<Constant>(C(i)));
+#if LLVM_VERSION_MAJOR > 10
+        return ConstantVector::getSplat(ElementCount::get(mVWidth, false), cast<Constant>(C(i)));
+#else
+        return ConstantVector::getSplat(mVWidth, cast<Constant>(C(i)));
+#endif
     }
 
-    Value *Builder::VIMMED2_1(float i)
+    Value* Builder::VIMMED1_16(float i)
     {
-        return ConstantVector::getSplat(mVWidth2, cast<Constant>(C(i)));
+#if LLVM_VERSION_MAJOR > 10
+        return ConstantVector::getSplat(ElementCount::get(mVWidth16, false), cast<Constant>(C(i)));
+#else
+        return ConstantVector::getSplat(mVWidth16, cast<Constant>(C(i)));
+#endif
    }
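
The #if guards above exist because LLVM 11 changed ConstantVector::getSplat() to take an ElementCount instead of a plain lane count. A minimal free-standing sketch of the same splat pattern, outside the Builder class (SplatFloat and its parameters are illustrative names, not part of this patch):

#include <llvm/Config/llvm-config.h>
#include <llvm/IR/Constants.h>
#include <llvm/IR/DerivedTypes.h>
#include <llvm/IR/Type.h>

// Build e.g. <8 x float> <1.0, 1.0, ...>, the way VIMMED1(float) does above.
static llvm::Constant* SplatFloat(llvm::LLVMContext& ctx, float v, unsigned lanes)
{
    llvm::Constant* scalar = llvm::ConstantFP::get(llvm::Type::getFloatTy(ctx), v);
#if LLVM_VERSION_MAJOR > 10
    // LLVM 11+: the lane count is an ElementCount; false = fixed width, not scalable.
    return llvm::ConstantVector::getSplat(llvm::ElementCount::get(lanes, false), scalar);
#else
    return llvm::ConstantVector::getSplat(lanes, scalar);
#endif
}
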
 
-    Value *Builder::VIMMED2_1(bool i)
+    Value* Builder::VIMMED1(bool i)
     {
-        return ConstantVector::getSplat(mVWidth2, cast<Constant>(C(i)));
-    }
-
+#if LLVM_VERSION_MAJOR > 10
+        return ConstantVector::getSplat(ElementCount::get(mVWidth, false), cast<Constant>(C(i)));
+#else
+        return ConstantVector::getSplat(mVWidth, cast<Constant>(C(i)));
 #endif
-    Value *Builder::VUNDEF_IPTR()
-    {
-        return UndefValue::get(VectorType::get(mInt32PtrTy,mVWidth));
     }
 
-    Value *Builder::VUNDEF_I()
+    Value* Builder::VIMMED1_16(bool i)
     {
-        return UndefValue::get(VectorType::get(mInt32Ty, mVWidth));
+#if LLVM_VERSION_MAJOR > 10
+        return ConstantVector::getSplat(ElementCount::get(mVWidth16, false), cast<Constant>(C(i)));
+#else
+        return ConstantVector::getSplat(mVWidth16, cast<Constant>(C(i)));
+#endif
     }
 
-    Value *Builder::VUNDEF(Type *ty, uint32_t size)
-    {
-        return UndefValue::get(VectorType::get(ty, size));
-    }
+    Value* Builder::VUNDEF_IPTR() { return UndefValue::get(getVectorType(mInt32PtrTy, mVWidth)); }
 
-    Value *Builder::VUNDEF_F()
-    {
-        return UndefValue::get(VectorType::get(mFP32Ty, mVWidth));
-    }
+    Value* Builder::VUNDEF(Type* t) { return UndefValue::get(getVectorType(t, mVWidth)); }
 
-#if USE_SIMD16_BUILDER
-    Value *Builder::VUNDEF2_F()
-    {
-        return UndefValue::get(VectorType::get(mFP32Ty, mVWidth2));
-    }
+    Value* Builder::VUNDEF_I() { return UndefValue::get(getVectorType(mInt32Ty, mVWidth)); }
 
-    Value *Builder::VUNDEF2_I()
-    {
-        return UndefValue::get(VectorType::get(mInt32Ty, mVWidth2));
-    }
+    Value* Builder::VUNDEF_I_16() { return UndefValue::get(getVectorType(mInt32Ty, mVWidth16)); }
 
-#endif
-    Value *Builder::VUNDEF(Type* t)
+    Value* Builder::VUNDEF_F() { return UndefValue::get(getVectorType(mFP32Ty, mVWidth)); }
+
+    Value* Builder::VUNDEF_F_16() { return UndefValue::get(getVectorType(mFP32Ty, mVWidth16)); }
+
+    Value* Builder::VUNDEF(Type* ty, uint32_t size)
     {
-        return UndefValue::get(VectorType::get(t, mVWidth));
+        return UndefValue::get(getVectorType(ty, size));
     }
 
-    Value *Builder::VBROADCAST(Value *src)
+    Value* Builder::VBROADCAST(Value* src, const llvm::Twine& name)
     {
         // check if src is already a vector
         if (src->getType()->isVectorTy())
@@ -278,11 +246,10 @@ namespace SwrJit
             return src;
         }
 
-        return VECTOR_SPLAT(mVWidth, src);
+        return VECTOR_SPLAT(mVWidth, src, name);
     }
 
-#if USE_SIMD16_BUILDER
-    Value *Builder::VBROADCAST2(Value *src)
+    Value* Builder::VBROADCAST_16(Value* src)
     {
         // check if src is already a vector
         if (src->getType()->isVectorTy())
@@ -290,167 +257,89 @@ namespace SwrJit
             return src;
         }
 
-        return VECTOR_SPLAT(mVWidth2, src);
+        return VECTOR_SPLAT(mVWidth16, src);
     }
-#endif
 
     uint32_t Builder::IMMED(Value* v)
     {
         SWR_ASSERT(isa<ConstantInt>(v));
-        ConstantInt *pValConst = cast<ConstantInt>(v);
+        ConstantInt* pValConst = cast<ConstantInt>(v);
         return pValConst->getZExtValue();
     }
 
     int32_t Builder::S_IMMED(Value* v)
     {
         SWR_ASSERT(isa<ConstantInt>(v));
-        ConstantInt *pValConst = cast<ConstantInt>(v);
+        ConstantInt* pValConst = cast<ConstantInt>(v);
         return pValConst->getSExtValue();
     }
 
-    Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
-    {
-        std::vector<Value*> indices;
-        for (auto i : indexList)
-            indices.push_back(i);
-        return GEPA(ptr, indices);
-    }
 
-    Value *Builder::GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
-    {
-        std::vector<Value*> indices;
-        for (auto i : indexList)
-            indices.push_back(C(i));
-        return GEPA(ptr, indices);
-    }
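
The GEP helpers removed above (and the IN_BOUNDS_GEP, LOAD and STORE variants that follow) are thin sugar over LLVM's IRBuilder: they materialize an initializer_list into the index vector CreateGEP() expects. A free-standing sketch of the pattern (GepConstIndices is an illustrative name, and this assumes an LLVM version whose CreateGEP takes the pointee type explicitly):

#include <llvm/IR/IRBuilder.h>
#include <cstdint>
#include <initializer_list>
#include <vector>

// Equivalent of the removed GEP(ptr, {0, 1}): a GEP built from constant indices.
static llvm::Value* GepConstIndices(llvm::IRBuilder<>& irb, llvm::Type* pointeeTy,
                                    llvm::Value* ptr,
                                    std::initializer_list<uint32_t> indexList)
{
    std::vector<llvm::Value*> indices;
    for (uint32_t i : indexList)
        indices.push_back(irb.getInt32(i)); // each index becomes an i32 constant
    return irb.CreateGEP(pointeeTy, ptr, indices);
}
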
-    Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
-    {
-        std::vector<Value*> indices;
-        for (auto i : indexList)
-            indices.push_back(i);
-        return IN_BOUNDS_GEP(ptr, indices);
-    }
 
-    Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
-    {
-        std::vector<Value*> indices;
-        for (auto i : indexList)
-            indices.push_back(C(i));
-        return IN_BOUNDS_GEP(ptr, indices);
-    }
 
-    LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name)
-    {
-        std::vector<Value*> valIndices;
-        for (auto i : indices)
-            valIndices.push_back(C(i));
-        return LOAD(GEPA(basePtr, valIndices), name);
-    }
 
-    LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list<Value*> &indices, const llvm::Twine& name)
-    {
-        std::vector<Value*> valIndices;
-        for (auto i : indices)
-            valIndices.push_back(i);
-        return LOAD(GEPA(basePtr, valIndices), name);
-    }
 
-    StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list<uint32_t> &indices)
-    {
-        std::vector<Value*> valIndices;
-        for (auto i : indices)
-            valIndices.push_back(C(i));
-        return STORE(val, GEPA(basePtr, valIndices));
-    }
 
-    StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list<Value*> &indices)
-    {
-        std::vector<Value*> valIndices;
-        for (auto i : indices)
-            valIndices.push_back(i);
-        return STORE(val, GEPA(basePtr, valIndices));
-    }
 
-    CallInst *Builder::CALL(Value *Callee, const std::initializer_list<Value*> &argsList)
+    CallInst* Builder::CALL(Value* Callee,
+                            const std::initializer_list<Value*>& argsList,
+                            const llvm::Twine& name)
     {
         std::vector<Value*> args;
         for (auto arg : argsList)
             args.push_back(arg);
-        return CALLA(Callee, args);
+#if LLVM_VERSION_MAJOR >= 11
+        // see comment to CALLA(Callee) function in the header
+        return CALLA(FunctionCallee(cast<Function>(Callee)), args, name);
+#else
+        return CALLA(Callee, args, name);
+#endif
     }
 
-    CallInst *Builder::CALL(Value *Callee, Value* arg)
+    CallInst* Builder::CALL(Value* Callee, Value* arg)
     {
         std::vector<Value*> args;
         args.push_back(arg);
+#if LLVM_VERSION_MAJOR >= 11
+        // see comment to CALLA(Callee) function in the header
+        return CALLA(FunctionCallee(cast<Function>(Callee)), args);
+#else
         return CALLA(Callee, args);
+#endif
     }
 
-    CallInst *Builder::CALL2(Value *Callee, Value* arg1, Value* arg2)
+    CallInst* Builder::CALL2(Value* Callee, Value* arg1, Value* arg2)
    {
         std::vector<Value*> args;
         args.push_back(arg1);
         args.push_back(arg2);
+#if LLVM_VERSION_MAJOR >= 11
+        // see comment to CALLA(Callee) function in the header
+        return CALLA(FunctionCallee(cast<Function>(Callee)), args);
+#else
         return CALLA(Callee, args);
+#endif
     }
 
-    CallInst *Builder::CALL3(Value *Callee, Value* arg1, Value* arg2, Value* arg3)
+    CallInst* Builder::CALL3(Value* Callee, Value* arg1, Value* arg2, Value* arg3)
     {
         std::vector<Value*> args;
         args.push_back(arg1);
         args.push_back(arg2);
         args.push_back(arg3);
+#if LLVM_VERSION_MAJOR >= 11
+        // see comment to CALLA(Callee) function in the header
+        return CALLA(FunctionCallee(cast<Function>(Callee)), args);
+#else
         return CALLA(Callee, args);
+#endif
     }
 
-    //////////////////////////////////////////////////////////////////////////
-    Value *Builder::DEBUGTRAP()
-    {
-        Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::debugtrap);
-        return CALL(func);
-    }
-
-    Value *Builder::VRCP(Value *va)
+    Value* Builder::VRCP(Value* va, const llvm::Twine& name)
     {
-        return FDIV(VIMMED1(1.0f), va);  // 1 / a
+        return FDIV(VIMMED1(1.0f), va, name); // 1 / a
     }
 
-    Value *Builder::VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY)
+    Value* Builder::VPLANEPS(Value* vA, Value* vB, Value* vC, Value*& vX, Value*& vY)
     {
         Value* 
vOut = FMADDPS(vA, vX, vC); - vOut = FMADDPS(vB, vY, vOut); + vOut = FMADDPS(vB, vY, vOut); return vOut; } - ////////////////////////////////////////////////////////////////////////// - /// @brief Generate an i32 masked load operation in LLVM IR. If not - /// supported on the underlying platform, emulate it with float masked load - /// @param src - base address pointer for the load - /// @param vMask - SIMD wide mask that controls whether to access memory load 0 - Value *Builder::MASKLOADD(Value* src,Value* mask) - { - Value* vResult; - // use avx2 gather instruction is available - if(JM()->mArch.AVX2()) - { - Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256); - vResult = CALL(func,{src,mask}); - } - else - { - // maskload intrinsic expects integer mask operand in llvm >= 3.8 - #if (LLVM_VERSION_MAJOR > 3) || (LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8) - mask = BITCAST(mask,VectorType::get(mInt32Ty,mVWidth)); - #else - mask = BITCAST(mask,VectorType::get(mFP32Ty,mVWidth)); - #endif - Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule,Intrinsic::x86_avx_maskload_ps_256); - vResult = BITCAST(CALL(func,{src,mask}), VectorType::get(mInt32Ty,mVWidth)); - } - return vResult; - } - ////////////////////////////////////////////////////////////////////////// /// @brief insert a JIT call to CallPrint /// - outputs formatted string to both stdout and VS output window @@ -461,7 +350,8 @@ namespace SwrJit /// result from a GEP, printing out the pointer to memory /// @param printStr - constant string to print, which includes format specifiers /// @param printArgs - initializer list of Value*'s to print to std out - CallInst *Builder::PRINT(const std::string &printStr,const std::initializer_list &printArgs) + CallInst* Builder::PRINT(const std::string& printStr, + const std::initializer_list& printArgs) { // push the arguments to CallPrint into a vector std::vector printCallArgs; @@ -469,23 +359,25 @@ namespace SwrJit printCallArgs.resize(1); // search through the format string for special processing - size_t pos = 0; + size_t pos = 0; std::string tempStr(printStr); - pos = tempStr.find('%', pos); + pos = tempStr.find('%', pos); auto v = printArgs.begin(); while ((pos != std::string::npos) && (v != printArgs.end())) { - Value* pArg = *v; - Type* pType = pArg->getType(); + Value* pArg = *v; + Type* pType = pArg->getType(); if (pType->isVectorTy()) { Type* pContainedType = pType->getContainedType(0); - +#if LLVM_VERSION_MAJOR >= 11 + VectorType* pVectorType = cast(pType); +#endif if (toupper(tempStr[pos + 1]) == 'X') { - tempStr[pos] = '0'; + tempStr[pos] = '0'; tempStr[pos + 1] = 'x'; tempStr.insert(pos + 2, "%08X "); pos += 7; @@ -493,7 +385,11 @@ namespace SwrJit printCallArgs.push_back(VEXTRACT(pArg, C(0))); std::string vectorFormatStr; +#if LLVM_VERSION_MAJOR >= 11 + for (uint32_t i = 1; i < pVectorType->getNumElements(); ++i) +#else for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i) +#endif { vectorFormatStr += "0x%08X "; printCallArgs.push_back(VEXTRACT(pArg, C(i))); @@ -505,24 +401,53 @@ namespace SwrJit else if ((tempStr[pos + 1] == 'f') && (pContainedType->isFloatTy())) { uint32_t i = 0; - for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++) +#if LLVM_VERSION_MAJOR >= 11 + for (; i < pVectorType->getNumElements() - 1; i++) +#else + for (; i < pType->getVectorNumElements() - 1; i++) +#endif { tempStr.insert(pos, std::string("%f ")); pos += 3; - printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), 
Type::getDoubleTy(JM()->mContext))); + printCallArgs.push_back( + FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext))); } - printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext))); + printCallArgs.push_back( + FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext))); } else if ((tempStr[pos + 1] == 'd') && (pContainedType->isIntegerTy())) { uint32_t i = 0; - for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++) +#if LLVM_VERSION_MAJOR >= 11 + for (; i < pVectorType->getNumElements() - 1; i++) +#else + for (; i < pType->getVectorNumElements() - 1; i++) +#endif { tempStr.insert(pos, std::string("%d ")); pos += 3; - printCallArgs.push_back(VEXTRACT(pArg, C(i))); + printCallArgs.push_back( + S_EXT(VEXTRACT(pArg, C(i)), Type::getInt32Ty(JM()->mContext))); } - printCallArgs.push_back(VEXTRACT(pArg, C(i))); + printCallArgs.push_back( + S_EXT(VEXTRACT(pArg, C(i)), Type::getInt32Ty(JM()->mContext))); + } + else if ((tempStr[pos + 1] == 'u') && (pContainedType->isIntegerTy())) + { + uint32_t i = 0; +#if LLVM_VERSION_MAJOR >= 11 + for (; i < pVectorType->getNumElements() - 1; i++) +#else + for (; i < pType->getVectorNumElements() - 1; i++) +#endif + { + tempStr.insert(pos, std::string("%d ")); + pos += 3; + printCallArgs.push_back( + Z_EXT(VEXTRACT(pArg, C(i)), Type::getInt32Ty(JM()->mContext))); + } + printCallArgs.push_back( + Z_EXT(VEXTRACT(pArg, C(i)), Type::getInt32Ty(JM()->mContext))); } } else @@ -552,366 +477,155 @@ namespace SwrJit } // create global variable constant string - Constant *constString = ConstantDataArray::getString(JM()->mContext,tempStr,true); - GlobalVariable *gvPtr = new GlobalVariable(constString->getType(),true,GlobalValue::InternalLinkage,constString,"printStr"); + Constant* constString = ConstantDataArray::getString(JM()->mContext, tempStr, true); + GlobalVariable* gvPtr = new GlobalVariable( + constString->getType(), true, GlobalValue::InternalLinkage, constString, "printStr"); JM()->mpCurrentModule->getGlobalList().push_back(gvPtr); // get a pointer to the first character in the constant string array - std::vector geplist{C(0),C(0)}; - Constant *strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr,geplist,false); + std::vector geplist{C(0), C(0)}; + Constant* strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr, geplist, false); // insert the pointer to the format string in the argument vector printCallArgs[0] = strGEP; // get pointer to CallPrint function and insert decl into the module if needed std::vector args; - args.push_back(PointerType::get(mInt8Ty,0)); - FunctionType* callPrintTy = FunctionType::get(Type::getVoidTy(JM()->mContext),args,true); - Function *callPrintFn = cast(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy)); + args.push_back(PointerType::get(mInt8Ty, 0)); + FunctionType* callPrintTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, true); + Function* callPrintFn = +#if LLVM_VERSION_MAJOR >= 9 + cast(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy).getCallee()); +#else + cast(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy)); +#endif // if we haven't yet added the symbol to the symbol table - if((sys::DynamicLibrary::SearchForAddressOfSymbol("CallPrint")) == nullptr) + if ((sys::DynamicLibrary::SearchForAddressOfSymbol("CallPrint")) == nullptr) { - sys::DynamicLibrary::AddSymbol("CallPrint", (void *)&CallPrint); + sys::DynamicLibrary::AddSymbol("CallPrint", (void*)&CallPrint); } // insert a call to CallPrint - 
return CALLA(callPrintFn,printCallArgs); + return CALLA(callPrintFn, printCallArgs); } ////////////////////////////////////////////////////////////////////////// /// @brief Wrapper around PRINT with initializer list. - CallInst* Builder::PRINT(const std::string &printStr) - { - return PRINT(printStr, {}); - } + CallInst* Builder::PRINT(const std::string& printStr) { return PRINT(printStr, {}); } - ////////////////////////////////////////////////////////////////////////// - /// @brief Generate a masked gather operation in LLVM IR. If not - /// supported on the underlying platform, emulate it with loads - /// @param vSrc - SIMD wide value that will be loaded if mask is invalid - /// @param pBase - Int8* base VB address pointer value - /// @param vIndices - SIMD wide value of VB byte offsets - /// @param vMask - SIMD wide mask that controls whether to access memory or the src values - /// @param scale - value to scale indices by - Value *Builder::GATHERPS(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale) + Value* Builder::EXTRACT_16(Value* x, uint32_t imm) { - Value *vGather; - - // use avx2 gather instruction if available - if(JM()->mArch.AVX2()) + if (imm == 0) { - // force mask to , required by vgather - Value *mask = BITCAST(VMASK(vMask), mSimdFP32Ty); - - vGather = VGATHERPS(vSrc, pBase, vIndices, mask, C(scale)); + return VSHUFFLE(x, UndefValue::get(x->getType()), {0, 1, 2, 3, 4, 5, 6, 7}); } else { - Value* pStack = STACKSAVE(); - - // store vSrc on the stack. this way we can select between a valid load address and the vSrc address - Value* vSrcPtr = ALLOCA(vSrc->getType()); - STORE(vSrc, vSrcPtr); - - vGather = VUNDEF_F(); - Value *vScaleVec = VIMMED1((uint32_t)scale); - Value *vOffsets = MUL(vIndices,vScaleVec); - for(uint32_t i = 0; i < mVWidth; ++i) - { - // single component byte index - Value *offset = VEXTRACT(vOffsets,C(i)); - // byte pointer to component - Value *loadAddress = GEP(pBase,offset); - loadAddress = BITCAST(loadAddress,PointerType::get(mFP32Ty,0)); - // pointer to the value to load if we're masking off a component - Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)}); - Value *selMask = VEXTRACT(vMask,C(i)); - // switch in a safe address to load if we're trying to access a vertex - Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress); - Value *val = LOAD(validAddress); - vGather = VINSERT(vGather,val,C(i)); - } - STACKRESTORE(pStack); + return VSHUFFLE(x, UndefValue::get(x->getType()), {8, 9, 10, 11, 12, 13, 14, 15}); } - - return vGather; } -#if USE_SIMD16_BUILDER - Value *Builder::GATHERPS_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale) + Value* Builder::JOIN_16(Value* a, Value* b) { - Value *vGather = VUNDEF2_F(); - - // use avx512 gather instruction if available - if (JM()->mArch.AVX512F()) - { - // force mask to , required by vgather2 - Value *mask = BITCAST(vMask, mInt16Ty); - - vGather = VGATHERPS_16(vSrc, pBase, vIndices, mask, C((uint32_t)scale)); - } - else - { - Value *src0 = EXTRACT2_F(vSrc, 0); - Value *src1 = EXTRACT2_F(vSrc, 1); - - Value *indices0 = EXTRACT2_I(vIndices, 0); - Value *indices1 = EXTRACT2_I(vIndices, 1); - - Value *vmask16 = VMASK2(vMask); - - Value *mask0 = MASK(EXTRACT2_I(vmask16, 0)); // TODO: do this better.. 
- Value *mask1 = MASK(EXTRACT2_I(vmask16, 1)); - - Value *gather0 = GATHERPS(src0, pBase, indices0, mask0, scale); - Value *gather1 = GATHERPS(src1, pBase, indices1, mask1, scale); - - vGather = JOIN2(gather0, gather1); - } - - return vGather; + return VSHUFFLE(a, b, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); } -#endif ////////////////////////////////////////////////////////////////////////// - /// @brief Generate a masked gather operation in LLVM IR. If not - /// supported on the underlying platform, emulate it with loads - /// @param vSrc - SIMD wide value that will be loaded if mask is invalid - /// @param pBase - Int8* base VB address pointer value - /// @param vIndices - SIMD wide value of VB byte offsets - /// @param vMask - SIMD wide mask that controls whether to access memory or the src values - /// @param scale - value to scale indices by - Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale) + /// @brief convert x86 mask to llvm mask + Value* Builder::MASK(Value* vmask) { - Value* vGather; - - // use avx2 gather instruction if available - if(JM()->mArch.AVX2()) - { - vGather = VGATHERDD(vSrc, pBase, vIndices, VMASK(vMask), C(scale)); - } - else - { - Value* pStack = STACKSAVE(); - - // store vSrc on the stack. this way we can select between a valid load address and the vSrc address - Value* vSrcPtr = ALLOCA(vSrc->getType()); - STORE(vSrc, vSrcPtr); - - vGather = VUNDEF_I(); - Value *vScaleVec = VIMMED1((uint32_t)scale); - Value *vOffsets = MUL(vIndices, vScaleVec); - for(uint32_t i = 0; i < mVWidth; ++i) - { - // single component byte index - Value *offset = VEXTRACT(vOffsets, C(i)); - // byte pointer to component - Value *loadAddress = GEP(pBase, offset); - loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 0)); - // pointer to the value to load if we're masking off a component - Value *maskLoadAddress = GEP(vSrcPtr, {C(0), C(i)}); - Value *selMask = VEXTRACT(vMask, C(i)); - // switch in a safe address to load if we're trying to access a vertex - Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress); - Value *val = LOAD(validAddress, C(0)); - vGather = VINSERT(vGather, val, C(i)); - } - - STACKRESTORE(pStack); - } - return vGather; + Value* src = BITCAST(vmask, mSimdInt32Ty); + return ICMP_SLT(src, VIMMED1(0)); } - ////////////////////////////////////////////////////////////////////////// - /// @brief Generate a masked gather operation in LLVM IR. If not - /// supported on the underlying platform, emulate it with loads - /// @param vSrc - SIMD wide value that will be loaded if mask is invalid - /// @param pBase - Int8* base VB address pointer value - /// @param vIndices - SIMD wide value of VB byte offsets - /// @param vMask - SIMD wide mask that controls whether to access memory or the src values - /// @param scale - value to scale indices by - Value *Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale) + Value* Builder::MASK_16(Value* vmask) { - Value* vGather; - - // use avx2 gather instruction if available - if(JM()->mArch.AVX2()) - { - vMask = BITCAST(S_EXT(vMask, VectorType::get(mInt64Ty, mVWidth/2)), VectorType::get(mDoubleTy, mVWidth/2)); - vGather = VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale)); - } - else - { - Value* pStack = STACKSAVE(); - - // store vSrc on the stack. 
this way we can select between a valid load address and the vSrc address - Value* vSrcPtr = ALLOCA(vSrc->getType()); - STORE(vSrc, vSrcPtr); - - vGather = UndefValue::get(VectorType::get(mDoubleTy, 4)); - Value *vScaleVec = VECTOR_SPLAT(4, C((uint32_t)scale)); - Value *vOffsets = MUL(vIndices,vScaleVec); - for(uint32_t i = 0; i < mVWidth/2; ++i) - { - // single component byte index - Value *offset = VEXTRACT(vOffsets,C(i)); - // byte pointer to component - Value *loadAddress = GEP(pBase,offset); - loadAddress = BITCAST(loadAddress,PointerType::get(mDoubleTy,0)); - // pointer to the value to load if we're masking off a component - Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)}); - Value *selMask = VEXTRACT(vMask,C(i)); - // switch in a safe address to load if we're trying to access a vertex - Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress); - Value *val = LOAD(validAddress); - vGather = VINSERT(vGather,val,C(i)); - } - STACKRESTORE(pStack); - } - return vGather; + Value* src = BITCAST(vmask, mSimd16Int32Ty); + return ICMP_SLT(src, VIMMED1_16(0)); } -#if USE_SIMD16_BUILDER - Value *Builder::PSRLI(Value *a, Value *imm) - { - return VPSRLI(a, imm); - } + ////////////////////////////////////////////////////////////////////////// + /// @brief convert llvm mask to x86 mask + Value* Builder::VMASK(Value* mask) { return S_EXT(mask, mSimdInt32Ty); } - Value *Builder::PSRLI_16(Value *a, Value *imm) - { - Value *result = VUNDEF2_I(); + Value* Builder::VMASK_16(Value* mask) { return S_EXT(mask, mSimd16Int32Ty); } - // use avx512 shift right instruction if available - if (JM()->mArch.AVX512F()) + /// @brief Convert llvm mask to integer + Value* Builder::VMOVMSK(Value* mask) + { +#if LLVM_VERSION_MAJOR >= 11 + VectorType* pVectorType = cast(mask->getType()); + SWR_ASSERT(pVectorType->getElementType() == mInt1Ty); + uint32_t numLanes = pVectorType->getNumElements(); +#else + SWR_ASSERT(mask->getType()->getVectorElementType() == mInt1Ty); + uint32_t numLanes = mask->getType()->getVectorNumElements(); +#endif + Value* i32Result; + if (numLanes == 8) { - result = VPSRLI_16(a, imm); + i32Result = BITCAST(mask, mInt8Ty); } - else + else if (numLanes == 16) { - Value *a0 = EXTRACT2_I(a, 0); - Value *a1 = EXTRACT2_I(a, 1); - - Value *result0 = PSRLI(a0, imm); - Value *result1 = PSRLI(a1, imm); - - result = JOIN2(result0, result1); + i32Result = BITCAST(mask, mInt16Ty); } - - return result; - } - -#endif -#if USE_SIMD16_BUILDER - ////////////////////////////////////////////////////////////////////////// - /// @brief - Value *Builder::EXTRACT2_F(Value *a2, uint32_t imm) - { - const uint32_t i0 = (imm > 0) ? 
mVWidth : 0;
-
-        Value *result = VUNDEF_F();
-
-        for (uint32_t i = 0; i < mVWidth; i += 1)
-        {
-#if 1
-            if (!a2->getType()->getScalarType()->isFloatTy())
-            {
-                a2 = BITCAST(a2, mSimd2FP32Ty);
-            }
-
-#endif
-            Value *temp = VEXTRACT(a2, C(i0 + i));
-
-            result = VINSERT(result, temp, C(i));
-        }
-
-        return result;
-    }
-
-    Value *Builder::EXTRACT2_I(Value *a2, uint32_t imm)
-    {
-        return BITCAST(EXTRACT2_F(a2, imm), mSimdInt32Ty);
-    }
-
-    Value *Builder::JOIN2(Value *a, Value *b)
-    {
-        return VSHUFFLE(a, b,
-            {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
-    }
-#endif
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief convert x86 mask to llvm mask
-    Value *Builder::MASK(Value *vmask)
-    {
-        Value *src = BITCAST(vmask, mSimdInt32Ty);
-        return ICMP_SLT(src, VIMMED1(0));
-    }
-
-#if USE_SIMD16_BUILDER
-    Value *Builder::MASK2(Value *vmask)
-    {
-        Value *src = BITCAST(vmask, mSimd2Int32Ty);
-        return ICMP_SLT(src, VIMMED2_1(0));
-    }
-
-#endif
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief convert llvm mask to x86 mask
-    Value *Builder::VMASK(Value *mask)
-    {
-        return S_EXT(mask, mSimdInt32Ty);
-    }
-
-#if USE_SIMD16_BUILDER
-    Value *Builder::VMASK2(Value *mask)
-    {
-        return S_EXT(mask, mSimd2Int32Ty);
-    }
-
-#endif
     //////////////////////////////////////////////////////////////////////////
-    /// @brief Generate a VPSHUFB operation in LLVM IR.  If not
+    /// @brief Generate a VPSHUFB operation in LLVM IR. If not
     ///        supported on the underlying platform, emulate it
     /// @param a - 256bit SIMD(32x8bit) of 8bit integer values
     /// @param b - 256bit SIMD(32x8bit) of 8bit integer mask values
-    ///        Byte masks in lower 128 lane of b selects 8 bit values from lower
-    ///        128bits of a, and vice versa for the upper lanes.  If the mask
+    ///        Byte masks in lower 128 lane of b selects 8 bit values from lower
+    ///        128bits of a, and vice versa for the upper lanes. If the mask
     ///        value is negative, '0' is inserted.
-    Value *Builder::PSHUFB(Value* a, Value* b)
+    Value* Builder::PSHUFB(Value* a, Value* b)
     {
         Value* res;
         // use avx2 pshufb instruction if available
-        if(JM()->mArch.AVX2())
+        if (JM()->mArch.AVX2())
         {
             res = VPSHUFB(a, b);
         }
         else
         {
             Constant* cB = dyn_cast<Constant>(b);
+            assert(cB != nullptr);
             // number of 8 bit elements in b
             uint32_t numElms = cast<VectorType>(cB->getType())->getNumElements();
             // output vector
-            Value* vShuf = UndefValue::get(VectorType::get(mInt8Ty, numElms));
+            Value* vShuf = UndefValue::get(getVectorType(mInt8Ty, numElms));
 
             // insert an 8 bit value from the high and low lanes of a per loop iteration
             numElms /= 2;
-            for(uint32_t i = 0; i < numElms; i++)
+            for (uint32_t i = 0; i < numElms; i++)
             {
-                ConstantInt* cLow128b = cast<ConstantInt>(cB->getAggregateElement(i));
+                ConstantInt* cLow128b  = cast<ConstantInt>(cB->getAggregateElement(i));
                 ConstantInt* cHigh128b = cast<ConstantInt>(cB->getAggregateElement(i + numElms));
 
                 // extract values from constant mask
-                char valLow128bLane = (char)(cLow128b->getSExtValue());
+                char valLow128bLane  = (char)(cLow128b->getSExtValue());
                 char valHigh128bLane = (char)(cHigh128b->getSExtValue());
 
                 Value* insertValLow128b;
                 Value* insertValHigh128b;
 
                 // if the mask value is negative, insert a '0' in the respective output position
-                // otherwise, lookup the value at mask position (bits 3..0 of the respective mask byte) in a and insert in output vector
-                insertValLow128b = (valLow128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valLow128bLane & 0xF)));
-                insertValHigh128b = (valHigh128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valHigh128bLane & 0xF) + numElms));
+                // otherwise, lookup the value at mask position (bits 3..0 of the respective mask
+                // byte) in a and insert in output vector
+                insertValLow128b =
+                    (valLow128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valLow128bLane & 0xF)));
+                insertValHigh128b = (valHigh128bLane < 0)
+                                        ? C((char)0)
+                                        : VEXTRACT(a, C((valHigh128bLane & 0xF) + numElms));
 
                 vShuf = VINSERT(vShuf, insertValLow128b, i);
                 vShuf = VINSERT(vShuf, insertValHigh128b, (i + numElms));
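
PSHUFB's closing lines fall between these two hunks. The emulated path above reproduces the hardware VPSHUFB semantics: every output byte is either zero (negative mask byte) or a byte selected from its own 128-bit half of a. A scalar reference model, useful for sanity-checking the JIT path (a sketch, not part of the patch):

#include <cstdint>

// Scalar model of 256-bit VPSHUFB: each 128-bit half shuffles independently.
static void Pshufb256(const int8_t a[32], const int8_t b[32], int8_t out[32])
{
    for (int half = 0; half < 2; ++half)
    {
        const int8_t* src = a + half * 16;
        for (int i = 0; i < 16; ++i)
        {
            int8_t m = b[half * 16 + i];
            // A negative mask byte inserts 0; otherwise bits 3..0 pick the source byte.
            out[half * 16 + i] = (m < 0) ? 0 : src[m & 0xF];
        }
    }
}
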
@@ -922,137 +636,52 @@ namespace SwrJit
     }
 
     //////////////////////////////////////////////////////////////////////////
-    /// @brief Generate a VPSHUFB operation (sign extend 8 8bit values to 32 
+    /// @brief Generate a VPSHUFB operation (sign extend 8 8bit values to 32
     ///        bits)in LLVM IR. If not supported on the underlying platform, emulate it
-    /// @param a - 128bit SIMD lane(16x8bit) of 8bit integer values.  Only
+    /// @param a - 128bit SIMD lane(16x8bit) of 8bit integer values. Only
     ///        lower 8 values are used.
-    Value *Builder::PMOVSXBD(Value* a)
+    Value* Builder::PMOVSXBD(Value* a)
     {
         // VPMOVSXBD output type
-        Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
+        Type* v8x32Ty = getVectorType(mInt32Ty, 8);
 
         // Extract 8 values from 128bit lane and sign extend
         return S_EXT(VSHUFFLE(a, a, C({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
     }
 
     //////////////////////////////////////////////////////////////////////////
-    /// @brief Generate a VPSHUFB operation (sign extend 8 16bit values to 32 
+    /// @brief Generate a VPSHUFB operation (sign extend 8 16bit values to 32
     ///        bits)in LLVM IR. If not supported on the underlying platform, emulate it
     /// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values.
-    Value *Builder::PMOVSXWD(Value* a)
+    Value* Builder::PMOVSXWD(Value* a)
     {
         // VPMOVSXWD output type
-        Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
+        Type* v8x32Ty = getVectorType(mInt32Ty, 8);
 
         // Extract 8 values from 128bit lane and sign extend
         return S_EXT(VSHUFFLE(a, a, C({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
     }
 
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Generate a VPERMD operation (shuffle 32 bit integer values
-    /// across 128 bit lanes) in LLVM IR. If not supported on the underlying
-    /// platform, emulate it
-    /// @param a - 256bit SIMD lane(8x32bit) of integer values.
-    /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
-    Value *Builder::PERMD(Value* a, Value* idx)
-    {
-        Value* res;
-        // use avx2 permute instruction if available
-        if(JM()->mArch.AVX2())
-        {
-            res = VPERMD(a, idx);
-        }
-        else
-        {
-            if (isa<Constant>(idx))
-            {
-                res = VSHUFFLE(a, a, idx);
-            }
-            else
-            {
-                res = VUNDEF_I();
-                for (uint32_t l = 0; l < JM()->mVWidth; ++l)
-                {
-                    Value* pIndex = VEXTRACT(idx, C(l));
-                    Value* pVal = VEXTRACT(a, pIndex);
-                    res = VINSERT(res, pVal, C(l));
-                }
-            }
-        }
-        return res;
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Generate a VPERMPS operation (shuffle 32 bit float values
-    /// across 128 bit lanes) in LLVM IR. If not supported on the underlying
-    /// platform, emulate it
-    /// @param a - 256bit SIMD lane(8x32bit) of float values.
- /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values - Value *Builder::PERMPS(Value* a, Value* idx) - { - Value* res; - // use avx2 permute instruction if available - if (JM()->mArch.AVX2()) - { - // llvm 3.6.0 swapped the order of the args to vpermd - res = VPERMPS(idx, a); - } - else - { - if (isa(idx)) - { - res = VSHUFFLE(a, a, idx); - } - else - { - res = VUNDEF_F(); - for (uint32_t l = 0; l < JM()->mVWidth; ++l) - { - Value* pIndex = VEXTRACT(idx, C(l)); - Value* pVal = VEXTRACT(a, pIndex); - res = VINSERT(res, pVal, C(l)); - } - } - } - - return res; - } - ////////////////////////////////////////////////////////////////////////// /// @brief Generate a VCVTPH2PS operation (float16->float32 conversion) /// in LLVM IR. If not supported on the underlying platform, emulate it /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format. - Value *Builder::CVTPH2PS(Value* a) + Value* Builder::CVTPH2PS(Value* a, const llvm::Twine& name) { - if (JM()->mArch.F16C()) - { - return VCVTPH2PS(a); - } - else - { - FunctionType* pFuncTy = FunctionType::get(mFP32Ty, mInt16Ty); - Function* pCvtPh2Ps = cast(JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat16ToFloat32", pFuncTy)); - - if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertFloat16ToFloat32") == nullptr) - { - sys::DynamicLibrary::AddSymbol("ConvertFloat16ToFloat32", (void *)&ConvertFloat16ToFloat32); - } - - Value* pResult = UndefValue::get(mSimdFP32Ty); - for (uint32_t i = 0; i < mVWidth; ++i) - { - Value* pSrc = VEXTRACT(a, C(i)); - Value* pConv = CALL(pCvtPh2Ps, std::initializer_list{pSrc}); - pResult = VINSERT(pResult, pConv, C(i)); - } + // Bitcast Nxint16 to Nxhalf +#if LLVM_VERSION_MAJOR >= 11 + uint32_t numElems = cast(a->getType())->getNumElements(); +#else + uint32_t numElems = a->getType()->getVectorNumElements(); +#endif + Value* input = BITCAST(a, getVectorType(mFP16Ty, numElems)); - return pResult; - } + return FP_EXT(input, getVectorType(mFP32Ty, numElems), name); } ////////////////////////////////////////////////////////////////////////// /// @brief Generate a VCVTPS2PH operation (float32->float16 conversion) /// in LLVM IR. If not supported on the underlying platform, emulate it /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format. 
- Value *Builder::CVTPS2PH(Value* a, Value* rounding) + Value* Builder::CVTPS2PH(Value* a, Value* rounding) { if (JM()->mArch.F16C()) { @@ -1061,631 +690,302 @@ namespace SwrJit else { // call scalar C function for now - FunctionType* pFuncTy = FunctionType::get(mInt16Ty, mFP32Ty); - Function* pCvtPs2Ph = cast(JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat32ToFloat16", pFuncTy)); + FunctionType* pFuncTy = FunctionType::get(mInt16Ty, mFP32Ty); + Function* pCvtPs2Ph = cast( +#if LLVM_VERSION_MAJOR >= 9 + JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat32ToFloat16", pFuncTy).getCallee()); +#else + JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat32ToFloat16", pFuncTy)); +#endif if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertFloat32ToFloat16") == nullptr) { - sys::DynamicLibrary::AddSymbol("ConvertFloat32ToFloat16", (void *)&ConvertFloat32ToFloat16); + sys::DynamicLibrary::AddSymbol("ConvertFloat32ToFloat16", + (void*)&ConvertFloat32ToFloat16); } Value* pResult = UndefValue::get(mSimdInt16Ty); for (uint32_t i = 0; i < mVWidth; ++i) { - Value* pSrc = VEXTRACT(a, C(i)); + Value* pSrc = VEXTRACT(a, C(i)); Value* pConv = CALL(pCvtPs2Ph, std::initializer_list{pSrc}); - pResult = VINSERT(pResult, pConv, C(i)); + pResult = VINSERT(pResult, pConv, C(i)); } return pResult; } } - Value *Builder::PMAXSD(Value* a, Value* b) + Value* Builder::PMAXSD(Value* a, Value* b) { Value* cmp = ICMP_SGT(a, b); return SELECT(cmp, a, b); } - Value *Builder::PMINSD(Value* a, Value* b) + Value* Builder::PMINSD(Value* a, Value* b) { Value* cmp = ICMP_SLT(a, b); return SELECT(cmp, a, b); } - void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets, - Value* mask, Value* vGatherComponents[], bool bPackedOutput) - { - const SWR_FORMAT_INFO &info = GetFormatInfo(format); - if(info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32) - { - GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput); - } - else - { - GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput); - } - } - - void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets, - Value* vMask, Value* vGatherComponents[], bool bPackedOutput) - { - switch(info.bpp / info.numComps) - { - case 16: - { - Value* vGatherResult[2]; - - // TODO: vGatherMaskedVal - Value* vGatherMaskedVal = VIMMED1((float)0); - - // always have at least one component out of x or y to fetch - - vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask); - // e.g. result of first 8x32bit integer gather for 16bit components - // 256i - 0 1 2 3 4 5 6 7 - // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy - // - - // if we have at least one component out of x or y to fetch - if(info.numComps > 2) - { - // offset base to the next components(zw) in the vertex to gather - pSrcBase = GEP(pSrcBase, C((char)4)); - - vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask); - // e.g. 
result of second 8x32bit integer gather for 16bit components - // 256i - 0 1 2 3 4 5 6 7 - // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw - // - } - else - { - vGatherResult[1] = vGatherMaskedVal; - } - - // Shuffle gathered components into place, each row is a component - Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput); - } - break; - case 32: - { - // apply defaults - for (uint32_t i = 0; i < 4; ++i) - { - vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]); - } - - for(uint32_t i = 0; i < info.numComps; i++) - { - uint32_t swizzleIndex = info.swizzle[i]; - - // Gather a SIMD of components - vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask); - - // offset base to the next component to gather - pSrcBase = GEP(pSrcBase, C((char)4)); - } - } - break; - default: - SWR_INVALID("Invalid float format"); - break; - } - } - - void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets, - Value* vMask, Value* vGatherComponents[], bool bPackedOutput) - { - switch (info.bpp / info.numComps) - { - case 8: - { - Value* vGatherMaskedVal = VIMMED1((int32_t)0); - Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask); - // e.g. result of an 8x32bit integer gather for 8bit components - // 256i - 0 1 2 3 4 5 6 7 - // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw - - Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput); - } - break; - case 16: - { - Value* vGatherResult[2]; - - // TODO: vGatherMaskedVal - Value* vGatherMaskedVal = VIMMED1((int32_t)0); - - // always have at least one component out of x or y to fetch - - vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask); - // e.g. result of first 8x32bit integer gather for 16bit components - // 256i - 0 1 2 3 4 5 6 7 - // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy - // - - // if we have at least one component out of x or y to fetch - if(info.numComps > 2) - { - // offset base to the next components(zw) in the vertex to gather - pSrcBase = GEP(pSrcBase, C((char)4)); - - vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask); - // e.g. 
result of second 8x32bit integer gather for 16bit components - // 256i - 0 1 2 3 4 5 6 7 - // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw - // - } - else - { - vGatherResult[1] = vGatherMaskedVal; - } - - // Shuffle gathered components into place, each row is a component - Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput); - - } - break; - case 32: - { - // apply defaults - for (uint32_t i = 0; i < 4; ++i) - { - vGatherComponents[i] = VIMMED1((int)info.defaults[i]); - } - - for(uint32_t i = 0; i < info.numComps; i++) - { - uint32_t swizzleIndex = info.swizzle[i]; - - // Gather a SIMD of components - vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask); - - // offset base to the next component to gather - pSrcBase = GEP(pSrcBase, C((char)4)); - } - } - break; - default: - SWR_INVALID("unsupported format"); - break; - } - } - - void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput) + Value* Builder::PMAXUD(Value* a, Value* b) { - // cast types - Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth); - Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits - - // input could either be float or int vector; do shuffle work in int - vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty); - vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty); - - if(bPackedOutput) - { - Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits - - // shuffle mask - Value* vConstMask = C({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, - 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15}); - Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy); - // after pshufb: group components together in each 128bit lane - // 256i - 0 1 2 3 4 5 6 7 - // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy - - Value* vi128XY = BITCAST(PERMD(vShufResult, C({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy); - // after PERMD: move and pack xy components into each 128bit lane - // 256i - 0 1 2 3 4 5 6 7 - // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy - - // do the same for zw components - Value* vi128ZW = nullptr; - if(info.numComps > 2) - { - Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy); - vi128ZW = BITCAST(PERMD(vShufResult, C({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy); - } - - for(uint32_t i = 0; i < 4; i++) - { - uint32_t swizzleIndex = info.swizzle[i]; - // todo: fixed for packed - Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i])); - if(i >= info.numComps) - { - // set the default component val - vGatherOutput[swizzleIndex] = vGatherMaskedVal; - continue; - } - - // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1 - uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1; - // if x or y, use vi128XY permute result, else use vi128ZW - Value* selectedPermute = (i < 2) ? 
vi128XY : vi128ZW; - - // extract packed component 128 bit lanes - vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane)); - } - - } - else - { - // pshufb masks for each component - Value* vConstMask[2]; - // x/z shuffle mask - vConstMask[0] = C({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, - 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, }); - - // y/w shuffle mask - vConstMask[1] = C({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1, - 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1}); - - - // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits - // apply defaults - for (uint32_t i = 0; i < 4; ++i) - { - vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]); - } - - for(uint32_t i = 0; i < info.numComps; i++) - { - uint32_t swizzleIndex = info.swizzle[i]; - - // select correct constMask for x/z or y/w pshufb - uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1; - // if x or y, use vi128XY permute result, else use vi128ZW - uint32_t selectedGather = (i < 2) ? 0 : 1; - - vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy); - // after pshufb mask for x channel; z uses the same shuffle from the second gather - // 256i - 0 1 2 3 4 5 6 7 - // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00 - } - } + Value* cmp = ICMP_UGT(a, b); + return SELECT(cmp, a, b); } - void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput) + Value* Builder::PMINUD(Value* a, Value* b) { - // cast types - Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth); - Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits - - if(bPackedOutput) - { - Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits - // shuffle mask - Value* vConstMask = C({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, - 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}); - Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy); - // after pshufb: group components together in each 128bit lane - // 256i - 0 1 2 3 4 5 6 7 - // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww - - Value* vi128XY = BITCAST(PERMD(vShufResult, C({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty); - // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane - // 256i - 0 1 2 3 4 5 6 7 - // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care) - - // do the same for zw components - Value* vi128ZW = nullptr; - if(info.numComps > 2) - { - vi128ZW = BITCAST(PERMD(vShufResult, C({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty); - } - - // sign extend all enabled components. If we have a fill vVertexElements, output to current simdvertex - for(uint32_t i = 0; i < 4; i++) - { - uint32_t swizzleIndex = info.swizzle[i]; - // todo: fix for packed - Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i])); - if(i >= info.numComps) - { - // set the default component val - vGatherOutput[swizzleIndex] = vGatherMaskedVal; - continue; - } - - // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1 - uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1; - // if x or y, use vi128XY permute result, else use vi128ZW - Value* selectedPermute = (i < 2) ? 
vi128XY : vi128ZW; - - // sign extend - vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane)); - } - } - // else zero extend - else{ - // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits - // apply defaults - for (uint32_t i = 0; i < 4; ++i) - { - vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]); - } - - for(uint32_t i = 0; i < info.numComps; i++){ - uint32_t swizzleIndex = info.swizzle[i]; - - // pshufb masks for each component - Value* vConstMask; - switch(i) - { - case 0: - // x shuffle mask - vConstMask = C({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1, - 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1}); - break; - case 1: - // y shuffle mask - vConstMask = C({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1, - 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1}); - break; - case 2: - // z shuffle mask - vConstMask = C({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1, - 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1}); - break; - case 3: - // w shuffle mask - vConstMask = C({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1, - 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1}); - break; - default: - vConstMask = nullptr; - break; - } - - vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy); - // after pshufb for x channel - // 256i - 0 1 2 3 4 5 6 7 - // x000 x000 x000 x000 x000 x000 x000 x000 - } - } + Value* cmp = ICMP_ULT(a, b); + return SELECT(cmp, a, b); } // Helper function to create alloca in entry block of function Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType) { auto saveIP = IRB()->saveIP(); - IRB()->SetInsertPoint(&pFunc->getEntryBlock(), - pFunc->getEntryBlock().begin()); + IRB()->SetInsertPoint(&pFunc->getEntryBlock(), pFunc->getEntryBlock().begin()); Value* pAlloca = ALLOCA(pType); - if (saveIP.isSet()) IRB()->restoreIP(saveIP); + if (saveIP.isSet()) + IRB()->restoreIP(saveIP); return pAlloca; } Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType, Value* pArraySize) { auto saveIP = IRB()->saveIP(); - IRB()->SetInsertPoint(&pFunc->getEntryBlock(), - pFunc->getEntryBlock().begin()); + IRB()->SetInsertPoint(&pFunc->getEntryBlock(), pFunc->getEntryBlock().begin()); Value* pAlloca = ALLOCA(pType, pArraySize); - if (saveIP.isSet()) IRB()->restoreIP(saveIP); + if (saveIP.isSet()) + IRB()->restoreIP(saveIP); return pAlloca; } - ////////////////////////////////////////////////////////////////////////// - /// @brief emulates a scatter operation. - /// @param pDst - pointer to destination - /// @param vSrc - vector of src data to scatter - /// @param vOffsets - vector of byte offsets from pDst - /// @param vMask - mask of valid lanes - void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask) + Value* Builder::VABSPS(Value* a) { - /* Scatter algorithm - - while(Index = BitScanForward(mask)) - srcElem = srcVector[Index] - offsetElem = offsetVector[Index] - *(pDst + offsetElem) = srcElem - Update mask (&= ~(1<GetInsertBlock(); - Function* pFunc = pCurBB->getParent(); - Type* pSrcTy = vSrc->getType()->getVectorElementType(); - - // Store vectors on stack - if (pScatterStackSrc == nullptr) - { - // Save off stack allocations and reuse per scatter. Significantly reduces stack - // requirements for shaders with a lot of scatters. 
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief emulates a scatter operation.
-    /// @param pDst - pointer to destination
-    /// @param vSrc - vector of src data to scatter
-    /// @param vOffsets - vector of byte offsets from pDst
-    /// @param vMask - mask of valid lanes
-    void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask)
+    Value* Builder::VABSPS(Value* a)
     {
-        /* Scatter algorithm
-
-           while(Index = BitScanForward(mask))
-                srcElem = srcVector[Index]
-                offsetElem = offsetVector[Index]
-                *(pDst + offsetElem) = srcElem
-                Update mask (&= ~(1<<Index))
-
-        */
-
-        BasicBlock* pCurBB = IRB()->GetInsertBlock();
-        Function* pFunc = pCurBB->getParent();
-        Type* pSrcTy = vSrc->getType()->getVectorElementType();
-
-        // Store vectors on stack
-        if (pScatterStackSrc == nullptr)
-        {
-            // Save off stack allocations and reuse per scatter. Significantly reduces stack
-            // requirements for shaders with a lot of scatters.
-            pScatterStackSrc = CreateEntryAlloca(pFunc, mSimdInt64Ty);
-            pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
-        }
-
-        Value* pSrcArrayPtr = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
-        Value* pOffsetsArrayPtr = pScatterStackOffsets;
-        STORE(vSrc, pSrcArrayPtr);
-        STORE(vOffsets, pOffsetsArrayPtr);
+        Value* asInt = BITCAST(a, mSimdInt32Ty);
+        Value* result = BITCAST(AND(asInt, VIMMED1(0x7fffffff)), mSimdFP32Ty);
+        return result;
+    }
 
-        // Cast to pointers for random access
-        pSrcArrayPtr = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
-        pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));
+    Value* Builder::ICLAMP(Value* src, Value* low, Value* high, const llvm::Twine& name)
+    {
+        Value* lowCmp = ICMP_SLT(src, low);
+        Value* ret = SELECT(lowCmp, low, src);
 
-        Value* pMask = VMOVMSKPS(BITCAST(vMask, mSimdFP32Ty));
+        Value* highCmp = ICMP_SGT(ret, high);
+        ret = SELECT(highCmp, high, ret, name);
 
-        // Get cttz function
-        Function* pfnCttz = Intrinsic::getDeclaration(mpJitMgr->mpCurrentModule, Intrinsic::cttz, { mInt32Ty });
-
-        // Setup loop basic block
-        BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter Loop", pFunc);
+        return ret;
+    }
 
-        // compute first set bit
-        Value* pIndex = CALL(pfnCttz, { pMask, C(false) });
+    Value* Builder::FCLAMP(Value* src, Value* low, Value* high)
+    {
+        Value* lowCmp = FCMP_OLT(src, low);
+        Value* ret = SELECT(lowCmp, low, src);
 
-        Value* pIsUndef = ICMP_EQ(pIndex, C(32));
+        Value* highCmp = FCMP_OGT(ret, high);
+        ret = SELECT(highCmp, high, ret);
 
-        // Split current block
-        BasicBlock* pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());
+        return ret;
+    }
 
-        // Remove unconditional jump created by splitBasicBlock
-        pCurBB->getTerminator()->eraseFromParent();
+    Value* Builder::FCLAMP(Value* src, float low, float high)
+    {
+        Value* result = VMAXPS(src, VIMMED1(low));
+        result = VMINPS(result, VIMMED1(high));
 
-        // Add terminator to end of original block
-        IRB()->SetInsertPoint(pCurBB);
+        return result;
+    }
 
-        // Add conditional branch
-        COND_BR(pIsUndef, pPostLoop, pLoop);
+    Value* Builder::FMADDPS(Value* a, Value* b, Value* c)
+    {
+        Value* vOut;
+        // This maps to LLVM fmuladd intrinsic
+        vOut = VFMADDPS(a, b, c);
+        return vOut;
+    }
 
-        // Add loop basic block contents
-        IRB()->SetInsertPoint(pLoop);
-        PHINode* pIndexPhi = PHI(mInt32Ty, 2);
-        PHINode* pMaskPhi = PHI(mInt32Ty, 2);
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief pop count on vector mask (e.g. <8 x i1>)
+    Value* Builder::VPOPCNT(Value* a) { return POPCNT(VMOVMSK(a)); }
 
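VPOPCNT above counts active lanes by compressing the vector mask to a scalar bitmask (VMOVMSK) and then applying a population count (POPCNT, i.e. llvm.ctpop). A scalar model (illustrative only):

    #include <cstdint>

    // e.g. an 8-lane mask compressed to the bits 0b00101101 -> 4 active lanes
    uint32_t vpopcnt_model(uint32_t movmsk)
    {
        uint32_t count = 0;
        for (; movmsk != 0; movmsk &= movmsk - 1) // clear lowest set bit
            ++count;
        return count;
    }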
-        pIndexPhi->addIncoming(pIndex, pCurBB);
-        pMaskPhi->addIncoming(pMask, pCurBB);
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Float / Fixed-point conversions
+    //////////////////////////////////////////////////////////////////////////
+    Value* Builder::VCVT_F32_FIXED_SI(Value* vFloat,
+                                      uint32_t numIntBits,
+                                      uint32_t numFracBits,
+                                      const llvm::Twine& name)
+    {
+        SWR_ASSERT((numIntBits + numFracBits) <= 32, "Can only handle 32-bit fixed-point values");
+        Value* fixed = nullptr;
 
-        // Extract elements for this index
-        Value* pSrcElem = LOADV(pSrcArrayPtr, { pIndexPhi });
-        Value* pOffsetElem = LOADV(pOffsetsArrayPtr, { pIndexPhi });
+#if 0 // This doesn't work for negative numbers!!
+        {
+            fixed = FP_TO_SI(VROUND(FMUL(vFloat, VIMMED1(float(1 << numFracBits))),
+                                    C(_MM_FROUND_TO_NEAREST_INT)),
+                             mSimdInt32Ty);
+        }
+        else
+#endif
+        {
+            // Do round to nearest int on fractional bits first
+            // Not entirely perfect for negative numbers, but close enough
+            vFloat = VROUND(FMUL(vFloat, VIMMED1(float(1 << numFracBits))),
+                            C(_MM_FROUND_TO_NEAREST_INT));
+            vFloat = FMUL(vFloat, VIMMED1(1.0f / float(1 << numFracBits)));
 
-        // GEP to this offset in dst
-        Value* pCurDst = GEP(pDst, pOffsetElem);
-        pCurDst = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
-        STORE(pSrcElem, pCurDst);
+            // TODO: Handle INF, NAN, overflow / underflow, etc.
 
-        // Update the mask
-        Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));
+            Value* vSgn = FCMP_OLT(vFloat, VIMMED1(0.0f));
+            Value* vFloatInt = BITCAST(vFloat, mSimdInt32Ty);
+            Value* vFixed = AND(vFloatInt, VIMMED1((1 << 23) - 1));
+            vFixed = OR(vFixed, VIMMED1(1 << 23));
+            vFixed = SELECT(vSgn, NEG(vFixed), vFixed);
 
-        // Terminator
-        Value* pNewIndex = CALL(pfnCttz, { pNewMask, C(false) });
+            Value* vExp = LSHR(SHL(vFloatInt, VIMMED1(1)), VIMMED1(24));
+            vExp = SUB(vExp, VIMMED1(127));
 
-        pIsUndef = ICMP_EQ(pNewIndex, C(32));
-        COND_BR(pIsUndef, pPostLoop, pLoop);
+            Value* vExtraBits = SUB(VIMMED1(23 - numFracBits), vExp);
 
-        // Update phi edges
-        pIndexPhi->addIncoming(pNewIndex, pLoop);
-        pMaskPhi->addIncoming(pNewMask, pLoop);
+            fixed = ASHR(vFixed, vExtraBits, name);
+        }
 
-        // Move builder to beginning of post loop
-        IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());
+        return fixed;
     }
 
-    Value* Builder::VABSPS(Value* a)
+    Value* Builder::VCVT_FIXED_SI_F32(Value* vFixed,
+                                      uint32_t numIntBits,
+                                      uint32_t numFracBits,
+                                      const llvm::Twine& name)
     {
-        Value* asInt = BITCAST(a, mSimdInt32Ty);
-        Value* result = BITCAST(AND(asInt, VIMMED1(0x7fffffff)), mSimdFP32Ty);
-        return result;
-    }
+        SWR_ASSERT((numIntBits + numFracBits) <= 32, "Can only handle 32-bit fixed-point values");
+        uint32_t extraBits = 32 - numIntBits - numFracBits;
+        if (numIntBits && extraBits)
+        {
+            // Sign extend
+            Value* shftAmt = VIMMED1(extraBits);
+            vFixed = ASHR(SHL(vFixed, shftAmt), shftAmt);
+        }
 
-    Value *Builder::ICLAMP(Value* src, Value* low, Value* high)
-    {
-        Value *lowCmp = ICMP_SLT(src, low);
-        Value *ret = SELECT(lowCmp, low, src);
+        Value* fVal = VIMMED1(0.0f);
+        Value* fFrac = VIMMED1(0.0f);
+        if (numIntBits)
+        {
+            fVal = SI_TO_FP(ASHR(vFixed, VIMMED1(numFracBits)), mSimdFP32Ty, name);
+        }
 
-        Value *highCmp = ICMP_SGT(ret, high);
-        ret = SELECT(highCmp, high, ret);
+        if (numFracBits)
+        {
+            fFrac = UI_TO_FP(AND(vFixed, VIMMED1((1 << numFracBits) - 1)), mSimdFP32Ty);
+            fFrac = FDIV(fFrac, VIMMED1(float(1 << numFracBits)), name);
+        }
 
-        return ret;
+        return FADD(fVal, fFrac, name);
     }
 
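VCVT_F32_FIXED_SI and VCVT_FIXED_SI_F32 convert between single-precision float and 32-bit signed fixed point. The enabled path rounds to the nearest representable fixed-point value, then rebuilds the result from the float's sign, exponent, and implicit-one mantissa. A scalar model of that bit-twiddling (a sketch that, like the TODO above, ignores INF/NaN and overflow, and assumes a normal, in-range input):

    #include <cstdint>
    #include <cstring>

    int32_t FloatToFixedSI(float f, uint32_t numFracBits)
    {
        uint32_t bits;
        std::memcpy(&bits, &f, sizeof(bits));                          // BITCAST
        int32_t mant = (int32_t)(bits & ((1u << 23) - 1)) | (1 << 23); // OR in implicit 1
        if (bits >> 31)
            mant = -mant;                     // SELECT(vSgn, NEG(vFixed), vFixed)
        int32_t exp = (int32_t)((bits >> 23) & 0xFF) - 127; // LSHR(SHL(x, 1), 24) - 127
        int32_t extraBits = (23 - (int32_t)numFracBits) - exp;
        return mant >> extraBits;             // arithmetic shift, as ASHR
    }
    // Example: FloatToFixedSI(1.5f, 8) == 384, i.e. 1.5 * 2^8 with 8 fraction bits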
-    Value *Builder::FCLAMP(Value* src, Value* low, Value* high)
+    Value* Builder::VCVT_F32_FIXED_UI(Value* vFloat,
+                                      uint32_t numIntBits,
+                                      uint32_t numFracBits,
+                                      const llvm::Twine& name)
     {
-        Value *lowCmp = FCMP_OLT(src, low);
-        Value *ret = SELECT(lowCmp, low, src);
+        SWR_ASSERT((numIntBits + numFracBits) <= 32, "Can only handle 32-bit fixed-point values");
+        Value* fixed = nullptr;
+#if 1 // KNOB_SIM_FAST_MATH? Below works correctly from a precision
+      // standpoint...
+        {
+            fixed = FP_TO_UI(VROUND(FMUL(vFloat, VIMMED1(float(1 << numFracBits))),
+                                    C(_MM_FROUND_TO_NEAREST_INT)),
+                             mSimdInt32Ty);
+        }
+#else
+        {
+            // Do round to nearest int on fractional bits first
+            vFloat = VROUND(FMUL(vFloat, VIMMED1(float(1 << numFracBits))),
+                            C(_MM_FROUND_TO_NEAREST_INT));
+            vFloat = FMUL(vFloat, VIMMED1(1.0f / float(1 << numFracBits)));
 
-        Value *highCmp = FCMP_OGT(ret, high);
-        ret = SELECT(highCmp, high, ret);
+            // TODO: Handle INF, NAN, overflow / underflow, etc.
 
-        return ret;
-    }
+            Value* vSgn = FCMP_OLT(vFloat, VIMMED1(0.0f));
+            Value* vFloatInt = BITCAST(vFloat, mSimdInt32Ty);
+            Value* vFixed = AND(vFloatInt, VIMMED1((1 << 23) - 1));
+            vFixed = OR(vFixed, VIMMED1(1 << 23));
 
-    Value *Builder::FCLAMP(Value* src, float low, float high)
-    {
-        Value* result = VMAXPS(src, VIMMED1(low));
-        result = VMINPS(result, VIMMED1(high));
+            Value* vExp = LSHR(SHL(vFloatInt, VIMMED1(1)), VIMMED1(24));
+            vExp = SUB(vExp, VIMMED1(127));
 
-        return result;
-    }
+            Value* vExtraBits = SUB(VIMMED1(23 - numFracBits), vExp);
 
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief save/restore stack, providing ability to push/pop the stack and
-    ///        reduce overall stack requirements for temporary stack use
-    Value* Builder::STACKSAVE()
-    {
-        Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave);
-        return CALLA(pfnStackSave);
+            fixed = LSHR(vFixed, vExtraBits, name);
+        }
+#endif
+        return fixed;
     }
 
-    void Builder::STACKRESTORE(Value* pSaved)
+    Value* Builder::VCVT_FIXED_UI_F32(Value* vFixed,
+                                      uint32_t numIntBits,
+                                      uint32_t numFracBits,
+                                      const llvm::Twine& name)
     {
-        Function* pfnStackRestore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stackrestore);
-        CALL(pfnStackRestore, std::initializer_list<Value*>{pSaved});
-    }
+        SWR_ASSERT((numIntBits + numFracBits) <= 32, "Can only handle 32-bit fixed-point values");
+        uint32_t extraBits = 32 - numIntBits - numFracBits;
+        if (numIntBits && extraBits)
+        {
+            // Sign extend
+            Value* shftAmt = VIMMED1(extraBits);
+            vFixed = ASHR(SHL(vFixed, shftAmt), shftAmt);
+        }
 
-    Value *Builder::FMADDPS(Value* a, Value* b, Value* c)
-    {
-        Value* vOut;
-        // use FMADs if available
-        if(JM()->mArch.AVX2())
+        Value* fVal = VIMMED1(0.0f);
+        Value* fFrac = VIMMED1(0.0f);
+        if (numIntBits)
         {
-            vOut = VFMADDPS(a, b, c);
+            fVal = UI_TO_FP(LSHR(vFixed, VIMMED1(numFracBits)), mSimdFP32Ty, name);
         }
-        else
+
+        if (numFracBits)
         {
-            vOut = FADD(FMUL(a, b), c);
+            fFrac = UI_TO_FP(AND(vFixed, VIMMED1((1 << numFracBits) - 1)), mSimdFP32Ty);
+            fFrac = FDIV(fFrac, VIMMED1(float(1 << numFracBits)), name);
         }
-        return vOut;
-    }
+
+        return FADD(fVal, fFrac, name);
+    }
 
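For the unsigned variants, the enabled `#if 1` path is simply scale, round-to-nearest, and truncate via FP_TO_UI, and the inverse splits the word into integer and fraction fields. A scalar equivalent (sketch only; it assumes numIntBits + numFracBits == 32 so the extra-bits sign-extension step above is a no-op, and relies on the default round-to-nearest-even mode to mirror _MM_FROUND_TO_NEAREST_INT):

    #include <cmath>
    #include <cstdint>

    // FMUL + VROUND(_MM_FROUND_TO_NEAREST_INT) + FP_TO_UI
    uint32_t FloatToFixedUI(float f, uint32_t numFracBits)
    {
        return (uint32_t)std::nearbyintf(f * (float)(1u << numFracBits));
    }

    // LSHR + UI_TO_FP for the integer part, AND + UI_TO_FP + FDIV for the fraction
    float FixedUIToFloat(uint32_t v, uint32_t numFracBits)
    {
        float fVal  = (float)(v >> numFracBits);
        float fFrac = (float)(v & ((1u << numFracBits) - 1)) / (float)(1u << numFracBits);
        return fVal + fFrac;
    }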
-    Value* Builder::POPCNT(Value* a)
-    {
-        Function* pCtPop = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::ctpop, { a->getType() });
-        return CALL(pCtPop, std::initializer_list<Value*>{a});
-    }
 
     //////////////////////////////////////////////////////////////////////////
     /// @brief C functions called by LLVM IR
     //////////////////////////////////////////////////////////////////////////
 
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief called in JIT code, inserted by PRINT
-    ///        output to both stdout and visual studio debug console
-    void __cdecl CallPrint(const char* fmt, ...)
-    {
-        va_list args;
-        va_start(args, fmt);
-        vprintf(fmt, args);
-
-#if defined( _WIN32 )
-        char strBuf[1024];
-        vsnprintf_s(strBuf, _TRUNCATE, fmt, args);
-        OutputDebugStringA(strBuf);
-#endif
-
-        va_end(args);
-    }
-
-    Value *Builder::VEXTRACTI128(Value* a, Constant* imm8)
+    Value* Builder::VEXTRACTI128(Value* a, Constant* imm8)
     {
-        bool flag = !imm8->isZeroValue();
-        SmallVector<Constant*,8> idx;
-        for (unsigned i = 0; i < mVWidth / 2; i++) {
+        bool                      flag = !imm8->isZeroValue();
+        SmallVector<Constant*, 8> idx;
+        for (unsigned i = 0; i < mVWidth / 2; i++)
+        {
             idx.push_back(C(flag ? i + mVWidth / 2 : i));
         }
         return VSHUFFLE(a, VUNDEF_I(), ConstantVector::get(idx));
     }
 
-    Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8)
+    Value* Builder::VINSERTI128(Value* a, Value* b, Constant* imm8)
     {
-        bool flag = !imm8->isZeroValue();
-        SmallVector<Constant*,8> idx;
-        for (unsigned i = 0; i < mVWidth; i++) {
+        bool                      flag = !imm8->isZeroValue();
+        SmallVector<Constant*, 8> idx;
+        for (unsigned i = 0; i < mVWidth; i++)
+        {
             idx.push_back(C(i));
         }
-        Value *inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx));
+        Value* inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx));
 
-        SmallVector<Constant*,8> idx2;
-        for (unsigned i = 0; i < mVWidth / 2; i++) {
+        SmallVector<Constant*, 8> idx2;
+        for (unsigned i = 0; i < mVWidth / 2; i++)
+        {
             idx2.push_back(C(flag ? i : i + mVWidth));
         }
-        for (unsigned i = mVWidth / 2; i < mVWidth; i++) {
+        for (unsigned i = mVWidth / 2; i < mVWidth; i++)
+        {
             idx2.push_back(C(flag ? i + mVWidth / 2 : i));
         }
         return VSHUFFLE(a, inter, ConstantVector::get(idx2));
     }
 
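VEXTRACTI128 and VINSERTI128 above synthesize the AVX 128-bit lane extract/insert purely from shufflevector index lists, so no target intrinsic is required. A small stand-alone program reproducing the index math for mVWidth == 8 and imm8 == 1 (illustrative only):

    #include <cstdio>

    int main()
    {
        const unsigned mVWidth = 8;
        const bool flag = true; // imm8 == 1, i.e. the upper 128-bit lane
        std::printf("extract:");
        for (unsigned i = 0; i < mVWidth / 2; i++)
            std::printf(" %u", flag ? i + mVWidth / 2 : i); // -> 4 5 6 7 (upper half of a)
        std::printf("\ninsert: ");
        for (unsigned i = 0; i < mVWidth / 2; i++)
            std::printf(" %u", flag ? i : i + mVWidth);     // -> 0 1 2 3 (low half of a)
        for (unsigned i = mVWidth / 2; i < mVWidth; i++)
            std::printf(" %u", flag ? i + mVWidth / 2 : i); // -> 8 9 10 11 (lanes of 'inter')
        std::printf("\n");
        return 0;
    }

Indices 8..11 select from the second shuffle operand ('inter', the widened b), so the result is a's low 128 bits followed by b's low 128 bits.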
@@ -1694,62 +994,75 @@ namespace SwrJit
     // rdtsc buckets macros
     void Builder::RDTSC_START(Value* pBucketMgr, Value* pId)
     {
-        // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
-        // buckets framework when single threaded
+        // @todo due to an issue with thread local storage propagation in llvm, we can only safely
+        // call into buckets framework when single threaded
         if (KNOB_SINGLE_THREADED)
         {
             std::vector<Type*> args{
-                PointerType::get(mInt32Ty, 0),  // pBucketMgr
-                mInt32Ty                        // id
+                PointerType::get(mInt32Ty, 0), // pBucketMgr
+                mInt32Ty                       // id
             };
             FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
-            Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy));
-            if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StartBucket") == nullptr)
+            Function* pFunc = cast<Function>(
+#if LLVM_VERSION_MAJOR >= 9
+                JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy).getCallee());
+#else
+                JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy));
+#endif
+            if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StartBucket") ==
+                nullptr)
             {
-                sys::DynamicLibrary::AddSymbol("BucketManager_StartBucket", (void*)&BucketManager_StartBucket);
+                sys::DynamicLibrary::AddSymbol("BucketManager_StartBucket",
+                                               (void*)&BucketManager_StartBucket);
             }
 
-            CALL(pFunc, { pBucketMgr, pId });
+            CALL(pFunc, {pBucketMgr, pId});
         }
     }
 
     void Builder::RDTSC_STOP(Value* pBucketMgr, Value* pId)
     {
-        // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
-        // buckets framework when single threaded
+        // @todo due to an issue with thread local storage propagation in llvm, we can only safely
+        // call into buckets framework when single threaded
         if (KNOB_SINGLE_THREADED)
        {
             std::vector<Type*> args{
-                PointerType::get(mInt32Ty, 0),  // pBucketMgr
-                mInt32Ty                        // id
+                PointerType::get(mInt32Ty, 0), // pBucketMgr
+                mInt32Ty                       // id
             };
             FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
-            Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy));
-            if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StopBucket") == nullptr)
+            Function* pFunc = cast<Function>(
+#if LLVM_VERSION_MAJOR >= 9
+                JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy).getCallee());
+#else
+                JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy));
+#endif
+            if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StopBucket") ==
+                nullptr)
             {
-                sys::DynamicLibrary::AddSymbol("BucketManager_StopBucket", (void*)&BucketManager_StopBucket);
+                sys::DynamicLibrary::AddSymbol("BucketManager_StopBucket",
+                                               (void*)&BucketManager_StopBucket);
             }
 
-            CALL(pFunc, { pBucketMgr, pId });
+            CALL(pFunc, {pBucketMgr, pId});
         }
     }
 
-
     uint32_t Builder::GetTypeSize(Type* pType)
     {
         if (pType->isStructTy())
         {
             uint32_t numElems = pType->getStructNumElements();
-            Type* pElemTy = pType->getStructElementType(0);
+            Type*    pElemTy  = pType->getStructElementType(0);
             return numElems * GetTypeSize(pElemTy);
         }
 
         if (pType->isArrayTy())
         {
             uint32_t numElems = pType->getArrayNumElements();
-            Type* pElemTy = pType->getArrayElementType();
+            Type*    pElemTy  = pType->getArrayElementType();
             return numElems * GetTypeSize(pElemTy);
         }
 
@@ -1777,4 +1090,4 @@ namespace SwrJit
         SWR_ASSERT(false, "Unimplemented type.");
         return 0;
     }
-}
+} // namespace SwrJit