diff --git a/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp b/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp
index baf3ab5aa8d..216121bc51f 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp
@@ -1,45 +1,48 @@
 /****************************************************************************
-* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file lower_x86.cpp
-*
-* @brief llvm pass to lower meta code to x86
-*
-* Notes:
-*
-******************************************************************************/
+ * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file lower_x86.cpp
+ *
+ * @brief llvm pass to lower meta code to x86
+ *
+ * Notes:
+ *
+ ******************************************************************************/
 #include "jit_pch.hpp"
 #include "passes.h"
 #include "JitManager.h"
+#include "common/simdlib.hpp"
+
 #include <unordered_map>
 
+extern "C" void ScatterPS_256(uint8_t*, SIMD256::Integer, SIMD256::Float, uint8_t, uint32_t);
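
 // Note: ScatterPS_256 is a precompiled host-side helper, not JIT'd code. The
 // LowerX86 pass constructor below registers this symbol with
 // sys::DynamicLibrary::AddSymbol() so that jitted code can resolve and call it.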
 
 namespace llvm
 {
     // forward declare the initializer
     void initializeLowerX86Pass(PassRegistry&);
-}
+} // namespace llvm
 
 namespace SwrJit
 {
@@ -47,15 +50,15 @@ namespace SwrJit
     enum TargetArch
     {
-        AVX = 0,
-        AVX2 = 1,
+        AVX    = 0,
+        AVX2   = 1,
         AVX512 = 2
     };
 
     enum TargetWidth
     {
-        W256 = 0,
-        W512 = 1,
+        W256       = 0,
+        W512       = 1,
         NUM_WIDTHS = 2
     };
 
@@ -65,111 +68,190 @@ namespace SwrJit
     struct X86Intrinsic
     {
-        Intrinsic::ID intrin[NUM_WIDTHS];
-        EmuFunc emuFunc;
+        IntrinsicID intrin[NUM_WIDTHS];
+        EmuFunc     emuFunc;
     };
 
-    // Map of intrinsics that haven't been moved to the new mechanism yet. If used, these get the previous behavior of
-    // mapping directly to avx/avx2 intrinsics.
-    static std::map<std::string, Intrinsic::ID> intrinsicMap = {
-        {"meta.intrinsic.BEXTR_32", Intrinsic::x86_bmi_bextr_32},
-        {"meta.intrinsic.VPSHUFB", Intrinsic::x86_avx2_pshuf_b},
-        {"meta.intrinsic.VCVTPS2PH", Intrinsic::x86_vcvtps2ph_256},
-        {"meta.intrinsic.VPTESTC", Intrinsic::x86_avx_ptestc_256},
-        {"meta.intrinsic.VPTESTZ", Intrinsic::x86_avx_ptestz_256},
-        {"meta.intrinsic.VFMADDPS", Intrinsic::x86_fma_vfmadd_ps_256},
-        {"meta.intrinsic.VPHADDD", Intrinsic::x86_avx2_phadd_d},
-        {"meta.intrinsic.PDEP32", Intrinsic::x86_bmi_pdep_32},
-        {"meta.intrinsic.RDTSC", Intrinsic::x86_rdtsc},
-    };
+    // Map of intrinsics that haven't been moved to the new mechanism yet. If used, these get the
+    // previous behavior of mapping directly to avx/avx2 intrinsics.
+    using intrinsicMap_t = std::map<std::string, IntrinsicID>;
+    static intrinsicMap_t& getIntrinsicMap()
+    {
+        static std::map<std::string, IntrinsicID> intrinsicMap = {
+            {"meta.intrinsic.BEXTR_32", Intrinsic::x86_bmi_bextr_32},
+            {"meta.intrinsic.VPSHUFB", Intrinsic::x86_avx2_pshuf_b},
+            {"meta.intrinsic.VCVTPS2PH", Intrinsic::x86_vcvtps2ph_256},
+            {"meta.intrinsic.VPTESTC", Intrinsic::x86_avx_ptestc_256},
+            {"meta.intrinsic.VPTESTZ", Intrinsic::x86_avx_ptestz_256},
+            {"meta.intrinsic.VPHADDD", Intrinsic::x86_avx2_phadd_d},
+            {"meta.intrinsic.PDEP32", Intrinsic::x86_bmi_pdep_32},
+            {"meta.intrinsic.RDTSC", Intrinsic::x86_rdtsc}
+        };
+        return intrinsicMap;
+    }
 
     // Forward decls
     Instruction* NO_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
-    Instruction* VPERM_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
-    Instruction* VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
-    Instruction* VROUND_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
-    Instruction* VHSUB_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
+    Instruction*
+    VPERM_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
+    Instruction*
+    VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
+    Instruction*
+    VSCATTER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
+    Instruction*
+    VROUND_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
+    Instruction*
+    VHSUB_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
+    Instruction*
+    VCONVERT_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
+
+    Instruction* DOUBLE_EMU(LowerX86*     pThis,
+                            TargetArch    arch,
+                            TargetWidth   width,
+                            CallInst*     pCallInst,
+                            Intrinsic::ID intrin);
 
-    Instruction* DOUBLE_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst, Intrinsic::ID intrin);
-
     static Intrinsic::ID DOUBLE = (Intrinsic::ID)-1;
 
-    static std::map<std::string, X86Intrinsic> intrinsicMap2[] = {
-        //              256 wide                                    512 wide
-        {   // AVX
-            {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx_rcp_ps_256, DOUBLE}, NO_EMU}},
-            {"meta.intrinsic.VPERMPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
-            {"meta.intrinsic.VPERMD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
-            {"meta.intrinsic.VGATHERPD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
-            {"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
-            {"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
-            {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx_cvt_pd2_ps_256, Intrinsic::not_intrinsic}, NO_EMU}},
-            {"meta.intrinsic.VCVTPH2PS", {{Intrinsic::x86_vcvtph2ps_256, Intrinsic::not_intrinsic}, NO_EMU}},
-            {"meta.intrinsic.VROUND", {{Intrinsic::x86_avx_round_ps_256, DOUBLE}, NO_EMU}},
-            {"meta.intrinsic.VHSUBPS", {{Intrinsic::x86_avx_hsub_ps_256, DOUBLE}, NO_EMU}},
-        },
-        {   // AVX2
-            {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx_rcp_ps_256, DOUBLE}, NO_EMU}},
-            {"meta.intrinsic.VPERMPS", {{Intrinsic::x86_avx2_permps, Intrinsic::not_intrinsic}, VPERM_EMU}},
-            {"meta.intrinsic.VPERMD", {{Intrinsic::x86_avx2_permd, Intrinsic::not_intrinsic}, VPERM_EMU}},
-            {"meta.intrinsic.VGATHERPD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
-            {"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
-            {"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
-            {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx_cvt_pd2_ps_256, Intrinsic::not_intrinsic}, NO_EMU}},
-            {"meta.intrinsic.VCVTPH2PS", {{Intrinsic::x86_vcvtph2ps_256, Intrinsic::not_intrinsic}, NO_EMU}},
-            {"meta.intrinsic.VROUND", {{Intrinsic::x86_avx_round_ps_256, DOUBLE}, NO_EMU}},
-            {"meta.intrinsic.VHSUBPS", {{Intrinsic::x86_avx_hsub_ps_256, DOUBLE}, NO_EMU}},
-        },
-        {   // AVX512
-            {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx512_rcp14_ps_256, Intrinsic::x86_avx512_rcp14_ps_512}, NO_EMU}},
-            {"meta.intrinsic.VPERMPS", {{Intrinsic::x86_avx512_mask_permvar_sf_256, Intrinsic::x86_avx512_mask_permvar_sf_512}, NO_EMU}},
-            {"meta.intrinsic.VPERMD", {{Intrinsic::x86_avx512_mask_permvar_si_256, Intrinsic::x86_avx512_mask_permvar_si_512}, NO_EMU}},
-            {"meta.intrinsic.VGATHERPD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
-            {"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
-            {"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
-            {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx512_mask_cvtpd2ps_256, Intrinsic::x86_avx512_mask_cvtpd2ps_512}, NO_EMU}},
-            {"meta.intrinsic.VCVTPH2PS", {{Intrinsic::x86_avx512_mask_vcvtph2ps_256, Intrinsic::x86_avx512_mask_vcvtph2ps_512}, NO_EMU}},
-            {"meta.intrinsic.VROUND", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VROUND_EMU}},
-            {"meta.intrinsic.VHSUBPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VHSUB_EMU}},
+    using intrinsicMapAdvanced_t = std::vector<std::map<std::string, X86Intrinsic>>;
+
+    static intrinsicMapAdvanced_t& getIntrinsicMapAdvanced()
+    {
+        // clang-format off
+        static intrinsicMapAdvanced_t intrinsicMapAdvanced = {
+            //              256 wide                                    512 wide
+            {
+                // AVX
+                {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx_rcp_ps_256, DOUBLE}, NO_EMU}},
+                {"meta.intrinsic.VPERMPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
+                {"meta.intrinsic.VPERMD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
+                {"meta.intrinsic.VGATHERPD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
+                {"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
+                {"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
+                {"meta.intrinsic.VSCATTERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VSCATTER_EMU}},
+                {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx_cvt_pd2_ps_256, Intrinsic::not_intrinsic}, NO_EMU}},
+                {"meta.intrinsic.VROUND", {{Intrinsic::x86_avx_round_ps_256, DOUBLE}, NO_EMU}},
+                {"meta.intrinsic.VHSUBPS", {{Intrinsic::x86_avx_hsub_ps_256, DOUBLE}, NO_EMU}},
+            },
+            {
+                // AVX2
+                {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx_rcp_ps_256, DOUBLE}, NO_EMU}},
+                {"meta.intrinsic.VPERMPS", {{Intrinsic::x86_avx2_permps, Intrinsic::not_intrinsic}, VPERM_EMU}},
+                {"meta.intrinsic.VPERMD", {{Intrinsic::x86_avx2_permd, Intrinsic::not_intrinsic}, VPERM_EMU}},
+                {"meta.intrinsic.VGATHERPD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
+                {"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
+                {"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
+                {"meta.intrinsic.VSCATTERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VSCATTER_EMU}},
+                {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx_cvt_pd2_ps_256, DOUBLE}, NO_EMU}},
+                {"meta.intrinsic.VROUND", {{Intrinsic::x86_avx_round_ps_256, DOUBLE}, NO_EMU}},
+                {"meta.intrinsic.VHSUBPS", {{Intrinsic::x86_avx_hsub_ps_256, DOUBLE}, NO_EMU}},
+            },
+            {
+                // AVX512
+                {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx512_rcp14_ps_256, Intrinsic::x86_avx512_rcp14_ps_512}, NO_EMU}},
+#if LLVM_VERSION_MAJOR < 7
+                {"meta.intrinsic.VPERMPS", {{Intrinsic::x86_avx512_mask_permvar_sf_256, Intrinsic::x86_avx512_mask_permvar_sf_512}, NO_EMU}},
+                {"meta.intrinsic.VPERMD", {{Intrinsic::x86_avx512_mask_permvar_si_256, Intrinsic::x86_avx512_mask_permvar_si_512}, NO_EMU}},
+#else
+                {"meta.intrinsic.VPERMPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
+                {"meta.intrinsic.VPERMD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
+#endif
+                {"meta.intrinsic.VGATHERPD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
+                {"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
+                {"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
+                {"meta.intrinsic.VSCATTERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VSCATTER_EMU}},
+#if LLVM_VERSION_MAJOR < 7
+                {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx512_mask_cvtpd2ps_256, Intrinsic::x86_avx512_mask_cvtpd2ps_512}, NO_EMU}},
+#else
+                {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VCONVERT_EMU}},
+#endif
+                {"meta.intrinsic.VROUND", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VROUND_EMU}},
+                {"meta.intrinsic.VHSUBPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VHSUB_EMU}}
+            }};
+        // clang-format on
+        return intrinsicMapAdvanced;
+    }
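
    // How an entry in the table above is consumed (a sketch that mirrors
    // ProcessIntrinsicAdvanced() below; the AVX2/W256 case is assumed):
    //
    //   const X86Intrinsic& entry = getIntrinsicMapAdvanced()[AVX2]["meta.intrinsic.VROUND"];
    //   IntrinsicID         id    = entry.intrin[W256]; // -> x86_avx_round_ps_256
    //   // id == DOUBLE        -> split the op in half and double pump (DOUBLE_EMU)
    //   // id == not_intrinsic -> no native form; call entry.emuFunc to emit plain IR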
+
+    static uint32_t getBitWidth(VectorType *pVTy)
+    {
+#if LLVM_VERSION_MAJOR >= 11
+        return pVTy->getNumElements() * pVTy->getElementType()->getPrimitiveSizeInBits();
+#else
+        return pVTy->getBitWidth();
+#endif
+    }
-    };
 
     struct LowerX86 : public FunctionPass
     {
-        LowerX86(JitManager* pJitMgr = nullptr, Builder* b = nullptr)
-            : FunctionPass(ID), mpJitMgr(pJitMgr), B(b)
+        LowerX86(Builder* b = nullptr) : FunctionPass(ID), B(b)
         {
             initializeLowerX86Pass(*PassRegistry::getPassRegistry());
 
             // Determine target arch
-            if (mpJitMgr->mArch.AVX512F())
+            if (JM()->mArch.AVX512F())
             {
                 mTarget = AVX512;
             }
-            else if (mpJitMgr->mArch.AVX2())
+            else if (JM()->mArch.AVX2())
             {
                 mTarget = AVX2;
             }
-            else if (mpJitMgr->mArch.AVX())
+            else if (JM()->mArch.AVX())
             {
                 mTarget = AVX;
-            } else {
+            }
+            else
+            {
                 SWR_ASSERT(false, "Unsupported AVX architecture.");
                 mTarget = AVX;
             }
+
+            // Setup scatter function for 256 wide
+            uint32_t curWidth = B->mVWidth;
+            B->SetTargetWidth(8);
+            std::vector<Type*> args = {
+                B->mInt8PtrTy,   // pBase
+                B->mSimdInt32Ty, // vIndices
+                B->mSimdFP32Ty,  // vSrc
+                B->mInt8Ty,      // mask
+                B->mInt32Ty      // scale
+            };
+
+            FunctionType* pfnScatterTy = FunctionType::get(B->mVoidTy, args, false);
+            mPfnScatter256 = cast<Function>(
+#if LLVM_VERSION_MAJOR >= 9
+                B->JM()->mpCurrentModule->getOrInsertFunction("ScatterPS_256", pfnScatterTy).getCallee());
+#else
+                B->JM()->mpCurrentModule->getOrInsertFunction("ScatterPS_256", pfnScatterTy));
+#endif
+            if (sys::DynamicLibrary::SearchForAddressOfSymbol("ScatterPS_256") == nullptr)
+            {
+                sys::DynamicLibrary::AddSymbol("ScatterPS_256", (void*)&ScatterPS_256);
+            }
+
+            B->SetTargetWidth(curWidth);
         }
 
         // Try to decipher the vector type of the instruction. This does not work properly
         // across all intrinsics, and will have to be rethought. Probably need something
         // similar to llvm's getDeclaration() utility to map a set of inputs to a specific typed
         // intrinsic.
-        void GetRequestedWidthAndType(CallInst* pCallInst, TargetWidth* pWidth, Type** pTy)
+        void GetRequestedWidthAndType(CallInst*       pCallInst,
+                                      const StringRef intrinName,
+                                      TargetWidth*    pWidth,
+                                      Type**          pTy)
         {
-            uint32_t vecWidth;
+            assert(pCallInst);
             Type* pVecTy = pCallInst->getType();
+
+            // Check for intrinsic specific types
+            // VCVTPD2PS type comes from src, not dst
+            if (intrinName.equals("meta.intrinsic.VCVTPD2PS"))
+            {
+                Value* pOp = pCallInst->getOperand(0);
+                assert(pOp);
+                pVecTy = pOp->getType();
+            }
+
             if (!pVecTy->isVectorTy())
             {
                 for (auto& op : pCallInst->arg_operands())
@@ -183,12 +265,17 @@ namespace SwrJit
             }
             SWR_ASSERT(pVecTy->isVectorTy(), "Couldn't determine vector size");
 
-            uint32_t width = cast<VectorType>(pVecTy)->getBitWidth();
+            uint32_t width = getBitWidth(cast<VectorType>(pVecTy));
             switch (width)
             {
-            case 256: *pWidth = W256; break;
-            case 512: *pWidth = W512; break;
-            default: SWR_ASSERT(false, "Unhandled vector width %d", width);
+            case 256:
+                *pWidth = W256;
+                break;
+            case 512:
+                *pWidth = W512;
+                break;
+            default:
+                SWR_ASSERT(false, "Unhandled vector width %d", width);
                 *pWidth = W256;
             }
 
            *pTy = pVecTy->getScalarType();
         }
 
@@ -200,11 +287,17 @@ namespace SwrJit
            uint32_t numElem = 0;
            switch (width)
            {
-            case W256: numElem = 8; break;
-            case W512: numElem = 16; break;
+            case W256:
+                numElem = 8;
+                break;
+            case W512:
+                numElem = 16;
+                break;
+            default:
+                SWR_ASSERT(false, "Unhandled vector width type %d\n", width);
            }
 
-            return ConstantVector::getNullValue(VectorType::get(pTy, numElem));
+            return ConstantVector::getNullValue(getVectorType(pTy, numElem));
         }
 
         Value* GetMask(TargetWidth width)
@@ -212,8 +305,14 @@ namespace SwrJit
         {
            Value* mask;
            switch (width)
            {
-            case W256: mask = B->C((uint8_t)-1); break;
-            case W512: mask = B->C((uint16_t)-1); break;
+            case W256:
+                mask = B->C((uint8_t)-1);
+                break;
+            case W512:
+                mask = B->C((uint16_t)-1);
+                break;
+            default:
+                SWR_ASSERT(false, "Unhandled vector width type %d\n", width);
            }
            return mask;
         }
 
@@ -221,26 +320,33 @@ namespace SwrJit
         // Convert mask to x86 mask
         Value* VectorMask(Value* vi1Mask)
         {
+#if LLVM_VERSION_MAJOR >= 11
+            uint32_t numElem = cast<VectorType>(vi1Mask->getType())->getNumElements();
+#else
            uint32_t numElem = vi1Mask->getType()->getVectorNumElements();
-            return B->S_EXT(vi1Mask, VectorType::get(B->mInt32Ty, numElem));
+#endif
+            return B->S_EXT(vi1Mask, getVectorType(B->mInt32Ty, numElem));
         }
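
        // Illustration of what VectorMask() produces (lane values hypothetical):
        //
        //   <8 x i1> <1, 0, 1, 0, ...>  --S_EXT-->  <8 x i32> <-1, 0, -1, 0, ...>
        //
        // AVX/AVX2 gather instructions test only the sign bit of each lane, so a
        // sign-extended all-ones lane is a safe hardware "true".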
 
         Instruction* ProcessIntrinsicAdvanced(CallInst* pCallInst)
         {
-            Function* pFunc = pCallInst->getCalledFunction();
-            auto& intrinsic = intrinsicMap2[mTarget][pFunc->getName()];
+            Function* pFunc = pCallInst->getCalledFunction();
+            assert(pFunc);
+
+            auto& intrinsic = getIntrinsicMapAdvanced()[mTarget][pFunc->getName().str()];
             TargetWidth vecWidth;
-            Type* pElemTy;
-            GetRequestedWidthAndType(pCallInst, &vecWidth, &pElemTy);
+            Type*       pElemTy;
+            GetRequestedWidthAndType(pCallInst, pFunc->getName(), &vecWidth, &pElemTy);
 
             // Check if there is a native intrinsic for this instruction
-            Intrinsic::ID id = intrinsic.intrin[vecWidth];
+            IntrinsicID id = intrinsic.intrin[vecWidth];
             if (id == DOUBLE)
             {
                 // Double pump the next smaller SIMD intrinsic
                 SWR_ASSERT(vecWidth != 0, "Cannot double pump smallest SIMD width.");
                 Intrinsic::ID id2 = intrinsic.intrin[vecWidth - 1];
-                SWR_ASSERT(id2 != Intrinsic::not_intrinsic, "Cannot find intrinsic to double pump.");
+                SWR_ASSERT(id2 != Intrinsic::not_intrinsic,
+                           "Cannot find intrinsic to double pump.");
                 return DOUBLE_EMU(this, mTarget, vecWidth, pCallInst, id2);
             }
             else if (id != Intrinsic::not_intrinsic)
@@ -252,12 +358,23 @@ namespace SwrJit
                     args.push_back(arg.get());
                 }
 
-                // If AVX512, all instructions add a src operand and mask. We'll pass in 0 src and full mask for now
-                // Assuming the intrinsics are consistent and place the src operand and mask last in the argument list.
+                // If AVX512, all instructions add a src operand and mask. We'll pass in 0 src and
+                // full mask for now. Assuming the intrinsics are consistent and place the src
+                // operand and mask last in the argument list.
                 if (mTarget == AVX512)
                 {
-                    args.push_back(GetZeroVec(vecWidth, pElemTy));
-                    args.push_back(GetMask(vecWidth));
+                    if (pFunc->getName().equals("meta.intrinsic.VCVTPD2PS"))
+                    {
+                        args.push_back(GetZeroVec(W256, pCallInst->getType()->getScalarType()));
+                        args.push_back(GetMask(W256));
+                        // for AVX512 VCVTPD2PS, we also have to add rounding mode
+                        args.push_back(B->C(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
+                    }
+                    else
+                    {
+                        args.push_back(GetZeroVec(vecWidth, pElemTy));
+                        args.push_back(GetMask(vecWidth));
+                    }
                 }
 
                 return B->CALLA(pIntrin, args);
@@ -275,17 +392,21 @@ namespace SwrJit
         Instruction* ProcessIntrinsic(CallInst* pCallInst)
         {
             Function* pFunc = pCallInst->getCalledFunction();
-
+            assert(pFunc);
+
             // Forward to the advanced support if found
-            if (intrinsicMap2[mTarget].find(pFunc->getName()) != intrinsicMap2[mTarget].end())
+            if (getIntrinsicMapAdvanced()[mTarget].find(pFunc->getName().str()) != getIntrinsicMapAdvanced()[mTarget].end())
             {
                 return ProcessIntrinsicAdvanced(pCallInst);
             }
 
-            SWR_ASSERT(intrinsicMap.find(pFunc->getName()) != intrinsicMap.end(), "Unimplemented intrinsic %s.", pFunc->getName());
+            SWR_ASSERT(getIntrinsicMap().find(pFunc->getName().str()) != getIntrinsicMap().end(),
+                       "Unimplemented intrinsic %s.",
+                       pFunc->getName().str().c_str());
 
-            Intrinsic::ID x86Intrinsic = intrinsicMap[pFunc->getName()];
-            Function* pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, x86Intrinsic);
+            Intrinsic::ID x86Intrinsic = getIntrinsicMap()[pFunc->getName().str()];
+            Function*     pX86IntrinFunc =
+                Intrinsic::getDeclaration(B->JM()->mpCurrentModule, x86Intrinsic);
 
             SmallVector<Value*, 8> args;
             for (auto& arg : pCallInst->arg_operands())
@@ -301,26 +422,41 @@ namespace SwrJit
         virtual bool runOnFunction(Function& F)
         {
             std::vector<Instruction*> toRemove;
+            std::vector<BasicBlock*>  bbs;
+
+            // Make temp copy of the basic blocks and instructions, as the intrinsic
+            // replacement code might invalidate the iterators
+            for (auto& b : F.getBasicBlockList())
+            {
+                bbs.push_back(&b);
+            }
 
-            for (auto& BB : F.getBasicBlockList())
+            for (auto* BB : bbs)
             {
-                for (auto& I : BB.getInstList())
+                std::vector<Instruction*> insts;
+                for (auto& i : BB->getInstList())
                 {
-                    if (CallInst* pCallInst = dyn_cast<CallInst>(&I))
+                    insts.push_back(&i);
+                }
+
+                for (auto* I : insts)
+                {
+                    if (CallInst* pCallInst = dyn_cast<CallInst>(I))
                     {
                         Function* pFunc = pCallInst->getCalledFunction();
                         if (pFunc)
                         {
                             if (pFunc->getName().startswith("meta.intrinsic"))
                             {
-                                B->IRB()->SetInsertPoint(&I);
+                                B->IRB()->SetInsertPoint(I);
                                 Instruction* pReplace = ProcessIntrinsic(pCallInst);
-                                SWR_ASSERT(pReplace);
                                 toRemove.push_back(pCallInst);
-                                pCallInst->replaceAllUsesWith(pReplace);
+                                if (pReplace)
+                                {
+                                    pCallInst->replaceAllUsesWith(pReplace);
+                                }
                             }
                         }
-
                     }
                 }
             }
 
@@ -335,26 +471,19 @@ namespace SwrJit
             return true;
         }
 
-        virtual void getAnalysisUsage(AnalysisUsage& AU) const
-        {
-        }
+        virtual void getAnalysisUsage(AnalysisUsage& AU) const {}
 
-        JitManager* JM() { return mpJitMgr; }
+        JitManager* JM() { return B->JM(); }
+        Builder*    B;
+
+        TargetArch mTarget;
+        Function*  mPfnScatter256;
 
-        JitManager* mpJitMgr;
-        Builder* B;
-
-        TargetArch mTarget;
-
-        static char ID; ///< Needed by LLVM to generate ID for FunctionPass.
+        static char ID; ///< Needed by LLVM to generate ID for FunctionPass.
     };
 
-    char LowerX86::ID = 0; // LLVM uses address of ID as the actual ID.
+    char LowerX86::ID = 0; // LLVM uses address of ID as the actual ID.
 
-    FunctionPass* createLowerX86Pass(JitManager* pJitMgr, Builder* b)
-    {
-        return new LowerX86(pJitMgr, b);
-    }
+    FunctionPass* createLowerX86Pass(Builder* b) { return new LowerX86(b); }
 
     Instruction* NO_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
     {
@@ -367,9 +496,9 @@ namespace SwrJit
         // Only need vperm emulation for AVX
         SWR_ASSERT(arch == AVX);
 
-        Builder* B = pThis->B;
-        auto v32A = pCallInst->getArgOperand(0);
-        auto vi32Index = pCallInst->getArgOperand(1);
+        Builder* B         = pThis->B;
+        auto     v32A      = pCallInst->getArgOperand(0);
+        auto     vi32Index = pCallInst->getArgOperand(1);
 
         Value* v32Result;
         if (isa<Constant>(vi32Index))
         {
@@ -380,70 +509,90 @@ namespace SwrJit
         else
         {
             v32Result = UndefValue::get(v32A->getType());
-            for (uint32_t l = 0; l < v32A->getType()->getVectorNumElements(); ++l)
+#if LLVM_VERSION_MAJOR >= 11
+            uint32_t numElem = cast<VectorType>(v32A->getType())->getNumElements();
+#else
+            uint32_t numElem = v32A->getType()->getVectorNumElements();
+#endif
+            for (uint32_t l = 0; l < numElem; ++l)
             {
                 auto i32Index = B->VEXTRACT(vi32Index, B->C(l));
-                auto val = B->VEXTRACT(v32A, i32Index);
-                v32Result = B->VINSERT(v32Result, val, B->C(l));
+                auto val      = B->VEXTRACT(v32A, i32Index);
+                v32Result     = B->VINSERT(v32Result, val, B->C(l));
             }
         }
         return cast<Instruction>(v32Result);
     }
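
    // The VPERM semantics emulated above, per output lane l (sketch):
    //
    //   v32Result[l] = v32A[vi32Index[l]]
    //
    // AVX1 has no cross-lane variable permute for 8 x 32-bit elements, hence the
    // scalar VEXTRACT/VINSERT loop whenever the indices are not constants.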
 
-    Instruction* VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
+    Instruction*
+    VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
     {
-        Builder* B = pThis->B;
-        auto vSrc = pCallInst->getArgOperand(0);
-        auto pBase = pCallInst->getArgOperand(1);
-        auto vi32Indices = pCallInst->getArgOperand(2);
-        auto vi1Mask = pCallInst->getArgOperand(3);
-        auto i8Scale = pCallInst->getArgOperand(4);
-
-        pBase = B->INT_TO_PTR(pBase, PointerType::get(B->mInt8Ty, 0));
-        uint32_t numElem = vSrc->getType()->getVectorNumElements();
-        auto i32Scale = B->Z_EXT(i8Scale, B->mInt32Ty);
-        auto srcTy = vSrc->getType()->getVectorElementType();
-        Value* v32Gather;
+        Builder* B           = pThis->B;
+        auto     vSrc        = pCallInst->getArgOperand(0);
+        auto     pBase       = pCallInst->getArgOperand(1);
+        auto     vi32Indices = pCallInst->getArgOperand(2);
+        auto     vi1Mask     = pCallInst->getArgOperand(3);
+        auto     i8Scale     = pCallInst->getArgOperand(4);
+
+        pBase = B->POINTER_CAST(pBase, PointerType::get(B->mInt8Ty, 0));
+#if LLVM_VERSION_MAJOR >= 11
+        VectorType* pVectorType = cast<VectorType>(vSrc->getType());
+        uint32_t    numElem     = pVectorType->getNumElements();
+        auto        srcTy       = pVectorType->getElementType();
+#else
+        uint32_t numElem = vSrc->getType()->getVectorNumElements();
+        auto     srcTy   = vSrc->getType()->getVectorElementType();
+#endif
+        auto i32Scale = B->Z_EXT(i8Scale, B->mInt32Ty);
+
+        Value* v32Gather = nullptr;
         if (arch == AVX)
         {
             // Full emulation for AVX
             // Store source on stack to provide a valid address to load from inactive lanes
             auto pStack = B->STACKSAVE();
-            auto pTmp = B->ALLOCA(vSrc->getType());
+            auto pTmp   = B->ALLOCA(vSrc->getType());
             B->STORE(vSrc, pTmp);
 
-            v32Gather = UndefValue::get(vSrc->getType());
-            auto vi32Scale = ConstantVector::getSplat(numElem, cast<Constant>(i32Scale));
+            v32Gather = UndefValue::get(vSrc->getType());
+#if LLVM_VERSION_MAJOR > 10
+            auto vi32Scale = ConstantVector::getSplat(ElementCount::get(numElem, false), cast<Constant>(i32Scale));
+#else
+            auto vi32Scale = ConstantVector::getSplat(numElem, cast<Constant>(i32Scale));
+#endif
            auto vi32Offsets = B->MUL(vi32Indices, vi32Scale);
 
            for (uint32_t i = 0; i < numElem; ++i)
            {
-                auto i32Offset = B->VEXTRACT(vi32Offsets, B->C(i));
-                auto pLoadAddress = B->GEP(pBase, i32Offset);
-                pLoadAddress = B->BITCAST(pLoadAddress, PointerType::get(srcTy, 0));
-                auto pMaskedLoadAddress = B->GEP(pTmp, { 0, i });
-                auto i1Mask = B->VEXTRACT(vi1Mask, B->C(i));
-                auto pValidAddress = B->SELECT(i1Mask, pLoadAddress, pMaskedLoadAddress);
-                auto val = B->LOAD(pValidAddress);
-                v32Gather = B->VINSERT(v32Gather, val, B->C(i));
+                auto i32Offset          = B->VEXTRACT(vi32Offsets, B->C(i));
+                auto pLoadAddress       = B->GEP(pBase, i32Offset);
+                pLoadAddress            = B->BITCAST(pLoadAddress, PointerType::get(srcTy, 0));
+                auto pMaskedLoadAddress = B->GEP(pTmp, {0, i});
+                auto i1Mask             = B->VEXTRACT(vi1Mask, B->C(i));
+                auto pValidAddress      = B->SELECT(i1Mask, pLoadAddress, pMaskedLoadAddress);
+                auto val                = B->LOAD(pValidAddress);
+                v32Gather               = B->VINSERT(v32Gather, val, B->C(i));
            }
 
            B->STACKRESTORE(pStack);
         }
         else if (arch == AVX2 || (arch == AVX512 && width == W256))
         {
-            Function* pX86IntrinFunc;
+            Function* pX86IntrinFunc = nullptr;
            if (srcTy == B->mFP32Ty)
            {
-                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx2_gather_d_ps_256);
-            }
+                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
+                                                           Intrinsic::x86_avx2_gather_d_ps_256);
+            }
            else if (srcTy == B->mInt32Ty)
            {
-                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx2_gather_d_d_256);
+                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
+                                                           Intrinsic::x86_avx2_gather_d_d_256);
            }
            else if (srcTy == B->mDoubleTy)
            {
-                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx2_gather_d_q_256);
+                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
+                                                           Intrinsic::x86_avx2_gather_d_q_256);
            }
            else
            {
@@ -453,37 +602,64 @@ namespace SwrJit
            if (width == W256)
            {
                auto v32Mask = B->BITCAST(pThis->VectorMask(vi1Mask), vSrc->getType());
-                v32Gather = B->CALL(pX86IntrinFunc, { vSrc, pBase, vi32Indices, v32Mask, i8Scale });
+                v32Gather    = B->CALL(pX86IntrinFunc, {vSrc, pBase, vi32Indices, v32Mask, i8Scale});
            }
            else if (width == W512)
            {
                // Double pump 4-wide for 64bit elements
+#if LLVM_VERSION_MAJOR >= 11
+                if (cast<VectorType>(vSrc->getType())->getElementType() == B->mDoubleTy)
+#else
                if (vSrc->getType()->getVectorElementType() == B->mDoubleTy)
+#endif
                {
-                    auto v64Mask = B->S_EXT(pThis->VectorMask(vi1Mask), B->mInt64Ty);
+                    auto v64Mask = pThis->VectorMask(vi1Mask);
+#if LLVM_VERSION_MAJOR >= 11
+                    uint32_t numElem = cast<VectorType>(v64Mask->getType())->getNumElements();
+#else
+                    uint32_t numElem = v64Mask->getType()->getVectorNumElements();
+#endif
+                    v64Mask = B->S_EXT(v64Mask, getVectorType(B->mInt64Ty, numElem));
                    v64Mask = B->BITCAST(v64Mask, vSrc->getType());
 
-                    Value* src0 = B->VSHUFFLE(vSrc, vSrc, B->C({ 0, 1, 2, 3 }));
-                    Value* src1 = B->VSHUFFLE(vSrc, vSrc, B->C({ 4, 5, 6, 7 }));
+                    Value* src0 = B->VSHUFFLE(vSrc, vSrc, B->C({0, 1, 2, 3}));
+                    Value* src1 = B->VSHUFFLE(vSrc, vSrc, B->C({4, 5, 6, 7}));
 
-                    Value* indices0 = B->VSHUFFLE(vi32Indices, vi32Indices, B->C({ 0, 1, 2, 3 }));
-                    Value* indices1 = B->VSHUFFLE(vi32Indices, vi32Indices, B->C({ 4, 5, 6, 7 }));
+                    Value* indices0 = B->VSHUFFLE(vi32Indices, vi32Indices, B->C({0, 1, 2, 3}));
+                    Value* indices1 = B->VSHUFFLE(vi32Indices, vi32Indices, B->C({4, 5, 6, 7}));
 
-                    Value* mask0 = B->VSHUFFLE(v64Mask, v64Mask, B->C({ 0, 1, 2, 3 }));
-                    Value* mask1 = B->VSHUFFLE(v64Mask, v64Mask, B->C({ 4, 5, 6, 7 }));
+                    Value* mask0 = B->VSHUFFLE(v64Mask, v64Mask, B->C({0, 1, 2, 3}));
+                    Value* mask1 = B->VSHUFFLE(v64Mask, v64Mask, B->C({4, 5, 6, 7}));
 
-                    Value* gather0 = B->CALL(pX86IntrinFunc, { src0, pBase, indices0, mask0, i8Scale });
-                    Value* gather1 = B->CALL(pX86IntrinFunc, { src1, pBase, indices1, mask1, i8Scale });
-
-                    v32Gather = B->VSHUFFLE(gather0, gather1, B->C({ 0, 1, 2, 3, 4, 5, 6, 7 }));
+#if LLVM_VERSION_MAJOR >= 11
+                    uint32_t numElemSrc0  = cast<VectorType>(src0->getType())->getNumElements();
+                    uint32_t numElemMask0 = cast<VectorType>(mask0->getType())->getNumElements();
+                    uint32_t numElemSrc1  = cast<VectorType>(src1->getType())->getNumElements();
+                    uint32_t numElemMask1 = cast<VectorType>(mask1->getType())->getNumElements();
+#else
+                    uint32_t numElemSrc0  = src0->getType()->getVectorNumElements();
+                    uint32_t numElemMask0 = mask0->getType()->getVectorNumElements();
+                    uint32_t numElemSrc1  = src1->getType()->getVectorNumElements();
+                    uint32_t numElemMask1 = mask1->getType()->getVectorNumElements();
+#endif
+                    src0  = B->BITCAST(src0, getVectorType(B->mInt64Ty, numElemSrc0));
+                    mask0 = B->BITCAST(mask0, getVectorType(B->mInt64Ty, numElemMask0));
+                    Value* gather0 =
+                        B->CALL(pX86IntrinFunc, {src0, pBase, indices0, mask0, i8Scale});
+                    src1  = B->BITCAST(src1, getVectorType(B->mInt64Ty, numElemSrc1));
+                    mask1 = B->BITCAST(mask1, getVectorType(B->mInt64Ty, numElemMask1));
+                    Value* gather1 =
+                        B->CALL(pX86IntrinFunc, {src1, pBase, indices1, mask1, i8Scale});
+                    v32Gather = B->VSHUFFLE(gather0, gather1, B->C({0, 1, 2, 3, 4, 5, 6, 7}));
+                    v32Gather = B->BITCAST(v32Gather, vSrc->getType());
                }
                else
                {
                    // Double pump 8-wide for 32bit elements
                    auto v32Mask = pThis->VectorMask(vi1Mask);
-                    v32Mask = B->BITCAST(v32Mask, vSrc->getType());
-                    Value* src0 = B->EXTRACT_16(vSrc, 0);
-                    Value* src1 = B->EXTRACT_16(vSrc, 1);
+                    v32Mask      = B->BITCAST(v32Mask, vSrc->getType());
+                    Value* src0  = B->EXTRACT_16(vSrc, 0);
+                    Value* src1  = B->EXTRACT_16(vSrc, 1);
 
                    Value* indices0 = B->EXTRACT_16(vi32Indices, 0);
                    Value* indices1 = B->EXTRACT_16(vi32Indices, 1);
@@ -491,8 +667,10 @@ namespace SwrJit
                    Value* mask0 = B->EXTRACT_16(v32Mask, 0);
                    Value* mask1 = B->EXTRACT_16(v32Mask, 1);
 
-                    Value* gather0 = B->CALL(pX86IntrinFunc, { src0, pBase, indices0, mask0, i8Scale });
-                    Value* gather1 = B->CALL(pX86IntrinFunc, { src1, pBase, indices1, mask1, i8Scale });
+                    Value* gather0 =
+                        B->CALL(pX86IntrinFunc, {src0, pBase, indices0, mask0, i8Scale});
+                    Value* gather1 =
+                        B->CALL(pX86IntrinFunc, {src1, pBase, indices1, mask1, i8Scale});
 
                    v32Gather = B->JOIN_16(gather0, gather1);
                }
@@ -500,22 +678,25 @@ namespace SwrJit
            }
         }
         else if (arch == AVX512)
         {
-            Value* iMask;
-            Function* pX86IntrinFunc;
+            Value*    iMask          = nullptr;
+            Function* pX86IntrinFunc = nullptr;
            if (srcTy == B->mFP32Ty)
            {
-                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx512_gather_dps_512);
-                iMask = B->BITCAST(vi1Mask, B->mInt16Ty);
+                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
+                                                           Intrinsic::x86_avx512_gather_dps_512);
+                iMask          = B->BITCAST(vi1Mask, B->mInt16Ty);
            }
            else if (srcTy == B->mInt32Ty)
            {
-                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx512_gather_dpi_512);
-                iMask = B->BITCAST(vi1Mask, B->mInt16Ty);
+                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
+                                                           Intrinsic::x86_avx512_gather_dpi_512);
+                iMask          = B->BITCAST(vi1Mask, B->mInt16Ty);
            }
            else if (srcTy == B->mDoubleTy)
            {
-                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx512_gather_dpd_512);
-                iMask = B->BITCAST(vi1Mask, B->mInt8Ty);
+                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
+                                                           Intrinsic::x86_avx512_gather_dpd_512);
+                iMask          = B->BITCAST(vi1Mask, B->mInt8Ty);
            }
            else
            {
@@ -523,21 +704,89 @@ namespace SwrJit
            }
 
            auto i32Scale = B->Z_EXT(i8Scale, B->mInt32Ty);
-            v32Gather = B->CALL(pX86IntrinFunc, { vSrc, pBase, vi32Indices, iMask, i32Scale });
+            v32Gather     = B->CALL(pX86IntrinFunc, {vSrc, pBase, vi32Indices, iMask, i32Scale});
         }
 
         return cast<Instruction>(v32Gather);
     }
 
+    Instruction*
+    VSCATTER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
+    {
+        Builder* B           = pThis->B;
+        auto     pBase       = pCallInst->getArgOperand(0);
+        auto     vi1Mask     = pCallInst->getArgOperand(1);
+        auto     vi32Indices = pCallInst->getArgOperand(2);
+        auto     v32Src      = pCallInst->getArgOperand(3);
+        auto     i32Scale    = pCallInst->getArgOperand(4);
+
+        if (arch != AVX512)
+        {
+            // Call into C function to do the scatter. This has significantly better compile perf
+            // compared to jitting scatter loops for every scatter
+            if (width == W256)
+            {
+                auto mask = B->BITCAST(vi1Mask, B->mInt8Ty);
+                B->CALL(pThis->mPfnScatter256, {pBase, vi32Indices, v32Src, mask, i32Scale});
+            }
+            else
+            {
+                // Need to break up 512 wide scatter to two 256 wide
+                auto maskLo = B->VSHUFFLE(vi1Mask, vi1Mask, B->C({0, 1, 2, 3, 4, 5, 6, 7}));
+                auto indicesLo =
+                    B->VSHUFFLE(vi32Indices, vi32Indices, B->C({0, 1, 2, 3, 4, 5, 6, 7}));
+                auto srcLo = B->VSHUFFLE(v32Src, v32Src, B->C({0, 1, 2, 3, 4, 5, 6, 7}));
+
+                auto mask = B->BITCAST(maskLo, B->mInt8Ty);
+                B->CALL(pThis->mPfnScatter256, {pBase, indicesLo, srcLo, mask, i32Scale});
+
+                auto maskHi = B->VSHUFFLE(vi1Mask, vi1Mask, B->C({8, 9, 10, 11, 12, 13, 14, 15}));
+                auto indicesHi =
+                    B->VSHUFFLE(vi32Indices, vi32Indices, B->C({8, 9, 10, 11, 12, 13, 14, 15}));
+                auto srcHi = B->VSHUFFLE(v32Src, v32Src, B->C({8, 9, 10, 11, 12, 13, 14, 15}));
+
+                mask = B->BITCAST(maskHi, B->mInt8Ty);
+                B->CALL(pThis->mPfnScatter256, {pBase, indicesHi, srcHi, mask, i32Scale});
+            }
+            return nullptr;
+        }
 
-    // No support for vroundps in avx512 (it is available in kncni), so emulate with avx instructions
-    Instruction* VROUND_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
+        Value*    iMask;
+        Function* pX86IntrinFunc;
+        if (width == W256)
+        {
+            // No direct intrinsic supported in llvm to scatter 8 elem with 32bit indices, but we
+            // can use the scatter of 8 elements with 64bit indices
+            pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
+                                                       Intrinsic::x86_avx512_scatter_qps_512);
+
+            auto vi32IndicesExt = B->Z_EXT(vi32Indices, B->mSimdInt64Ty);
+            iMask               = B->BITCAST(vi1Mask, B->mInt8Ty);
+            B->CALL(pX86IntrinFunc, {pBase, iMask, vi32IndicesExt, v32Src, i32Scale});
+        }
+        else if (width == W512)
+        {
+            pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
+                                                       Intrinsic::x86_avx512_scatter_dps_512);
+            iMask          = B->BITCAST(vi1Mask, B->mInt16Ty);
+            B->CALL(pX86IntrinFunc, {pBase, iMask, vi32Indices, v32Src, i32Scale});
+        }
+        return nullptr;
+    }
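
    // For reference, a plausible scalar shape for the ScatterPS_256 helper used
    // by the non-AVX512 path above (the real definition lives elsewhere in the
    // driver; the lane-access syntax here is illustrative only):
    //
    //   extern "C" void ScatterPS_256(uint8_t* pBase, SIMD256::Integer vIndices,
    //                                 SIMD256::Float vSrc, uint8_t mask, uint32_t scale)
    //   {
    //       for (uint32_t lane = 0; lane < 8; ++lane)
    //       {
    //           if (mask & (1u << lane)) // store active lanes only
    //           {
    //               uint8_t* pDst = pBase + vIndices[lane] * scale;
    //               *reinterpret_cast<float*>(pDst) = vSrc[lane];
    //           }
    //       }
    //   }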
+
+    // No support for vroundps in avx512 (it is available in kncni), so emulate with avx
+    // instructions
+    Instruction*
+    VROUND_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
     {
         SWR_ASSERT(arch == AVX512);
 
-        auto B = pThis->B;
+        auto B       = pThis->B;
         auto vf32Src = pCallInst->getOperand(0);
+        assert(vf32Src);
         auto i8Round = pCallInst->getOperand(1);
-        auto pfnFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx_round_ps_256);
+        assert(i8Round);
+        auto pfnFunc =
+            Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx_round_ps_256);
 
         if (width == W256)
         {
@@ -561,26 +810,56 @@ namespace SwrJit
            return nullptr;
         }
 
+    Instruction*
+    VCONVERT_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
+    {
+        SWR_ASSERT(arch == AVX512);
+
+        auto B       = pThis->B;
+        auto vf32Src = pCallInst->getOperand(0);
+
+        if (width == W256)
+        {
+            auto vf32SrcRound = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
+                                                          Intrinsic::x86_avx_round_ps_256);
+            return cast<Instruction>(B->FP_TRUNC(vf32SrcRound, B->mFP32Ty));
+        }
+        else if (width == W512)
+        {
+            // 512 can use intrinsic
+            auto pfnFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
+                                                     Intrinsic::x86_avx512_mask_cvtpd2ps_512);
+            return cast<Instruction>(B->CALL(pfnFunc, vf32Src));
+        }
+        else
+        {
+            SWR_ASSERT(false, "Unimplemented vector width.");
+        }
+
+        return nullptr;
+    }
+
     // No support for hsub in AVX512
     Instruction* VHSUB_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
     {
         SWR_ASSERT(arch == AVX512);
 
-        auto B = pThis->B;
+        auto B    = pThis->B;
         auto src0 = pCallInst->getOperand(0);
         auto src1 = pCallInst->getOperand(1);
 
         // 256b hsub can just use avx intrinsic
         if (width == W256)
         {
-            auto pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx_hsub_ps_256);
+            auto pX86IntrinFunc =
+                Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx_hsub_ps_256);
            return cast<Instruction>(B->CALL2(pX86IntrinFunc, src0, src1));
         }
         else if (width == W512)
         {
            // 512b hsub can be accomplished with shuf/sub combo
-            auto minuend = B->VSHUFFLE(src0, src1, B->C({ 0, 2, 8, 10, 4, 6, 12, 14 }));
-            auto subtrahend = B->VSHUFFLE(src0, src1, B->C({ 1, 3, 9, 11, 5, 7, 13, 15 }));
+            auto minuend    = B->VSHUFFLE(src0, src1, B->C({0, 2, 8, 10, 4, 6, 12, 14}));
+            auto subtrahend = B->VSHUFFLE(src0, src1, B->C({1, 3, 9, 11, 5, 7, 13, 15}));
            return cast<Instruction>(B->SUB(minuend, subtrahend));
         }
         else
@@ -590,30 +869,67 @@ namespace SwrJit
         }
     }
 
-    // Double pump input using Intrin template arg. This blindly extracts lower and upper 256 from each vector argument and
-    // calls the 256 wide intrinsic, then merges the results to 512 wide
-    Instruction* DOUBLE_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst, Intrinsic::ID intrin)
+    // Double pump input using Intrin template arg. This blindly extracts lower and upper 256 from
+    // each vector argument and calls the 256 wide intrinsic, then merges the results to 512 wide
+    Instruction* DOUBLE_EMU(LowerX86*     pThis,
+                            TargetArch    arch,
+                            TargetWidth   width,
+                            CallInst*     pCallInst,
+                            Intrinsic::ID intrin)
     {
         auto B = pThis->B;
         SWR_ASSERT(width == W512);
-        Value* result[2];
+        Value*    result[2];
         Function* pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, intrin);
         for (uint32_t i = 0; i < 2; ++i)
         {
            SmallVector<Value*, 8> args;
            for (auto& arg : pCallInst->arg_operands())
            {
-                args.push_back(arg.get()->getType()->isVectorTy() ? B->EXTRACT_16(arg.get(), i) : arg.get());
+                auto argType = arg.get()->getType();
+                if (argType->isVectorTy())
+                {
+#if LLVM_VERSION_MAJOR >= 11
+                    uint32_t vecWidth = cast<VectorType>(argType)->getNumElements();
+                    auto     elemTy   = cast<VectorType>(argType)->getElementType();
+#else
+                    uint32_t vecWidth = argType->getVectorNumElements();
+                    auto     elemTy   = argType->getVectorElementType();
+#endif
+                    Value* lanes     = B->CInc(i * vecWidth / 2, vecWidth / 2);
+                    Value* argToPush = B->VSHUFFLE(arg.get(), B->VUNDEF(elemTy, vecWidth), lanes);
+                    args.push_back(argToPush);
+                }
+                else
+                {
+                    args.push_back(arg.get());
+                }
            }
            result[i] = B->CALLA(pX86IntrinFunc, args);
         }
-        return cast<Instruction>(B->JOIN_16(result[0], result[1]));
+        uint32_t vecWidth;
+        if (result[0]->getType()->isVectorTy())
+        {
+            assert(result[1]->getType()->isVectorTy());
+#if LLVM_VERSION_MAJOR >= 11
+            vecWidth = cast<VectorType>(result[0]->getType())->getNumElements() +
+                       cast<VectorType>(result[1]->getType())->getNumElements();
+#else
+            vecWidth = result[0]->getType()->getVectorNumElements() +
+                       result[1]->getType()->getVectorNumElements();
+#endif
+        }
+        else
+        {
+            vecWidth = 2;
+        }
+        Value* lanes = B->CInc(0, vecWidth);
+        return cast<Instruction>(B->VSHUFFLE(result[0], result[1], lanes));
     }
-}
+} // namespace SwrJit
 
 using namespace SwrJit;
 
 INITIALIZE_PASS_BEGIN(LowerX86, "LowerX86", "LowerX86", false, false)
 INITIALIZE_PASS_END(LowerX86, "LowerX86", "LowerX86", false, false)
-
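
// Typical use of the pass (a sketch; the pass-manager plumbing shown is an
// assumption, not part of this file):
//
//   llvm::legacy::FunctionPassManager fpm(pJitMgr->mpCurrentModule);
//   fpm.add(SwrJit::createLowerX86Pass(pBuilder));
//   fpm.run(*pFunc); // meta.intrinsic.* calls are now native x86 or emulated IR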