}
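+# Each entry is [<macro name>, <LLVM Intrinsic::ID name>, [<operand names>], <builder return type member>].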
intrinsics = [
- ['VGATHERPD', 'x86_avx2_gather_d_pd_256', ['src', 'pBase', 'indices', 'mask', 'scale']],
- ['VGATHERPS', 'x86_avx2_gather_d_ps_256', ['src', 'pBase', 'indices', 'mask', 'scale']],
- ['VGATHERPS_16', 'x86_avx512_gather_dps_512', ['src', 'pBase', 'indices', 'mask', 'scale']],
- ['VGATHERDD', 'x86_avx2_gather_d_d_256', ['src', 'pBase', 'indices', 'mask', 'scale']],
- ['VGATHERDD_16', 'x86_avx512_gather_dpi_512', ['src', 'pBase', 'indices', 'mask', 'scale']],
- ['VRCPPS', 'x86_avx_rcp_ps_256', ['a']],
- ['VROUND', 'x86_avx_round_ps_256', ['a', 'rounding']],
- ['BEXTR_32', 'x86_bmi_bextr_32', ['src', 'control']],
- ['VPSHUFB', 'x86_avx2_pshuf_b', ['a', 'b']],
- ['VPERMD', 'x86_avx2_permd', ['a', 'idx']],
- ['VPERMPS', 'x86_avx2_permps', ['idx', 'a']],
- ['VCVTPD2PS', 'x86_avx_cvt_pd2_ps_256', ['a']],
- ['VCVTPH2PS', 'x86_vcvtph2ps_256', ['a']],
- ['VCVTPS2PH', 'x86_vcvtps2ph_256', ['a', 'round']],
- ['VHSUBPS', 'x86_avx_hsub_ps_256', ['a', 'b']],
- ['VPTESTC', 'x86_avx_ptestc_256', ['a', 'b']],
- ['VPTESTZ', 'x86_avx_ptestz_256', ['a', 'b']],
- ['VFMADDPS', 'x86_fma_vfmadd_ps_256', ['a', 'b', 'c']],
- ['VMOVMSKPS', 'x86_avx_movmsk_ps_256', ['a']],
- ['VPHADDD', 'x86_avx2_phadd_d', ['a', 'b']],
- ['PDEP32', 'x86_bmi_pdep_32', ['a', 'b']],
- ['RDTSC', 'x86_rdtsc', []],
+ ['VGATHERPD', 'x86_avx2_gather_d_pd_256', ['src', 'pBase', 'indices', 'mask', 'scale'], 'mSimd4FP64Ty'],
+ ['VGATHERPS', 'x86_avx2_gather_d_ps_256', ['src', 'pBase', 'indices', 'mask', 'scale'], 'mSimdFP32Ty'],
+ ['VGATHERPS_16', 'x86_avx512_gather_dps_512', ['src', 'pBase', 'indices', 'mask', 'scale'], 'mSimd16FP32Ty'],
+ ['VGATHERDD', 'x86_avx2_gather_d_d_256', ['src', 'pBase', 'indices', 'mask', 'scale'], 'mSimdInt32Ty'],
+ ['VGATHERDD_16', 'x86_avx512_gather_dpi_512', ['src', 'pBase', 'indices', 'mask', 'scale'], 'mSimd16Int32Ty'],
+ ['VRCPPS', 'x86_avx_rcp_ps_256', ['a'], 'mSimdFP32Ty'],
+ ['VROUND', 'x86_avx_round_ps_256', ['a', 'rounding'], 'mSimdFP32Ty'],
+ ['BEXTR_32', 'x86_bmi_bextr_32', ['src', 'control'], 'mInt32Ty'],
+ ['VPSHUFB', 'x86_avx2_pshuf_b', ['a', 'b'], 'mSimd32Int8Ty'],
+ ['VPERMD', 'x86_avx2_permd', ['a', 'idx'], 'mSimdInt32Ty'],
+ ['VPERMPS', 'x86_avx2_permps', ['idx', 'a'], 'mSimdFP32Ty'],
+ ['VCVTPD2PS', 'x86_avx_cvt_pd2_ps_256', ['a'], 'mSimdFP32Ty'],
+ ['VCVTPH2PS', 'x86_vcvtph2ps_256', ['a'], 'mSimdFP32Ty'],
+ ['VCVTPS2PH', 'x86_vcvtps2ph_256', ['a', 'round'], 'mSimdFP16Ty'],
+ ['VHSUBPS', 'x86_avx_hsub_ps_256', ['a', 'b'], 'mSimdFP32Ty'],
+ ['VPTESTC', 'x86_avx_ptestc_256', ['a', 'b'], 'mInt32Ty'],
+ ['VPTESTZ', 'x86_avx_ptestz_256', ['a', 'b'], 'mInt32Ty'],
+ ['VFMADDPS', 'x86_fma_vfmadd_ps_256', ['a', 'b', 'c'], 'mSimdFP32Ty'],
+ ['VMOVMSKPS', 'x86_avx_movmsk_ps_256', ['a'], 'mInt32Ty'],
+ ['VPHADDD', 'x86_avx2_phadd_d', ['a', 'b'], 'mSimdInt32Ty'],
+ ['PDEP32', 'x86_bmi_pdep_32', ['a', 'b'], 'mInt32Ty'],
+ ['RDTSC', 'x86_rdtsc', [], 'mInt64Ty'],
]
llvm_intrinsics = [
'''
Auto-generates macros for LLVM IR
'''
-def generate_x86_h(output_dir):
- filename = 'gen_builder_x86.hpp'
+def generate_meta_h(output_dir):
+ filename = 'gen_builder_meta.hpp'
output_filename = os.path.join(output_dir, filename)
functions = []
functions.append({
'decl' : decl,
+ 'name' : inst[0],
'intrin' : inst[1],
'args' : inst[2],
+ 'returnType': inst[3]
})
MakoTemplateWriter.to_file(
template,
output_filename,
cmdline=sys.argv,
- comment='x86 intrinsics',
+ comment='meta intrinsics',
filename=filename,
functions=functions,
isX86=True, isIntrin=False)
parser.add_argument('--input', '-i', type=FileType('r'), help='Path to IRBuilder.h', required=False)
parser.add_argument('--output-dir', '-o', action='store', dest='output', help='Path to output directory', required=True)
parser.add_argument('--gen_h', help='Generate builder_gen.h', action='store_true', default=False)
- parser.add_argument('--gen_x86_h', help='Generate x86 intrinsics. No input is needed.', action='store_true', default=False)
+ parser.add_argument('--gen_meta_h', help='Generate meta intrinsics. No input is needed.', action='store_true', default=False)
parser.add_argument('--gen_intrin_h', help='Generate llvm intrinsics. No input is needed.', action='store_true', default=False)
args = parser.parse_args()
elif args.gen_h:
print('Need to specify --input for --gen_h!')
- if args.gen_x86_h:
- generate_x86_h(args.output)
+ if args.gen_meta_h:
+ generate_meta_h(args.output)
if args.gen_intrin_h:
generate_intrin_h(args.output)
--- /dev/null
+/****************************************************************************
+* Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file lower_x86.cpp
+*
+* @brief llvm pass to lower meta code to x86
+*
+* Notes:
+*
+******************************************************************************/
+
+#include "jit_pch.hpp"
+#include "passes.h"
+#include "JitManager.h"
+
+#include <unordered_map>
+
+
+namespace llvm
+{
+    // forward declare the initializer
+ void initializeLowerX86Pass(PassRegistry&);
+}
+
+namespace SwrJit
+{
+ using namespace llvm;
+
+ enum TargetArch
+ {
+ AVX = 0,
+ AVX2 = 1,
+ AVX512 = 2
+ };
+
+ enum TargetWidth
+ {
+ W256 = 0,
+ W512 = 1,
+ NUM_WIDTHS = 2
+ };
+
+ struct LowerX86;
+
+ typedef std::function<Instruction*(LowerX86*, TargetArch, TargetWidth, CallInst*)> EmuFunc;
+
+ struct X86Intrinsic
+ {
+ Intrinsic::ID intrin[NUM_WIDTHS];
+ EmuFunc emuFunc;
+ };
+
+    // Map of intrinsics that haven't been moved to the new mechanism yet. If used, these get the previous
+    // behavior of mapping directly to AVX/AVX2 intrinsics.
+ static std::map<std::string, Intrinsic::ID> intrinsicMap = {
+ {"meta.intrinsic.VGATHERPD", Intrinsic::x86_avx2_gather_d_pd_256},
+ {"meta.intrinsic.VROUND", Intrinsic::x86_avx_round_ps_256},
+ {"meta.intrinsic.BEXTR_32", Intrinsic::x86_bmi_bextr_32},
+ {"meta.intrinsic.VPSHUFB", Intrinsic::x86_avx2_pshuf_b},
+ {"meta.intrinsic.VCVTPD2PS", Intrinsic::x86_avx_cvt_pd2_ps_256},
+ {"meta.intrinsic.VCVTPH2PS", Intrinsic::x86_vcvtph2ps_256},
+ {"meta.intrinsic.VCVTPS2PH", Intrinsic::x86_vcvtps2ph_256},
+ {"meta.intrinsic.VHSUBPS", Intrinsic::x86_avx_hsub_ps_256},
+ {"meta.intrinsic.VPTESTC", Intrinsic::x86_avx_ptestc_256},
+ {"meta.intrinsic.VPTESTZ", Intrinsic::x86_avx_ptestz_256},
+ {"meta.intrinsic.VFMADDPS", Intrinsic::x86_fma_vfmadd_ps_256},
+ {"meta.intrinsic.VMOVMSKPS", Intrinsic::x86_avx_movmsk_ps_256},
+ {"meta.intrinsic.VPHADDD", Intrinsic::x86_avx2_phadd_d},
+ {"meta.intrinsic.PDEP32", Intrinsic::x86_bmi_pdep_32},
+ {"meta.intrinsic.RDTSC", Intrinsic::x86_rdtsc},
+ };
+
+ // Forward decls
+ Instruction* NO_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
+ Instruction* VPERM_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
+ Instruction* VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
+
+ static std::map<std::string, X86Intrinsic> intrinsicMap2[] = {
+ // 256 wide 512 wide
+ { // AVX
+ {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx_rcp_ps_256, Intrinsic::not_intrinsic}, NO_EMU}},
+ {"meta.intrinsic.VPERMPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
+ {"meta.intrinsic.VPERMD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
+ {"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
+ {"meta.intrinsic.VGATHERPS_16",{{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
+ {"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
+ {"meta.intrinsic.VGATHERDD_16",{{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
+ },
+ { // AVX2
+ {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx_rcp_ps_256, Intrinsic::not_intrinsic}, NO_EMU}},
+ {"meta.intrinsic.VPERMPS", {{Intrinsic::x86_avx2_permps, Intrinsic::not_intrinsic}, VPERM_EMU}},
+ {"meta.intrinsic.VPERMD", {{Intrinsic::x86_avx2_permd, Intrinsic::not_intrinsic}, VPERM_EMU}},
+ {"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
+ {"meta.intrinsic.VGATHERPS_16",{{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
+ {"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
+ {"meta.intrinsic.VGATHERDD_16",{{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
+ },
+ { // AVX512
+ {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx512_rcp14_ps_256, Intrinsic::x86_avx512_rcp14_ps_512}, NO_EMU}},
+ {"meta.intrinsic.VPERMPS", {{Intrinsic::x86_avx512_mask_permvar_sf_256, Intrinsic::x86_avx512_mask_permvar_sf_512}, NO_EMU}},
+ {"meta.intrinsic.VPERMD", {{Intrinsic::x86_avx512_mask_permvar_si_256, Intrinsic::x86_avx512_mask_permvar_si_512}, NO_EMU}},
+ {"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
+ {"meta.intrinsic.VGATHERPS_16",{{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
+ {"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
+ {"meta.intrinsic.VGATHERDD_16",{{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
+ }
+ };
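+    // Illustrative example (not generated code): on AVX/AVX2 a 256-bit call such as
+    //     %r = call <8 x float> @meta.intrinsic.VRCPPS(<8 x float> %a)
+    // maps straight to the native intrinsic
+    //     %r = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %a)
+    // while any slot left as not_intrinsic falls back to the table's EmuFunc.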
+
+ struct LowerX86 : public FunctionPass
+ {
+ LowerX86(JitManager* pJitMgr = nullptr, Builder* b = nullptr)
+ : FunctionPass(ID), mpJitMgr(pJitMgr), B(b)
+ {
+ initializeLowerX86Pass(*PassRegistry::getPassRegistry());
+
+ // Determine target arch
+ if (mpJitMgr->mArch.AVX512F())
+ {
+ mTarget = AVX512;
+ }
+ else if (mpJitMgr->mArch.AVX2())
+ {
+ mTarget = AVX2;
+ }
+ else if (mpJitMgr->mArch.AVX())
+ {
+                mTarget = AVX;
+            }
+ else
+ {
+ SWR_ASSERT(false, "Unsupported AVX architecture.");
+ mTarget = AVX;
+ }
+ }
+
+        // Try to decipher the vector type of the instruction. This does not work properly
+        // across all intrinsics, and will have to be rethought. We probably need something
+        // similar to LLVM's getDeclaration() utility to map a set of inputs to a specific
+        // typed intrinsic.
+ void GetRequestedWidthAndType(CallInst* pCallInst, TargetWidth* pWidth, Type** pTy)
+ {
+ uint32_t vecWidth;
+ Type* pVecTy = pCallInst->getType();
+ if (!pVecTy->isVectorTy())
+ {
+ for (auto& op : pCallInst->arg_operands())
+ {
+ if (op.get()->getType()->isVectorTy())
+ {
+ pVecTy = op.get()->getType();
+ break;
+ }
+ }
+ }
+ SWR_ASSERT(pVecTy->isVectorTy(), "Couldn't determine vector size");
+
+ uint32_t width = cast<VectorType>(pVecTy)->getBitWidth();
+ switch (width)
+ {
+ case 256: *pWidth = W256; break;
+ case 512: *pWidth = W512; break;
+ default: SWR_ASSERT(false, "Unhandled vector width %d", width);
+ *pWidth = W256;
+ }
+
+ *pTy = pVecTy->getScalarType();
+ }
+
+ Value* GetZeroVec(TargetWidth width, Type* pTy)
+ {
+ uint32_t numElem = 0;
+ switch (width)
+ {
+ case W256: numElem = 8; break;
+ case W512: numElem = 16; break;
+ }
+
+ return ConstantVector::getNullValue(VectorType::get(pTy, numElem));
+ }
+
+ Value* GetMask(TargetWidth width)
+ {
+ Value* mask;
+ switch (width)
+ {
+ case W256: mask = B->C((uint8_t)-1); break;
+ case W512: mask = B->C((uint16_t)-1); break;
+ }
+ return mask;
+ }
+
+ Instruction* ProcessIntrinsicAdvanced(CallInst* pCallInst)
+ {
+ Function* pFunc = pCallInst->getCalledFunction();
+ auto& intrinsic = intrinsicMap2[mTarget][pFunc->getName()];
+ TargetWidth vecWidth;
+ Type* pElemTy;
+ GetRequestedWidthAndType(pCallInst, &vecWidth, &pElemTy);
+
+ // Check if there is a native intrinsic for this instruction
+ Intrinsic::ID id = intrinsic.intrin[vecWidth];
+ if (id != Intrinsic::not_intrinsic)
+ {
+ Function* pIntrin = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, id);
+ SmallVector<Value*, 8> args;
+ for (auto& arg : pCallInst->arg_operands())
+ {
+ args.push_back(arg.get());
+ }
+
+                // If AVX512, all instructions add a src operand and mask. We'll pass in a zero src and full mask for now,
+                // assuming the intrinsics are consistent and place the src operand and mask last in the argument list.
+ if (mTarget == AVX512)
+ {
+ args.push_back(GetZeroVec(vecWidth, pElemTy));
+ args.push_back(GetMask(vecWidth));
+ }
+
+ return B->CALLA(pIntrin, args);
+ }
+ else
+ {
+ // No native intrinsic, call emulation function
+ return intrinsic.emuFunc(this, mTarget, vecWidth, pCallInst);
+ }
+
+ SWR_ASSERT(false);
+ return nullptr;
+ }
+
+ Instruction* ProcessIntrinsic(CallInst* pCallInst)
+ {
+ Function* pFunc = pCallInst->getCalledFunction();
+
+ // Forward to the advanced support if found
+ if (intrinsicMap2[mTarget].find(pFunc->getName()) != intrinsicMap2[mTarget].end())
+ {
+ return ProcessIntrinsicAdvanced(pCallInst);
+ }
+
+            SWR_ASSERT(intrinsicMap.find(pFunc->getName()) != intrinsicMap.end(), "Unimplemented intrinsic %s.", pFunc->getName().data());
+
+ Intrinsic::ID x86Intrinsic = intrinsicMap[pFunc->getName()];
+ Function* pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, x86Intrinsic);
+
+ SmallVector<Value*, 8> args;
+ for (auto& arg : pCallInst->arg_operands())
+ {
+ args.push_back(arg.get());
+ }
+ return B->CALLA(pX86IntrinFunc, args);
+ }
+
+ //////////////////////////////////////////////////////////////////////////
+        /// @brief LLVM function pass run method.
+        /// @param F - The function we're working on with this pass.
+ virtual bool runOnFunction(Function& F)
+ {
+ std::vector<Instruction*> toRemove;
+
+ for (auto& BB : F.getBasicBlockList())
+ {
+ for (auto& I : BB.getInstList())
+ {
+ if (CallInst* pCallInst = dyn_cast<CallInst>(&I))
+ {
+ Function* pFunc = pCallInst->getCalledFunction();
+ if (pFunc)
+ {
+ if (pFunc->getName().startswith("meta.intrinsic"))
+ {
+ B->IRB()->SetInsertPoint(&I);
+ Instruction* pReplace = ProcessIntrinsic(pCallInst);
+ SWR_ASSERT(pReplace);
+ toRemove.push_back(pCallInst);
+ pCallInst->replaceAllUsesWith(pReplace);
+ }
+ }
+
+ }
+ }
+ }
+
+ for (auto* pInst : toRemove)
+ {
+ pInst->eraseFromParent();
+ }
+
+ JitManager::DumpToFile(&F, "lowerx86");
+
+ return true;
+ }
+
+ virtual void getAnalysisUsage(AnalysisUsage& AU) const
+ {
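+            // Intentionally empty: no analyses are required, and by default LLVM
+            // assumes the pass preserves none.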
+ }
+
+ JitManager* JM() { return mpJitMgr; }
+
+ JitManager* mpJitMgr;
+ Builder* B;
+
+ TargetArch mTarget;
+
+ static char ID; ///< Needed by LLVM to generate ID for FunctionPass.
+ };
+
+ char LowerX86::ID = 0; // LLVM uses address of ID as the actual ID.
+
+ FunctionPass* createLowerX86Pass(JitManager* pJitMgr, Builder* b)
+ {
+ return new LowerX86(pJitMgr, b);
+ }
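+    // Illustrative usage sketch (placeholder names; the real scheduling lives in the JIT's pass setup):
+    //     llvm::legacy::FunctionPassManager fpm(pModule);
+    //     fpm.add(createLowerX86Pass(pJitMgr, pBuilder));
+    //     fpm.run(func);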
+
+ Instruction* NO_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
+ {
+ SWR_ASSERT(false, "Unimplemented intrinsic emulation.");
+ return nullptr;
+ }
+
+ Instruction* VPERM_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
+ {
+ // Only need vperm emulation for AVX
+ SWR_ASSERT(arch == AVX);
+
+ Builder* B = pThis->B;
+ auto v32A = pCallInst->getArgOperand(0);
+ auto vi32Index = pCallInst->getArgOperand(1);
+
+ Value* v32Result;
+ if (isa<Constant>(vi32Index))
+ {
+            // Can use LLVM's shufflevector directly with constant shuffle indices
+ v32Result = B->VSHUFFLE(v32A, v32A, vi32Index);
+ }
+ else
+ {
+ v32Result = UndefValue::get(v32A->getType());
+ for (uint32_t l = 0; l < v32A->getType()->getVectorNumElements(); ++l)
+ {
+ auto i32Index = B->VEXTRACT(vi32Index, B->C(l));
+ auto val = B->VEXTRACT(v32A, i32Index);
+ v32Result = B->VINSERT(v32Result, val, B->C(l));
+ }
+ }
+ return cast<Instruction>(v32Result);
+ }
+
+ Instruction* VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
+ {
+ Builder* B = pThis->B;
+ auto vSrc = pCallInst->getArgOperand(0);
+ auto pBase = pCallInst->getArgOperand(1);
+ auto vi32Indices = pCallInst->getArgOperand(2);
+ auto vi1Mask = pCallInst->getArgOperand(3);
+ auto i8Scale = pCallInst->getArgOperand(4);
+
+ pBase = B->INT_TO_PTR(pBase, PointerType::get(B->mInt8Ty, 0));
+ uint32_t numElem = vSrc->getType()->getVectorNumElements();
+ auto i32Scale = B->Z_EXT(i8Scale, B->mInt32Ty);
+ auto srcTy = vSrc->getType()->getVectorElementType();
+ Value* v32Gather;
+ if (arch == AVX)
+ {
+ // Full emulation for AVX
+ // Store source on stack to provide a valid address to load from inactive lanes
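+            // Inactive lanes are pointed at their own element of the stack copy, so they
+            // reload the pass-through value from vSrc instead of dereferencing an unknown address.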
+ auto pStack = B->STACKSAVE();
+ auto pTmp = B->ALLOCA(vSrc->getType());
+ B->STORE(vSrc, pTmp);
+
+ v32Gather = UndefValue::get(vSrc->getType());
+ auto vi32Scale = ConstantVector::getSplat(numElem, cast<ConstantInt>(i32Scale));
+ auto vi32Offsets = B->MUL(vi32Indices, vi32Scale);
+
+ for (uint32_t i = 0; i < numElem; ++i)
+ {
+ auto i32Offset = B->VEXTRACT(vi32Offsets, B->C(i));
+ auto pLoadAddress = B->GEP(pBase, i32Offset);
+ pLoadAddress = B->BITCAST(pLoadAddress, PointerType::get(srcTy, 0));
+ auto pMaskedLoadAddress = B->GEP(pTmp, { 0, i });
+ auto i1Mask = B->VEXTRACT(vi1Mask, B->C(i));
+ auto pValidAddress = B->SELECT(i1Mask, pLoadAddress, pMaskedLoadAddress);
+ auto val = B->LOAD(pValidAddress);
+ v32Gather = B->VINSERT(v32Gather, val, B->C(i));
+ }
+
+ B->STACKRESTORE(pStack);
+ }
+ else if (arch == AVX2 || (arch == AVX512 && width == W256))
+ {
+ Function* pX86IntrinFunc = srcTy == B->mFP32Ty ? Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx2_gather_d_ps_256) :
+ Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx2_gather_d_d_256);
+ if (width == W256)
+ {
+ auto v32Mask = B->BITCAST(B->VMASK(vi1Mask), vSrc->getType());
+ v32Gather = B->CALL(pX86IntrinFunc, { vSrc, pBase, vi32Indices, v32Mask, i8Scale });
+ }
+ else if (width == W512)
+ {
+ // Double pump 8-wide
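+                // Split the 16-wide src, indices, and mask into two 8-wide halves, gather each
+                // half with the AVX2 intrinsic, then join the halves back together.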
+ auto v32Mask = B->BITCAST(B->VMASK_16(vi1Mask), vSrc->getType());
+ Value *src0 = B->EXTRACT_16(vSrc, 0);
+ Value *src1 = B->EXTRACT_16(vSrc, 1);
+
+ Value *indices0 = B->EXTRACT_16(vi32Indices, 0);
+ Value *indices1 = B->EXTRACT_16(vi32Indices, 1);
+
+ Value *mask0 = B->EXTRACT_16(v32Mask, 0);
+ Value *mask1 = B->EXTRACT_16(v32Mask, 1);
+
+ Value *gather0 = B->CALL(pX86IntrinFunc, { src0, pBase, indices0, mask0, i8Scale });
+ Value *gather1 = B->CALL(pX86IntrinFunc, { src1, pBase, indices1, mask1, i8Scale });
+
+ v32Gather = B->JOIN_16(gather0, gather1);
+ }
+ }
+ else if (arch == AVX512)
+ {
+ auto i16Mask = B->BITCAST(vi1Mask, B->mInt16Ty);
+
+ Function* pX86IntrinFunc = srcTy == B->mFP32Ty ? Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx512_gather_dps_512) :
+ Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx512_gather_dpi_512);
+ auto i32Scale = B->Z_EXT(i8Scale, B->mInt32Ty);
+ v32Gather = B->CALL(pX86IntrinFunc, { vSrc, pBase, vi32Indices, i16Mask, i32Scale });
+ }
+
+ return cast<Instruction>(v32Gather);
+ }
+}
+
+using namespace SwrJit;
+
+INITIALIZE_PASS_BEGIN(LowerX86, "LowerX86", "LowerX86", false, false)
+INITIALIZE_PASS_END(LowerX86, "LowerX86", "LowerX86", false, false)
+