1 /****************************************************************************
2 * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief llvm pass to lower meta code to x86
29 ******************************************************************************/
31 #include "jit_pch.hpp"
33 #include "JitManager.h"
35 #include <unordered_map>
40 // Forward declare the pass initializer; the LowerX86 constructor calls it with the
// global PassRegistry.
41 void initializeLowerX86Pass(PassRegistry
&);
// Signature of an intrinsic emulation callback (see NO_EMU, VPERM_EMU, VGATHER_EMU, ...):
// it receives the pass instance, the target architecture, the requested vector width and
// the meta intrinsic call being lowered, and returns an Instruction*.
64 typedef std::function
<Instruction
*(LowerX86
*, TargetArch
, TargetWidth
, CallInst
*)> EmuFunc
;
// Native intrinsic ID for each vector width; ProcessIntrinsicAdvanced indexes this array
// with the TargetWidth it derived from the call (intrin[vecWidth]).
68 Intrinsic::ID intrin
[NUM_WIDTHS
];
72 // Map of intrinsics that haven't been moved to the new mechanism yet. If used, these get the previous behavior of
73 // mapping directly to avx/avx2 intrinsics.
// Keyed by the meta intrinsic function name. ProcessIntrinsic consults this map only when the
// name is not present in intrinsicMap2 for the current target, and forwards the call's
// arguments unchanged to the mapped x86 intrinsic.
74 static std::map
<std::string
, Intrinsic::ID
> intrinsicMap
= {
75 {"meta.intrinsic.BEXTR_32", Intrinsic::x86_bmi_bextr_32
},
76 {"meta.intrinsic.VPSHUFB", Intrinsic::x86_avx2_pshuf_b
},
77 {"meta.intrinsic.VCVTPS2PH", Intrinsic::x86_vcvtps2ph_256
},
78 {"meta.intrinsic.VPTESTC", Intrinsic::x86_avx_ptestc_256
},
79 {"meta.intrinsic.VPTESTZ", Intrinsic::x86_avx_ptestz_256
},
80 {"meta.intrinsic.VFMADDPS", Intrinsic::x86_fma_vfmadd_ps_256
},
81 {"meta.intrinsic.VPHADDD", Intrinsic::x86_avx2_phadd_d
},
82 {"meta.intrinsic.PDEP32", Intrinsic::x86_bmi_pdep_32
},
83 {"meta.intrinsic.RDTSC", Intrinsic::x86_rdtsc
},
// Forward declarations of the emulation callbacks referenced by the intrinsicMap2 tables.
// The first five share the EmuFunc signature.
87 Instruction
* NO_EMU(LowerX86
* pThis
, TargetArch arch
, TargetWidth width
, CallInst
* pCallInst
);
88 Instruction
* VPERM_EMU(LowerX86
* pThis
, TargetArch arch
, TargetWidth width
, CallInst
* pCallInst
);
89 Instruction
* VGATHER_EMU(LowerX86
* pThis
, TargetArch arch
, TargetWidth width
, CallInst
* pCallInst
);
90 Instruction
* VROUND_EMU(LowerX86
* pThis
, TargetArch arch
, TargetWidth width
, CallInst
* pCallInst
);
91 Instruction
* VHSUB_EMU(LowerX86
* pThis
, TargetArch arch
, TargetWidth width
, CallInst
* pCallInst
);
// DOUBLE_EMU takes the intrinsic to call as an extra argument, so it does not match EmuFunc;
// ProcessIntrinsicAdvanced calls it directly rather than through a table entry.
93 Instruction
* DOUBLE_EMU(LowerX86
* pThis
, TargetArch arch
, TargetWidth width
, CallInst
* pCallInst
, Intrinsic::ID intrin
);
// Sentinel intrinsic ID ((Intrinsic::ID)-1) stored in the second slot of some intrinsicMap2
// entries in place of a real intrinsic.
// NOTE(review): presumably this is what selects the double-pump path in
// ProcessIntrinsicAdvanced — confirm against the condition guarding that path.
95 static Intrinsic::ID DOUBLE
= (Intrinsic::ID
)-1;
// Per-target lookup tables for the intrinsics handled by ProcessIntrinsicAdvanced, used as
// intrinsicMap2[mTarget][name]. Each entry lists the native intrinsic per vector width
// (the AVX512 rows show the order: 256-wide first, then 512-wide) followed by the emulation
// callback that ProcessIntrinsicAdvanced invokes when the selected slot is not_intrinsic.
97 static std::map
<std::string
, X86Intrinsic
> intrinsicMap2
[] = {
// NOTE(review): first table — only AVX-named 256-bit intrinsics appear here; presumably the
// table for the AVX target. Confirm against the TargetArch enum ordering.
100 {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx_rcp_ps_256
, DOUBLE
}, NO_EMU
}},
101 {"meta.intrinsic.VPERMPS", {{Intrinsic::not_intrinsic
, Intrinsic::not_intrinsic
}, VPERM_EMU
}},
102 {"meta.intrinsic.VPERMD", {{Intrinsic::not_intrinsic
, Intrinsic::not_intrinsic
}, VPERM_EMU
}},
103 {"meta.intrinsic.VGATHERPD", {{Intrinsic::not_intrinsic
, Intrinsic::not_intrinsic
}, VGATHER_EMU
}},
104 {"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic
, Intrinsic::not_intrinsic
}, VGATHER_EMU
}},
105 {"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic
, Intrinsic::not_intrinsic
}, VGATHER_EMU
}},
106 {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx_cvt_pd2_ps_256
, Intrinsic::not_intrinsic
}, NO_EMU
}},
107 {"meta.intrinsic.VCVTPH2PS", {{Intrinsic::x86_vcvtph2ps_256
, Intrinsic::not_intrinsic
}, NO_EMU
}},
108 {"meta.intrinsic.VROUND", {{Intrinsic::x86_avx_round_ps_256
, DOUBLE
}, NO_EMU
}},
109 {"meta.intrinsic.VHSUBPS", {{Intrinsic::x86_avx_hsub_ps_256
, DOUBLE
}, NO_EMU
}},
// NOTE(review): second table — same as above but with the avx2 permps/permd intrinsics in the
// 256-wide slot; presumably the table for the AVX2 target.
112 {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx_rcp_ps_256
, DOUBLE
}, NO_EMU
}},
113 {"meta.intrinsic.VPERMPS", {{Intrinsic::x86_avx2_permps
, Intrinsic::not_intrinsic
}, VPERM_EMU
}},
114 {"meta.intrinsic.VPERMD", {{Intrinsic::x86_avx2_permd
, Intrinsic::not_intrinsic
}, VPERM_EMU
}},
115 {"meta.intrinsic.VGATHERPD", {{Intrinsic::not_intrinsic
, Intrinsic::not_intrinsic
}, VGATHER_EMU
}},
116 {"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic
, Intrinsic::not_intrinsic
}, VGATHER_EMU
}},
117 {"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic
, Intrinsic::not_intrinsic
}, VGATHER_EMU
}},
118 {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx_cvt_pd2_ps_256
, DOUBLE
}, NO_EMU
}},
119 {"meta.intrinsic.VCVTPH2PS", {{Intrinsic::x86_vcvtph2ps_256
, Intrinsic::not_intrinsic
}, NO_EMU
}},
120 {"meta.intrinsic.VROUND", {{Intrinsic::x86_avx_round_ps_256
, DOUBLE
}, NO_EMU
}},
121 {"meta.intrinsic.VHSUBPS", {{Intrinsic::x86_avx_hsub_ps_256
, DOUBLE
}, NO_EMU
}},
// NOTE(review): third table — avx512 intrinsics with both 256- and 512-wide forms; VROUND and
// VHSUBPS have no native entry and go through their emulation callbacks. Presumably the table
// for the AVX512 target.
124 {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx512_rcp14_ps_256
, Intrinsic::x86_avx512_rcp14_ps_512
}, NO_EMU
}},
125 {"meta.intrinsic.VPERMPS", {{Intrinsic::x86_avx512_mask_permvar_sf_256
, Intrinsic::x86_avx512_mask_permvar_sf_512
}, NO_EMU
}},
126 {"meta.intrinsic.VPERMD", {{Intrinsic::x86_avx512_mask_permvar_si_256
, Intrinsic::x86_avx512_mask_permvar_si_512
}, NO_EMU
}},
127 {"meta.intrinsic.VGATHERPD", {{Intrinsic::not_intrinsic
, Intrinsic::not_intrinsic
}, VGATHER_EMU
}},
128 {"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic
, Intrinsic::not_intrinsic
}, VGATHER_EMU
}},
129 {"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic
, Intrinsic::not_intrinsic
}, VGATHER_EMU
}},
130 {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx512_mask_cvtpd2ps_256
, Intrinsic::x86_avx512_mask_cvtpd2ps_512
}, NO_EMU
}},
131 {"meta.intrinsic.VCVTPH2PS", {{Intrinsic::x86_avx512_mask_vcvtph2ps_256
, Intrinsic::x86_avx512_mask_vcvtph2ps_512
}, NO_EMU
}},
132 {"meta.intrinsic.VROUND", {{Intrinsic::not_intrinsic
, Intrinsic::not_intrinsic
}, VROUND_EMU
}},
133 {"meta.intrinsic.VHSUBPS", {{Intrinsic::not_intrinsic
, Intrinsic::not_intrinsic
}, VHSUB_EMU
}},
// Function pass that rewrites calls to functions named "meta.intrinsic*" into x86 intrinsic
// calls or emulation sequences (see runOnFunction).
137 struct LowerX86
: public FunctionPass
// Stores the JIT manager and builder, registers the pass with the global PassRegistry and
// then probes mpJitMgr->mArch for AVX512F, AVX2 and AVX, in that order, asserting if none
// of them is reported.
// NOTE(review): pJitMgr defaults to nullptr but mpJitMgr->mArch is read below — confirm that
// default-constructed instances never reach the architecture checks.
139 LowerX86(JitManager
* pJitMgr
= nullptr, Builder
* b
= nullptr)
140 : FunctionPass(ID
), mpJitMgr(pJitMgr
), B(b
)
142 initializeLowerX86Pass(*PassRegistry::getPassRegistry());
144 // Determine target arch
145 if (mpJitMgr
->mArch
.AVX512F())
149 else if (mpJitMgr
->mArch
.AVX2())
153 else if (mpJitMgr
->mArch
.AVX())
160 SWR_ASSERT(false, "Unsupported AVX architecture.");
165 // Try to decipher the vector type of the instruction. This does not work properly
166 // across all intrinsics, and will have to be rethought. Probably need something
167 // similar to llvm's getDeclaration() utility to map a set of inputs to a specific typed
// intrinsic.
//
// @param pCallInst  - call to the meta intrinsic being lowered.
// @param intrinName - name of the called meta intrinsic.
// @param pWidth     - out: W256 or W512, chosen from the vector type's bit width.
// @param pTy        - out: scalar element type of the vector type that was found.
169 void GetRequestedWidthAndType(CallInst
* pCallInst
, const StringRef intrinName
, TargetWidth
* pWidth
, Type
** pTy
)
// Start from the call's return type.
172 Type
* pVecTy
= pCallInst
->getType();
174 // Check for intrinsic specific types
175 // VCVTPD2PS type comes from src, not dst
176 if (intrinName
.equals("meta.intrinsic.VCVTPD2PS"))
178 pVecTy
= pCallInst
->getOperand(0)->getType();
// Return type is not a vector: look for a vector-typed argument instead.
181 if (!pVecTy
->isVectorTy())
183 for (auto& op
: pCallInst
->arg_operands())
185 if (op
.get()->getType()->isVectorTy())
187 pVecTy
= op
.get()->getType();
192 SWR_ASSERT(pVecTy
->isVectorTy(), "Couldn't determine vector size");
// Only 256-bit and 512-bit vectors map to a TargetWidth; anything else asserts.
194 uint32_t width
= cast
<VectorType
>(pVecTy
)->getBitWidth();
197 case 256: *pWidth
= W256
; break;
198 case 512: *pWidth
= W512
; break;
199 default: SWR_ASSERT(false, "Unhandled vector width %d", width
);
203 *pTy
= pVecTy
->getScalarType();
// Builds an all-zero constant vector of element type pTy with 8 elements for W256 and
// 16 elements for W512. ProcessIntrinsicAdvanced passes it as the extra src operand of
// AVX512 intrinsics.
// NOTE(review): the element count depends only on width, not on the size of pTy — confirm
// this is intended for 64-bit element types.
206 Value
* GetZeroVec(TargetWidth width
, Type
* pTy
)
208 uint32_t numElem
= 0;
211 case W256
: numElem
= 8; break;
212 case W512
: numElem
= 16; break;
215 return ConstantVector::getNullValue(VectorType::get(pTy
, numElem
));
// Selects the "all lanes enabled" mask constant appended to AVX512 intrinsic calls:
// an all-ones 8-bit constant for W256 and an all-ones 16-bit constant for W512.
218 Value
* GetMask(TargetWidth width
)
223 case W256
: mask
= B
->C((uint8_t)-1); break;
224 case W512
: mask
= B
->C((uint16_t)-1); break;
229 // Convert <N x i1> mask to <N x i32> x86 mask
// Each lane is sign-extended to 32 bits; the element count is taken from the input vector.
230 Value
* VectorMask(Value
* vi1Mask
)
232 uint32_t numElem
= vi1Mask
->getType()->getVectorNumElements();
233 return B
->S_EXT(vi1Mask
, VectorType::get(B
->mInt32Ty
, numElem
));
// Lowers a meta intrinsic call that has an entry in intrinsicMap2 for the current target.
// Depending on the table slot selected by the call's vector width this either double pumps
// a narrower intrinsic, calls the native intrinsic, or falls back to the entry's emulation
// callback. Returns the instruction produced by the chosen path.
236 Instruction
* ProcessIntrinsicAdvanced(CallInst
* pCallInst
)
238 Function
* pFunc
= pCallInst
->getCalledFunction();
// std::map::operator[] would insert a default entry for an unknown name; ProcessIntrinsic
// checks that the name exists in this table before calling here.
239 auto& intrinsic
= intrinsicMap2
[mTarget
][pFunc
->getName()];
240 TargetWidth vecWidth
;
242 GetRequestedWidthAndType(pCallInst
, pFunc
->getName(), &vecWidth
, &pElemTy
);
244 // Check if there is a native intrinsic for this instruction
245 Intrinsic::ID id
= intrinsic
.intrin
[vecWidth
];
// NOTE(review): presumably this path is taken when id is the DOUBLE sentinel — confirm.
// It uses the table slot one width below the requested one.
248 // Double pump the next smaller SIMD intrinsic
249 SWR_ASSERT(vecWidth
!= 0, "Cannot double pump smallest SIMD width.");
250 Intrinsic::ID id2
= intrinsic
.intrin
[vecWidth
- 1];
251 SWR_ASSERT(id2
!= Intrinsic::not_intrinsic
, "Cannot find intrinsic to double pump.");
252 return DOUBLE_EMU(this, mTarget
, vecWidth
, pCallInst
, id2
);
// Native intrinsic available: forward the original arguments to it.
254 else if (id
!= Intrinsic::not_intrinsic
)
256 Function
* pIntrin
= Intrinsic::getDeclaration(B
->JM()->mpCurrentModule
, id
);
257 SmallVector
<Value
*, 8> args
;
258 for (auto& arg
: pCallInst
->arg_operands())
260 args
.push_back(arg
.get());
263 // If AVX512, all instructions add a src operand and mask. We'll pass in 0 src and full mask for now
264 // Assuming the intrinsics are consistent and place the src operand and mask last in the argument list.
265 if (mTarget
== AVX512
)
267 args
.push_back(GetZeroVec(vecWidth
, pElemTy
));
268 args
.push_back(GetMask(vecWidth
));
271 return B
->CALLA(pIntrin
, args
);
275 // No native intrinsic, call emulation function
276 return intrinsic
.emuFunc(this, mTarget
, vecWidth
, pCallInst
);
// Lowers one meta intrinsic call. Names found in intrinsicMap2 for the current target are
// handled by ProcessIntrinsicAdvanced; every other name must be present in intrinsicMap and
// is replaced by a call to the mapped x86 intrinsic with the same arguments.
283 Instruction
* ProcessIntrinsic(CallInst
* pCallInst
)
285 Function
* pFunc
= pCallInst
->getCalledFunction();
287 // Forward to the advanced support if found
288 if (intrinsicMap2
[mTarget
].find(pFunc
->getName()) != intrinsicMap2
[mTarget
].end())
290 return ProcessIntrinsicAdvanced(pCallInst
);
// NOTE(review): pFunc->getName() is passed to a "%s" format below — confirm SWR_ASSERT
// accepts that type for its format arguments.
293 SWR_ASSERT(intrinsicMap
.find(pFunc
->getName()) != intrinsicMap
.end(), "Unimplemented intrinsic %s.", pFunc
->getName());
295 Intrinsic::ID x86Intrinsic
= intrinsicMap
[pFunc
->getName()];
296 Function
* pX86IntrinFunc
= Intrinsic::getDeclaration(B
->JM()->mpCurrentModule
, x86Intrinsic
);
// Forward the original call arguments unchanged.
298 SmallVector
<Value
*, 8> args
;
299 for (auto& arg
: pCallInst
->arg_operands())
301 args
.push_back(arg
.get());
303 return B
->CALLA(pX86IntrinFunc
, args
);
306 //////////////////////////////////////////////////////////////////////////
307 /// @brief LLVM function pass run method. Walks every instruction of every basic block,
/// lowers each call to a function whose name starts with "meta.intrinsic" via
/// ProcessIntrinsic, redirects all uses of the call to the replacement, and erases the
/// original calls after the walk. The function is then dumped through JitManager::DumpToFile.
308 /// @param f- The function we're working on with this pass.
309 virtual bool runOnFunction(Function
& F
)
// Calls are collected here and erased after the traversal instead of during it.
311 std::vector
<Instruction
*> toRemove
;
313 for (auto& BB
: F
.getBasicBlockList())
315 for (auto& I
: BB
.getInstList())
317 if (CallInst
* pCallInst
= dyn_cast
<CallInst
>(&I
))
319 Function
* pFunc
= pCallInst
->getCalledFunction();
322 if (pFunc
->getName().startswith("meta.intrinsic"))
// New instructions are inserted at the position of the call being replaced.
324 B
->IRB()->SetInsertPoint(&I
);
325 Instruction
* pReplace
= ProcessIntrinsic(pCallInst
);
326 SWR_ASSERT(pReplace
);
327 toRemove
.push_back(pCallInst
);
328 pCallInst
->replaceAllUsesWith(pReplace
);
336 for (auto* pInst
: toRemove
)
338 pInst
->eraseFromParent();
341 JitManager::DumpToFile(&F
, "lowerx86");
// Pass hook that receives the AnalysisUsage object for this pass.
// NOTE(review): presumably overrides llvm::Pass::getAnalysisUsage — confirm.
346 virtual void getAnalysisUsage(AnalysisUsage
& AU
) const
// Accessor for the JIT manager pointer supplied to the constructor.
350 JitManager
* JM() { return mpJitMgr
; }
// JIT manager pointer as passed to the constructor; may be nullptr by default.
352 JitManager
* mpJitMgr
;
357 static char ID
; ///< Needed by LLVM to generate ID for FunctionPass.
// Out-of-class definition of the pass ID declared in LowerX86.
360 char LowerX86::ID
= 0; // LLVM uses address of ID as the actual ID.
// Factory for the pass: heap-allocates a LowerX86 bound to the given JIT manager and builder.
// NOTE(review): the result comes from a plain `new`; presumably the caller (e.g. a pass
// manager) takes ownership — confirm at the call site.
362 FunctionPass
* createLowerX86Pass(JitManager
* pJitMgr
, Builder
* b
)
364 return new LowerX86(pJitMgr
, b
);
// Emulation callback for table entries that are not expected to need emulation: it
// unconditionally asserts with "Unimplemented intrinsic emulation.".
367 Instruction
* NO_EMU(LowerX86
* pThis
, TargetArch arch
, TargetWidth width
, CallInst
* pCallInst
)
369 SWR_ASSERT(false, "Unimplemented intrinsic emulation.");
// Emulates a variable permute for the AVX target (asserted below).
// Operand 0 is the source vector and operand 1 the vector of lane indices. A constant index
// vector becomes a single shufflevector; otherwise the result is built one lane at a time
// with extract/insert element operations. Returns the final value cast to Instruction.
373 Instruction
* VPERM_EMU(LowerX86
* pThis
, TargetArch arch
, TargetWidth width
, CallInst
* pCallInst
)
375 // Only need vperm emulation for AVX
376 SWR_ASSERT(arch
== AVX
);
378 Builder
* B
= pThis
->B
;
379 auto v32A
= pCallInst
->getArgOperand(0);
380 auto vi32Index
= pCallInst
->getArgOperand(1);
383 if (isa
<Constant
>(vi32Index
))
385 // Can use llvm shuffle vector directly with constant shuffle indices
386 v32Result
= B
->VSHUFFLE(v32A
, v32A
, vi32Index
);
// Non-constant indices: result[l] = v32A[vi32Index[l]] for every lane l.
390 v32Result
= UndefValue::get(v32A
->getType());
391 for (uint32_t l
= 0; l
< v32A
->getType()->getVectorNumElements(); ++l
)
393 auto i32Index
= B
->VEXTRACT(vi32Index
, B
->C(l
));
394 auto val
= B
->VEXTRACT(v32A
, i32Index
);
395 v32Result
= B
->VINSERT(v32Result
, val
, B
->C(l
));
398 return cast
<Instruction
>(v32Result
);
// Lowers the meta gather intrinsics (VGATHERPD / VGATHERPS / VGATHERDD all map here).
// Call operands: 0 = source vector (values for inactive lanes), 1 = base address,
// 2 = i32 index vector, 3 = i1 lane mask, 4 = i8 scale.
// Three strategies are visible below: a per-lane load sequence for AVX, the AVX2 256-bit
// gather intrinsics (called twice for 512-bit vectors), and the AVX512 512-bit gather
// intrinsics. Returns the gathered value cast to Instruction.
401 Instruction
* VGATHER_EMU(LowerX86
* pThis
, TargetArch arch
, TargetWidth width
, CallInst
* pCallInst
)
403 Builder
* B
= pThis
->B
;
404 auto vSrc
= pCallInst
->getArgOperand(0);
405 auto pBase
= pCallInst
->getArgOperand(1);
406 auto vi32Indices
= pCallInst
->getArgOperand(2);
407 auto vi1Mask
= pCallInst
->getArgOperand(3);
408 auto i8Scale
= pCallInst
->getArgOperand(4);
// The base arrives as an integer and is converted to an i8 pointer so offsets are in bytes.
410 pBase
= B
->INT_TO_PTR(pBase
, PointerType::get(B
->mInt8Ty
, 0));
411 uint32_t numElem
= vSrc
->getType()->getVectorNumElements();
412 auto i32Scale
= B
->Z_EXT(i8Scale
, B
->mInt32Ty
);
413 auto srcTy
= vSrc
->getType()->getVectorElementType();
417 // Full emulation for AVX
418 // Store source on stack to provide a valid address to load from inactive lanes
419 auto pStack
= B
->STACKSAVE();
420 auto pTmp
= B
->ALLOCA(vSrc
->getType());
421 B
->STORE(vSrc
, pTmp
);
423 v32Gather
= UndefValue::get(vSrc
->getType());
// cast<ConstantInt> requires the zero-extended scale to be a compile-time constant.
424 auto vi32Scale
= ConstantVector::getSplat(numElem
, cast
<ConstantInt
>(i32Scale
));
425 auto vi32Offsets
= B
->MUL(vi32Indices
, vi32Scale
);
// Per lane: pick the real address when the mask bit is set, otherwise the matching slot of
// the stack copy of vSrc, then load and insert the value.
427 for (uint32_t i
= 0; i
< numElem
; ++i
)
429 auto i32Offset
= B
->VEXTRACT(vi32Offsets
, B
->C(i
));
430 auto pLoadAddress
= B
->GEP(pBase
, i32Offset
);
431 pLoadAddress
= B
->BITCAST(pLoadAddress
, PointerType::get(srcTy
, 0));
432 auto pMaskedLoadAddress
= B
->GEP(pTmp
, { 0, i
});
433 auto i1Mask
= B
->VEXTRACT(vi1Mask
, B
->C(i
));
434 auto pValidAddress
= B
->SELECT(i1Mask
, pLoadAddress
, pMaskedLoadAddress
);
435 auto val
= B
->LOAD(pValidAddress
);
436 v32Gather
= B
->VINSERT(v32Gather
, val
, B
->C(i
));
439 B
->STACKRESTORE(pStack
);
// AVX2, or AVX512 asked for 256-bit vectors: use the AVX2 256-bit gather intrinsic that
// matches the element type (float, i32 or double).
441 else if (arch
== AVX2
|| (arch
== AVX512
&& width
== W256
))
443 Function
* pX86IntrinFunc
;
444 if (srcTy
== B
->mFP32Ty
)
446 pX86IntrinFunc
= Intrinsic::getDeclaration(B
->JM()->mpCurrentModule
, Intrinsic::x86_avx2_gather_d_ps_256
);
448 else if (srcTy
== B
->mInt32Ty
)
450 pX86IntrinFunc
= Intrinsic::getDeclaration(B
->JM()->mpCurrentModule
, Intrinsic::x86_avx2_gather_d_d_256
);
452 else if (srcTy
== B
->mDoubleTy
)
454 pX86IntrinFunc
= Intrinsic::getDeclaration(B
->JM()->mpCurrentModule
, Intrinsic::x86_avx2_gather_d_q_256
);
458 SWR_ASSERT(false, "Unsupported vector element type for gather.");
// Single call: the i1 mask is widened to i32 lanes and reinterpreted as the source vector type.
463 auto v32Mask
= B
->BITCAST(pThis
->VectorMask(vi1Mask
), vSrc
->getType());
464 v32Gather
= B
->CALL(pX86IntrinFunc
, { vSrc
, pBase
, vi32Indices
, v32Mask
, i8Scale
});
// 512-bit request: split source, indices and mask into two halves and gather each half
// with the 256-bit intrinsic.
466 else if (width
== W512
)
468 // Double pump 4-wide for 64bit elements
469 if (vSrc
->getType()->getVectorElementType() == B
->mDoubleTy
)
// Mask is widened i1 -> i32 -> i64 per lane, then reinterpreted as the source vector type.
471 auto v64Mask
= pThis
->VectorMask(vi1Mask
);
472 v64Mask
= B
->S_EXT(v64Mask
,
473 VectorType::get(B
->mInt64Ty
, v64Mask
->getType()->getVectorNumElements()));
474 v64Mask
= B
->BITCAST(v64Mask
, vSrc
->getType());
476 Value
* src0
= B
->VSHUFFLE(vSrc
, vSrc
, B
->C({ 0, 1, 2, 3 }));
477 Value
* src1
= B
->VSHUFFLE(vSrc
, vSrc
, B
->C({ 4, 5, 6, 7 }));
479 Value
* indices0
= B
->VSHUFFLE(vi32Indices
, vi32Indices
, B
->C({ 0, 1, 2, 3 }));
480 Value
* indices1
= B
->VSHUFFLE(vi32Indices
, vi32Indices
, B
->C({ 4, 5, 6, 7 }));
482 Value
* mask0
= B
->VSHUFFLE(v64Mask
, v64Mask
, B
->C({ 0, 1, 2, 3 }));
483 Value
* mask1
= B
->VSHUFFLE(v64Mask
, v64Mask
, B
->C({ 4, 5, 6, 7 }));
// Source and mask halves are passed to the intrinsic as i64 vectors.
485 src0
= B
->BITCAST(src0
, VectorType::get(B
->mInt64Ty
, src0
->getType()->getVectorNumElements()));
486 mask0
= B
->BITCAST(mask0
, VectorType::get(B
->mInt64Ty
, mask0
->getType()->getVectorNumElements()));
487 Value
* gather0
= B
->CALL(pX86IntrinFunc
, { src0
, pBase
, indices0
, mask0
, i8Scale
});
488 src1
= B
->BITCAST(src1
, VectorType::get(B
->mInt64Ty
, src1
->getType()->getVectorNumElements()));
489 mask1
= B
->BITCAST(mask1
, VectorType::get(B
->mInt64Ty
, mask1
->getType()->getVectorNumElements()));
490 Value
* gather1
= B
->CALL(pX86IntrinFunc
, { src1
, pBase
, indices1
, mask1
, i8Scale
});
// Concatenate the two 4-lane results and reinterpret as the original source type.
492 v32Gather
= B
->VSHUFFLE(gather0
, gather1
, B
->C({ 0, 1, 2, 3, 4, 5, 6, 7 }));
493 v32Gather
= B
->BITCAST(v32Gather
, vSrc
->getType());
497 // Double pump 8-wide for 32bit elements
498 auto v32Mask
= pThis
->VectorMask(vi1Mask
);
499 v32Mask
= B
->BITCAST(v32Mask
, vSrc
->getType());
500 Value
* src0
= B
->EXTRACT_16(vSrc
, 0);
501 Value
* src1
= B
->EXTRACT_16(vSrc
, 1);
503 Value
* indices0
= B
->EXTRACT_16(vi32Indices
, 0);
504 Value
* indices1
= B
->EXTRACT_16(vi32Indices
, 1);
506 Value
* mask0
= B
->EXTRACT_16(v32Mask
, 0);
507 Value
* mask1
= B
->EXTRACT_16(v32Mask
, 1);
509 Value
* gather0
= B
->CALL(pX86IntrinFunc
, { src0
, pBase
, indices0
, mask0
, i8Scale
});
510 Value
* gather1
= B
->CALL(pX86IntrinFunc
, { src1
, pBase
, indices1
, mask1
, i8Scale
});
512 v32Gather
= B
->JOIN_16(gather0
, gather1
);
// Remaining AVX512 case: 512-bit gather intrinsics. These take the lane mask as an integer
// (i16 for float/i32 elements, i8 for double) and the scale as i32.
516 else if (arch
== AVX512
)
519 Function
* pX86IntrinFunc
;
520 if (srcTy
== B
->mFP32Ty
)
522 pX86IntrinFunc
= Intrinsic::getDeclaration(B
->JM()->mpCurrentModule
, Intrinsic::x86_avx512_gather_dps_512
);
523 iMask
= B
->BITCAST(vi1Mask
, B
->mInt16Ty
);
525 else if (srcTy
== B
->mInt32Ty
)
527 pX86IntrinFunc
= Intrinsic::getDeclaration(B
->JM()->mpCurrentModule
, Intrinsic::x86_avx512_gather_dpi_512
);
528 iMask
= B
->BITCAST(vi1Mask
, B
->mInt16Ty
);
530 else if (srcTy
== B
->mDoubleTy
)
532 pX86IntrinFunc
= Intrinsic::getDeclaration(B
->JM()->mpCurrentModule
, Intrinsic::x86_avx512_gather_dpd_512
);
533 iMask
= B
->BITCAST(vi1Mask
, B
->mInt8Ty
);
537 SWR_ASSERT(false, "Unsupported vector element type for gather.");
540 auto i32Scale
= B
->Z_EXT(i8Scale
, B
->mInt32Ty
);
541 v32Gather
= B
->CALL(pX86IntrinFunc
, { vSrc
, pBase
, vi32Indices
, iMask
, i32Scale
});
544 return cast
<Instruction
>(v32Gather
);
547 // No support for vroundps in avx512 (it is available in kncni), so emulate with avx instructions
// Operand 0 is the float source vector and operand 1 the rounding-mode immediate. Both paths
// below call the 256-bit AVX round intrinsic: once directly, or once per half for W512 with
// the halves joined afterwards. Any other width asserts.
548 Instruction
* VROUND_EMU(LowerX86
* pThis
, TargetArch arch
, TargetWidth width
, CallInst
* pCallInst
)
550 SWR_ASSERT(arch
== AVX512
);
553 auto vf32Src
= pCallInst
->getOperand(0);
554 auto i8Round
= pCallInst
->getOperand(1);
555 auto pfnFunc
= Intrinsic::getDeclaration(B
->JM()->mpCurrentModule
, Intrinsic::x86_avx_round_ps_256
);
// NOTE(review): presumably the W256 case — the intrinsic is applied to the whole source.
559 return cast
<Instruction
>(B
->CALL2(pfnFunc
, vf32Src
, i8Round
));
561 else if (width
== W512
)
// Round the low and high halves separately with the same rounding mode, then rejoin.
563 auto v8f32SrcLo
= B
->EXTRACT_16(vf32Src
, 0);
564 auto v8f32SrcHi
= B
->EXTRACT_16(vf32Src
, 1);
566 auto v8f32ResLo
= B
->CALL2(pfnFunc
, v8f32SrcLo
, i8Round
);
567 auto v8f32ResHi
= B
->CALL2(pfnFunc
, v8f32SrcHi
, i8Round
);
569 return cast
<Instruction
>(B
->JOIN_16(v8f32ResLo
, v8f32ResHi
));
573 SWR_ASSERT(false, "Unimplemented vector width.");
579 // No support for hsub in AVX512
// Operands 0 and 1 are the two source vectors. One path calls the 256-bit AVX hsub
// intrinsic directly; the W512 path builds two shuffles of the sources (even-numbered and
// odd-numbered lane indices respectively) and subtracts them. Any other width asserts.
580 Instruction
* VHSUB_EMU(LowerX86
* pThis
, TargetArch arch
, TargetWidth width
, CallInst
* pCallInst
)
582 SWR_ASSERT(arch
== AVX512
);
585 auto src0
= pCallInst
->getOperand(0);
586 auto src1
= pCallInst
->getOperand(1);
588 // 256b hsub can just use avx intrinsic
591 auto pX86IntrinFunc
= Intrinsic::getDeclaration(B
->JM()->mpCurrentModule
, Intrinsic::x86_avx_hsub_ps_256
);
592 return cast
<Instruction
>(B
->CALL2(pX86IntrinFunc
, src0
, src1
));
594 else if (width
== W512
)
596 // 512b hsub can be accomplished with shuf/sub combo
// NOTE(review): both shuffle masks list 8 indices with values up to 15 — confirm the
// resulting vector width and lane selection are what a 512-bit operand needs, and that
// Builder::SUB is the intended operation for float lanes.
597 auto minuend
= B
->VSHUFFLE(src0
, src1
, B
->C({ 0, 2, 8, 10, 4, 6, 12, 14 }));
598 auto subtrahend
= B
->VSHUFFLE(src0
, src1
, B
->C({ 1, 3, 9, 11, 5, 7, 13, 15 }));
599 return cast
<Instruction
>(B
->SUB(minuend
, subtrahend
));
603 SWR_ASSERT(false, "Unimplemented vector width.");
608 // Double pump input using Intrin template arg. This blindly extracts lower and upper 256 from each vector argument and
609 // calls the 256 wide intrinsic, then merges the results to 512 wide
//
// @param pThis     - pass instance.
// @param arch      - target architecture.
// @param width     - requested width; must be W512 (asserted).
// @param pCallInst - original meta intrinsic call whose arguments are split.
// @param intrin    - narrower intrinsic that is called once per half.
610 Instruction
* DOUBLE_EMU(LowerX86
* pThis
, TargetArch arch
, TargetWidth width
, CallInst
* pCallInst
, Intrinsic::ID intrin
)
613 SWR_ASSERT(width
== W512
);
615 Function
* pX86IntrinFunc
= Intrinsic::getDeclaration(B
->JM()->mpCurrentModule
, intrin
);
// Pass i == 0 handles the lower half of every vector argument, i == 1 the upper half.
616 for (uint32_t i
= 0; i
< 2; ++i
)
618 SmallVector
<Value
*, 8> args
;
619 for (auto& arg
: pCallInst
->arg_operands())
621 auto argType
= arg
.get()->getType();
622 if (argType
->isVectorTy())
// Select vecWidth/2 consecutive lanes starting at i*vecWidth/2 via a shuffle against undef.
624 uint32_t vecWidth
= argType
->getVectorNumElements();
625 Value
*lanes
= B
->CInc
<int>(i
*vecWidth
/2, vecWidth
/2);
626 Value
*argToPush
= B
->VSHUFFLE(arg
.get(), B
->VUNDEF(argType
->getVectorElementType(), vecWidth
), lanes
);
627 args
.push_back(argToPush
);
// Non-vector arguments are passed through unchanged to both calls.
631 args
.push_back(arg
.get());
634 result
[i
] = B
->CALLA(pX86IntrinFunc
, args
);
// Merge: the combined lane count is the sum of both partial results' element counts.
637 if (result
[0]->getType()->isVectorTy())
639 assert(result
[1]->getType()->isVectorTy());
640 vecWidth
= result
[0]->getType()->getVectorNumElements() +
641 result
[1]->getType()->getVectorNumElements();
// Concatenate result[0] and result[1] with an identity shuffle over all vecWidth lanes.
647 Value
*lanes
= B
->CInc
<int>(0, vecWidth
);
648 return cast
<Instruction
>(B
->VSHUFFLE(result
[0], result
[1], lanes
));
// The registration macros below refer to LowerX86 unqualified, hence the using-directive.
653 using namespace SwrJit
;
// LLVM pass registration for LowerX86 under the name "LowerX86".
// NOTE(review): presumably these macros supply the definition of initializeLowerX86Pass
// that is forward declared at the top of the file — confirm against the LLVM version in use.
655 INITIALIZE_PASS_BEGIN(LowerX86
, "LowerX86", "LowerX86", false, false)
656 INITIALIZE_PASS_END(LowerX86
, "LowerX86", "LowerX86", false, false)