/****************************************************************************
 * Copyright (C) 2014-2018 Intel Corporation.  All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * @brief llvm pass to lower meta code to x86
 *
 ******************************************************************************/
#include "jit_pch.hpp"
#include "JitManager.h"

#include <unordered_map>
// forward declare the initializer
void initializeLowerX86Pass(PassRegistry&);
namespace SwrJit
{
    using namespace llvm;

    struct LowerX86;

    typedef std::function<Instruction*(LowerX86*, TargetArch, TargetWidth, CallInst*)> EmuFunc;

    struct X86Intrinsic
    {
        Intrinsic::ID intrin[NUM_WIDTHS];
        EmuFunc       emuFunc;
    };
    // Map of intrinsics that haven't been moved to the new mechanism yet. If used, these get the
    // previous behavior of mapping directly to avx/avx2 intrinsics.
    static std::map<std::string, Intrinsic::ID> intrinsicMap = {
        {"meta.intrinsic.BEXTR_32", Intrinsic::x86_bmi_bextr_32},
        {"meta.intrinsic.VPSHUFB", Intrinsic::x86_avx2_pshuf_b},
        {"meta.intrinsic.VCVTPS2PH", Intrinsic::x86_vcvtps2ph_256},
        {"meta.intrinsic.VPTESTC", Intrinsic::x86_avx_ptestc_256},
        {"meta.intrinsic.VPTESTZ", Intrinsic::x86_avx_ptestz_256},
        {"meta.intrinsic.VFMADDPS", Intrinsic::x86_fma_vfmadd_ps_256},
        {"meta.intrinsic.VPHADDD", Intrinsic::x86_avx2_phadd_d},
        {"meta.intrinsic.PDEP32", Intrinsic::x86_bmi_pdep_32},
        {"meta.intrinsic.RDTSC", Intrinsic::x86_rdtsc},
    };
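
    // Entries in this legacy map are lowered one-to-one by ProcessIntrinsic() below: the call's
    // arguments are forwarded unchanged to the named x86 intrinsic, with no per-width selection
    // and no emulation fallback.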
    Instruction* NO_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
    Instruction*
    VPERM_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
    Instruction*
    VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
    Instruction*
    VROUND_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
    Instruction*
    VHSUB_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
    Instruction*
    VCONVERT_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
    Instruction* DOUBLE_EMU(LowerX86*     pThis,
                            TargetArch    arch,
                            TargetWidth   width,
                            CallInst*     pCallInst,
                            Intrinsic::ID intrin);

    static Intrinsic::ID DOUBLE = (Intrinsic::ID)-1;
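
    // Per-arch lowering table. Each meta intrinsic maps to an X86Intrinsic entry holding the
    // native intrinsic to use at 256-bit and 512-bit widths plus an emulation callback:
    //   - a real Intrinsic::ID     -> call that intrinsic directly
    //   - DOUBLE                   -> double pump the next narrower intrinsic (DOUBLE_EMU)
    //   - Intrinsic::not_intrinsic -> fall back to the entry's emulation function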
    static std::map<std::string, X86Intrinsic> intrinsicMap2[] = {
        // AVX
        {
            {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx_rcp_ps_256, DOUBLE}, NO_EMU}},
            {"meta.intrinsic.VPERMPS",
             {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
            {"meta.intrinsic.VPERMD",
             {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
            {"meta.intrinsic.VGATHERPD",
             {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
            {"meta.intrinsic.VGATHERPS",
             {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
            {"meta.intrinsic.VGATHERDD",
             {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
            {"meta.intrinsic.VCVTPD2PS",
             {{Intrinsic::x86_avx_cvt_pd2_ps_256, Intrinsic::not_intrinsic}, NO_EMU}},
            {"meta.intrinsic.VCVTPH2PS",
             {{Intrinsic::x86_vcvtph2ps_256, Intrinsic::not_intrinsic}, NO_EMU}},
            {"meta.intrinsic.VROUND", {{Intrinsic::x86_avx_round_ps_256, DOUBLE}, NO_EMU}},
            {"meta.intrinsic.VHSUBPS", {{Intrinsic::x86_avx_hsub_ps_256, DOUBLE}, NO_EMU}},
        },
        // AVX2
        {
            {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx_rcp_ps_256, DOUBLE}, NO_EMU}},
            {"meta.intrinsic.VPERMPS",
             {{Intrinsic::x86_avx2_permps, Intrinsic::not_intrinsic}, VPERM_EMU}},
            {"meta.intrinsic.VPERMD",
             {{Intrinsic::x86_avx2_permd, Intrinsic::not_intrinsic}, VPERM_EMU}},
            {"meta.intrinsic.VGATHERPD",
             {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
            {"meta.intrinsic.VGATHERPS",
             {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
            {"meta.intrinsic.VGATHERDD",
             {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
            {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx_cvt_pd2_ps_256, DOUBLE}, NO_EMU}},
            {"meta.intrinsic.VCVTPH2PS",
             {{Intrinsic::x86_vcvtph2ps_256, Intrinsic::not_intrinsic}, NO_EMU}},
            {"meta.intrinsic.VROUND", {{Intrinsic::x86_avx_round_ps_256, DOUBLE}, NO_EMU}},
            {"meta.intrinsic.VHSUBPS", {{Intrinsic::x86_avx_hsub_ps_256, DOUBLE}, NO_EMU}},
        },
        // AVX512
        {
            {"meta.intrinsic.VRCPPS",
             {{Intrinsic::x86_avx512_rcp14_ps_256, Intrinsic::x86_avx512_rcp14_ps_512}, NO_EMU}},
#if LLVM_VERSION_MAJOR < 7
            {"meta.intrinsic.VPERMPS",
             {{Intrinsic::x86_avx512_mask_permvar_sf_256,
               Intrinsic::x86_avx512_mask_permvar_sf_512},
              NO_EMU}},
            {"meta.intrinsic.VPERMD",
             {{Intrinsic::x86_avx512_mask_permvar_si_256,
               Intrinsic::x86_avx512_mask_permvar_si_512},
              NO_EMU}},
#else
            {"meta.intrinsic.VPERMPS",
             {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
            {"meta.intrinsic.VPERMD",
             {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
#endif
            {"meta.intrinsic.VGATHERPD",
             {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
            {"meta.intrinsic.VGATHERPS",
             {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
            {"meta.intrinsic.VGATHERDD",
             {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
#if LLVM_VERSION_MAJOR < 7
            {"meta.intrinsic.VCVTPD2PS",
             {{Intrinsic::x86_avx512_mask_cvtpd2ps_256, Intrinsic::x86_avx512_mask_cvtpd2ps_512},
              NO_EMU}},
#else
            {"meta.intrinsic.VCVTPD2PS",
             {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VCONVERT_EMU}},
#endif
            {"meta.intrinsic.VCVTPH2PS",
             {{Intrinsic::x86_avx512_mask_vcvtph2ps_256, Intrinsic::x86_avx512_mask_vcvtph2ps_512},
              NO_EMU}},
            {"meta.intrinsic.VROUND",
             {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VROUND_EMU}},
            {"meta.intrinsic.VHSUBPS",
             {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VHSUB_EMU}},
        }};
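
    // For example, on AVX2 a 256-bit call to meta.intrinsic.VPERMD resolves directly to
    // Intrinsic::x86_avx2_permd, while the same call on the AVX row has no native entry and is
    // routed through VPERM_EMU below.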
    struct LowerX86 : public FunctionPass
    {
        LowerX86(Builder* b = nullptr) : FunctionPass(ID), B(b)
        {
            initializeLowerX86Pass(*PassRegistry::getPassRegistry());

            // Determine target arch
            if (JM()->mArch.AVX512F())
            {
                mTarget = AVX512;
            }
            else if (JM()->mArch.AVX2())
            {
                mTarget = AVX2;
            }
            else if (JM()->mArch.AVX())
            {
                mTarget = AVX;
            }
            else
            {
                SWR_ASSERT(false, "Unsupported AVX architecture.");
            }
        }
        // Try to decipher the vector type of the instruction. This does not work properly
        // across all intrinsics, and will have to be rethought. Probably need something
        // similar to llvm's getDeclaration() utility to map a set of inputs to a specific typed
        // intrinsic.
        void GetRequestedWidthAndType(CallInst*       pCallInst,
                                      const StringRef intrinName,
                                      TargetWidth*    pWidth,
                                      Type**          pTy)
        {
            Type* pVecTy = pCallInst->getType();

            // Check for intrinsic specific types
            // VCVTPD2PS type comes from src, not dst
            if (intrinName.equals("meta.intrinsic.VCVTPD2PS"))
            {
                pVecTy = pCallInst->getOperand(0)->getType();
            }

            if (!pVecTy->isVectorTy())
            {
                for (auto& op : pCallInst->arg_operands())
                {
                    if (op.get()->getType()->isVectorTy())
                    {
                        pVecTy = op.get()->getType();
                        break;
                    }
                }
            }
            SWR_ASSERT(pVecTy->isVectorTy(), "Couldn't determine vector size");

            uint32_t width = cast<VectorType>(pVecTy)->getBitWidth();
            switch (width)
            {
            case 256:
                *pWidth = W256;
                break;
            case 512:
                *pWidth = W512;
                break;
            default:
                SWR_ASSERT(false, "Unhandled vector width %d", width);
                *pWidth = W256;
            }

            *pTy = pVecTy->getScalarType();
        }
        Value* GetZeroVec(TargetWidth width, Type* pTy)
        {
            uint32_t numElem = 0;
            switch (width)
            {
            case W256:
                numElem = 8;
                break;
            case W512:
                numElem = 16;
                break;
            default:
                SWR_ASSERT(false, "Unhandled vector width type %d\n", width);
            }

            return ConstantVector::getNullValue(VectorType::get(pTy, numElem));
        }
        Value* GetMask(TargetWidth width)
        {
            Value* mask;
            switch (width)
            {
            case W256:
                mask = B->C((uint8_t)-1);
                break;
            case W512:
                mask = B->C((uint16_t)-1);
                break;
            default:
                SWR_ASSERT(false, "Unhandled vector width type %d\n", width);
            }
            return mask;
        }
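
        // The AVX2 gather path and the emulation code need the mask in vector-register form
        // rather than as an <N x i1>. Sign extending each i1 lane yields 0 or -1 (all bits set),
        // which is the per-lane mask encoding the x86 gather intrinsics expect.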
        // Convert <N x i1> mask to <N x i32> x86 mask
        Value* VectorMask(Value* vi1Mask)
        {
            uint32_t numElem = vi1Mask->getType()->getVectorNumElements();
            return B->S_EXT(vi1Mask, VectorType::get(B->mInt32Ty, numElem));
        }
        Instruction* ProcessIntrinsicAdvanced(CallInst* pCallInst)
        {
            Function*   pFunc     = pCallInst->getCalledFunction();
            auto&       intrinsic = intrinsicMap2[mTarget][pFunc->getName()];
            TargetWidth vecWidth;
            Type*       pElemTy;
            GetRequestedWidthAndType(pCallInst, pFunc->getName(), &vecWidth, &pElemTy);

            // Check if there is a native intrinsic for this instruction
            Intrinsic::ID id = intrinsic.intrin[vecWidth];
            if (id == DOUBLE)
            {
                // Double pump the next smaller SIMD intrinsic
                SWR_ASSERT(vecWidth != 0, "Cannot double pump smallest SIMD width.");
                Intrinsic::ID id2 = intrinsic.intrin[vecWidth - 1];
                SWR_ASSERT(id2 != Intrinsic::not_intrinsic,
                           "Cannot find intrinsic to double pump.");
                return DOUBLE_EMU(this, mTarget, vecWidth, pCallInst, id2);
            }
            else if (id != Intrinsic::not_intrinsic)
            {
                Function* pIntrin = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, id);
                SmallVector<Value*, 8> args;
                for (auto& arg : pCallInst->arg_operands())
                {
                    args.push_back(arg.get());
                }

                // If AVX512, all instructions add a src operand and mask. We'll pass in 0 src and
                // full mask for now, assuming the intrinsics are consistent and place the src
                // operand and mask last in the argument list.
                if (mTarget == AVX512)
                {
                    if (pFunc->getName().equals("meta.intrinsic.VCVTPD2PS"))
                    {
                        args.push_back(GetZeroVec(W256, pCallInst->getType()->getScalarType()));
                        args.push_back(GetMask(W256));
                        // for AVX512 VCVTPD2PS, we also have to add rounding mode
                        args.push_back(B->C(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
                    }
                    else
                    {
                        args.push_back(GetZeroVec(vecWidth, pElemTy));
                        args.push_back(GetMask(vecWidth));
                    }
                }

                return B->CALLA(pIntrin, args);
            }
            else
            {
                // No native intrinsic, call emulation function
                return intrinsic.emuFunc(this, mTarget, vecWidth, pCallInst);
            }
        }
        Instruction* ProcessIntrinsic(CallInst* pCallInst)
        {
            Function* pFunc = pCallInst->getCalledFunction();

            // Forward to the advanced support if found
            if (intrinsicMap2[mTarget].find(pFunc->getName()) != intrinsicMap2[mTarget].end())
            {
                return ProcessIntrinsicAdvanced(pCallInst);
            }

            SWR_ASSERT(intrinsicMap.find(pFunc->getName()) != intrinsicMap.end(),
                       "Unimplemented intrinsic %s.",
                       pFunc->getName().data());

            Intrinsic::ID x86Intrinsic = intrinsicMap[pFunc->getName()];
            Function*     pX86IntrinFunc =
                Intrinsic::getDeclaration(B->JM()->mpCurrentModule, x86Intrinsic);

            SmallVector<Value*, 8> args;
            for (auto& arg : pCallInst->arg_operands())
            {
                args.push_back(arg.get());
            }
            return B->CALLA(pX86IntrinFunc, args);
        }
        //////////////////////////////////////////////////////////////////////////
        /// @brief LLVM function pass run method.
        /// @param F - The function we're working on with this pass.
        virtual bool runOnFunction(Function& F)
        {
            std::vector<Instruction*> toRemove;

            for (auto& BB : F.getBasicBlockList())
            {
                for (auto& I : BB.getInstList())
                {
                    if (CallInst* pCallInst = dyn_cast<CallInst>(&I))
                    {
                        Function* pFunc = pCallInst->getCalledFunction();
                        if (pFunc)
                        {
                            if (pFunc->getName().startswith("meta.intrinsic"))
                            {
                                B->IRB()->SetInsertPoint(&I);
                                Instruction* pReplace = ProcessIntrinsic(pCallInst);
                                SWR_ASSERT(pReplace);
                                toRemove.push_back(pCallInst);
                                pCallInst->replaceAllUsesWith(pReplace);
                            }
                        }
                    }
                }
            }

            for (auto* pInst : toRemove)
            {
                pInst->eraseFromParent();
            }

            JitManager::DumpToFile(&F, "lowerx86");

            return true;
        }
        virtual void getAnalysisUsage(AnalysisUsage& AU) const {}

        JitManager* JM() { return B->JM(); }

        Builder* B;

        TargetArch mTarget;

        static char ID; ///< Needed by LLVM to generate ID for FunctionPass.
    };

    char LowerX86::ID = 0; // LLVM uses address of ID as the actual ID.
    FunctionPass* createLowerX86Pass(Builder* b) { return new LowerX86(b); }
    Instruction* NO_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
    {
        SWR_ASSERT(false, "Unimplemented intrinsic emulation.");
        return nullptr;
    }
    Instruction*
    VPERM_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
    {
        // Only need vperm emulation for AVX
        SWR_ASSERT(arch == AVX);

        Builder* B         = pThis->B;
        auto     v32A      = pCallInst->getArgOperand(0);
        auto     vi32Index = pCallInst->getArgOperand(1);

        Value* v32Result;
        if (isa<Constant>(vi32Index))
        {
            // Can use llvm shuffle vector directly with constant shuffle indices
            v32Result = B->VSHUFFLE(v32A, v32A, vi32Index);
        }
        else
        {
            v32Result = UndefValue::get(v32A->getType());
            for (uint32_t l = 0; l < v32A->getType()->getVectorNumElements(); ++l)
            {
                auto i32Index = B->VEXTRACT(vi32Index, B->C(l));
                auto val      = B->VEXTRACT(v32A, i32Index);
                v32Result     = B->VINSERT(v32Result, val, B->C(l));
            }
        }
        return cast<Instruction>(v32Result);
    }
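
    // Gather emulation strategy by target:
    //   AVX               - scalarize: copy the source to the stack and load each lane, selecting
    //                       the stack copy's address for inactive lanes so every load is valid
    //   AVX2 / AVX512.256 - use the 256-bit avx2 gather intrinsics, double pumped for 512-wide
    //                       requests
    //   AVX512            - use the native 512-bit gather intrinsics with a compact i8/i16 mask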
    Instruction*
    VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
    {
        Builder* B           = pThis->B;
        auto     vSrc        = pCallInst->getArgOperand(0);
        auto     pBase       = pCallInst->getArgOperand(1);
        auto     vi32Indices = pCallInst->getArgOperand(2);
        auto     vi1Mask     = pCallInst->getArgOperand(3);
        auto     i8Scale     = pCallInst->getArgOperand(4);

        pBase              = B->POINTER_CAST(pBase, PointerType::get(B->mInt8Ty, 0));
        uint32_t numElem   = vSrc->getType()->getVectorNumElements();
        auto     i32Scale  = B->Z_EXT(i8Scale, B->mInt32Ty);
        auto     srcTy     = vSrc->getType()->getVectorElementType();
        Value*   v32Gather = nullptr;
        if (arch == AVX)
        {
            // Full emulation for AVX
            // Store source on stack to provide a valid address to load from inactive lanes
            auto pStack = B->STACKSAVE();
            auto pTmp   = B->ALLOCA(vSrc->getType());
            B->STORE(vSrc, pTmp);

            v32Gather        = UndefValue::get(vSrc->getType());
            auto vi32Scale   = ConstantVector::getSplat(numElem, cast<ConstantInt>(i32Scale));
            auto vi32Offsets = B->MUL(vi32Indices, vi32Scale);

            for (uint32_t i = 0; i < numElem; ++i)
            {
                auto i32Offset          = B->VEXTRACT(vi32Offsets, B->C(i));
                auto pLoadAddress       = B->GEP(pBase, i32Offset);
                pLoadAddress            = B->BITCAST(pLoadAddress, PointerType::get(srcTy, 0));
                auto pMaskedLoadAddress = B->GEP(pTmp, {0, i});
                auto i1Mask             = B->VEXTRACT(vi1Mask, B->C(i));
                auto pValidAddress      = B->SELECT(i1Mask, pLoadAddress, pMaskedLoadAddress);
                auto val                = B->LOAD(pValidAddress);
                v32Gather               = B->VINSERT(v32Gather, val, B->C(i));
            }

            B->STACKRESTORE(pStack);
        }
        else if (arch == AVX2 || (arch == AVX512 && width == W256))
        {
            Function* pX86IntrinFunc;
            if (srcTy == B->mFP32Ty)
            {
                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                           Intrinsic::x86_avx2_gather_d_ps_256);
            }
            else if (srcTy == B->mInt32Ty)
            {
                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                           Intrinsic::x86_avx2_gather_d_d_256);
            }
            else if (srcTy == B->mDoubleTy)
            {
                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                           Intrinsic::x86_avx2_gather_d_q_256);
            }
            else
            {
                SWR_ASSERT(false, "Unsupported vector element type for gather.");
            }

            if (width == W256)
            {
                auto v32Mask = B->BITCAST(pThis->VectorMask(vi1Mask), vSrc->getType());
                v32Gather = B->CALL(pX86IntrinFunc, {vSrc, pBase, vi32Indices, v32Mask, i8Scale});
            }
            else if (width == W512)
            {
                // Double pump 4-wide for 64bit elements
                if (vSrc->getType()->getVectorElementType() == B->mDoubleTy)
                {
                    auto v64Mask = pThis->VectorMask(vi1Mask);
                    v64Mask      = B->S_EXT(
                        v64Mask,
                        VectorType::get(B->mInt64Ty, v64Mask->getType()->getVectorNumElements()));
                    v64Mask = B->BITCAST(v64Mask, vSrc->getType());

                    Value* src0 = B->VSHUFFLE(vSrc, vSrc, B->C({0, 1, 2, 3}));
                    Value* src1 = B->VSHUFFLE(vSrc, vSrc, B->C({4, 5, 6, 7}));

                    Value* indices0 = B->VSHUFFLE(vi32Indices, vi32Indices, B->C({0, 1, 2, 3}));
                    Value* indices1 = B->VSHUFFLE(vi32Indices, vi32Indices, B->C({4, 5, 6, 7}));

                    Value* mask0 = B->VSHUFFLE(v64Mask, v64Mask, B->C({0, 1, 2, 3}));
                    Value* mask1 = B->VSHUFFLE(v64Mask, v64Mask, B->C({4, 5, 6, 7}));

                    src0 = B->BITCAST(
                        src0,
                        VectorType::get(B->mInt64Ty, src0->getType()->getVectorNumElements()));
                    mask0 = B->BITCAST(
                        mask0,
                        VectorType::get(B->mInt64Ty, mask0->getType()->getVectorNumElements()));
                    Value* gather0 =
                        B->CALL(pX86IntrinFunc, {src0, pBase, indices0, mask0, i8Scale});
                    src1 = B->BITCAST(
                        src1,
                        VectorType::get(B->mInt64Ty, src1->getType()->getVectorNumElements()));
                    mask1 = B->BITCAST(
                        mask1,
                        VectorType::get(B->mInt64Ty, mask1->getType()->getVectorNumElements()));
                    Value* gather1 =
                        B->CALL(pX86IntrinFunc, {src1, pBase, indices1, mask1, i8Scale});

                    v32Gather = B->VSHUFFLE(gather0, gather1, B->C({0, 1, 2, 3, 4, 5, 6, 7}));
                    v32Gather = B->BITCAST(v32Gather, vSrc->getType());
                }
                else
                {
                    // Double pump 8-wide for 32bit elements
                    auto v32Mask = pThis->VectorMask(vi1Mask);
                    v32Mask      = B->BITCAST(v32Mask, vSrc->getType());
                    Value* src0  = B->EXTRACT_16(vSrc, 0);
                    Value* src1  = B->EXTRACT_16(vSrc, 1);

                    Value* indices0 = B->EXTRACT_16(vi32Indices, 0);
                    Value* indices1 = B->EXTRACT_16(vi32Indices, 1);

                    Value* mask0 = B->EXTRACT_16(v32Mask, 0);
                    Value* mask1 = B->EXTRACT_16(v32Mask, 1);

                    Value* gather0 =
                        B->CALL(pX86IntrinFunc, {src0, pBase, indices0, mask0, i8Scale});
                    Value* gather1 =
                        B->CALL(pX86IntrinFunc, {src1, pBase, indices1, mask1, i8Scale});

                    v32Gather = B->JOIN_16(gather0, gather1);
                }
            }
        }
        else if (arch == AVX512)
        {
            Value*    iMask;
            Function* pX86IntrinFunc;
            if (srcTy == B->mFP32Ty)
            {
                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                           Intrinsic::x86_avx512_gather_dps_512);
                iMask          = B->BITCAST(vi1Mask, B->mInt16Ty);
            }
            else if (srcTy == B->mInt32Ty)
            {
                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                           Intrinsic::x86_avx512_gather_dpi_512);
                iMask          = B->BITCAST(vi1Mask, B->mInt16Ty);
            }
            else if (srcTy == B->mDoubleTy)
            {
                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                           Intrinsic::x86_avx512_gather_dpd_512);
                iMask          = B->BITCAST(vi1Mask, B->mInt8Ty);
            }
            else
            {
                SWR_ASSERT(false, "Unsupported vector element type for gather.");
            }

            auto i32Scale = B->Z_EXT(i8Scale, B->mInt32Ty);
            v32Gather     = B->CALL(pX86IntrinFunc, {vSrc, pBase, vi32Indices, iMask, i32Scale});
        }

        return cast<Instruction>(v32Gather);
    }
    // No support for vroundps in avx512 (it is available in kncni), so emulate with avx
    // instructions
    Instruction*
    VROUND_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
    {
        SWR_ASSERT(arch == AVX512);

        auto B       = pThis->B;
        auto vf32Src = pCallInst->getOperand(0);
        auto i8Round = pCallInst->getOperand(1);
        auto pfnFunc =
            Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx_round_ps_256);

        if (width == W256)
        {
            return cast<Instruction>(B->CALL2(pfnFunc, vf32Src, i8Round));
        }
        else if (width == W512)
        {
            auto v8f32SrcLo = B->EXTRACT_16(vf32Src, 0);
            auto v8f32SrcHi = B->EXTRACT_16(vf32Src, 1);

            auto v8f32ResLo = B->CALL2(pfnFunc, v8f32SrcLo, i8Round);
            auto v8f32ResHi = B->CALL2(pfnFunc, v8f32SrcHi, i8Round);

            return cast<Instruction>(B->JOIN_16(v8f32ResLo, v8f32ResHi));
        }
        else
        {
            SWR_ASSERT(false, "Unimplemented vector width.");
        }

        return nullptr;
    }
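
    // Emulation path for VCVTPD2PS on AVX512 when the table entry has no per-width intrinsic
    // (see the LLVM_VERSION_MAJOR check in the table above): 256-bit requests are converted with
    // an fptrunc, 512-bit requests call the 512-bit masked conversion intrinsic directly.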
    Instruction*
    VCONVERT_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
    {
        SWR_ASSERT(arch == AVX512);

        auto B       = pThis->B;
        auto vf32Src = pCallInst->getOperand(0);

        if (width == W256)
        {
            auto vf32SrcRound = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                          Intrinsic::x86_avx_round_ps_256);
            return cast<Instruction>(B->FP_TRUNC(vf32SrcRound, B->mFP32Ty));
        }
        else if (width == W512)
        {
            // 512 can use intrinsic
            auto pfnFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                     Intrinsic::x86_avx512_mask_cvtpd2ps_512);
            return cast<Instruction>(B->CALL(pfnFunc, vf32Src));
        }
        else
        {
            SWR_ASSERT(false, "Unimplemented vector width.");
        }

        return nullptr;
    }
    // No support for hsub in AVX512
    Instruction*
    VHSUB_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
    {
        SWR_ASSERT(arch == AVX512);

        auto B    = pThis->B;
        auto src0 = pCallInst->getOperand(0);
        auto src1 = pCallInst->getOperand(1);

        // 256b hsub can just use avx intrinsic
        if (width == W256)
        {
            auto pX86IntrinFunc =
                Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx_hsub_ps_256);
            return cast<Instruction>(B->CALL2(pX86IntrinFunc, src0, src1));
        }
        else if (width == W512)
        {
            // 512b hsub can be accomplished with shuf/sub combo
            auto minuend    = B->VSHUFFLE(src0, src1, B->C({0, 2, 8, 10, 4, 6, 12, 14}));
            auto subtrahend = B->VSHUFFLE(src0, src1, B->C({1, 3, 9, 11, 5, 7, 13, 15}));
            return cast<Instruction>(B->SUB(minuend, subtrahend));
        }
        else
        {
            SWR_ASSERT(false, "Unimplemented vector width.");
        }

        return nullptr;
    }
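
    // Example of double pumping: a 512-wide meta.intrinsic.VRCPPS maps to
    // {x86_avx_rcp_ps_256, DOUBLE} on AVX/AVX2, so DOUBLE_EMU splits the <16 x float> argument
    // into two <8 x float> halves, runs llvm.x86.avx.rcp.ps.256 on each half, and shuffles the
    // two results back together into a single <16 x float>.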
    // Double pump input using the passed-in 256 wide intrinsic. This blindly extracts the lower
    // and upper 256 from each vector argument and calls the 256 wide intrinsic, then merges the
    // results to 512 wide
    Instruction* DOUBLE_EMU(LowerX86*     pThis,
                            TargetArch    arch,
                            TargetWidth   width,
                            CallInst*     pCallInst,
                            Intrinsic::ID intrin)
    {
        auto B = pThis->B;
        SWR_ASSERT(width == W512);

        Value*    result[2];
        Function* pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, intrin);
        for (uint32_t i = 0; i < 2; ++i)
        {
            SmallVector<Value*, 8> args;
            for (auto& arg : pCallInst->arg_operands())
            {
                auto argType = arg.get()->getType();
                if (argType->isVectorTy())
                {
                    uint32_t vecWidth  = argType->getVectorNumElements();
                    Value*   lanes     = B->CInc<int>(i * vecWidth / 2, vecWidth / 2);
                    Value*   argToPush = B->VSHUFFLE(
                        arg.get(), B->VUNDEF(argType->getVectorElementType(), vecWidth), lanes);
                    args.push_back(argToPush);
                }
                else
                {
                    args.push_back(arg.get());
                }
            }
            result[i] = B->CALLA(pX86IntrinFunc, args);
        }

        uint32_t vecWidth;
        if (result[0]->getType()->isVectorTy())
        {
            assert(result[1]->getType()->isVectorTy());
            vecWidth = result[0]->getType()->getVectorNumElements() +
                       result[1]->getType()->getVectorNumElements();
        }
        else
        {
            vecWidth = 2;
        }

        Value* lanes = B->CInc<int>(0, vecWidth);
        return cast<Instruction>(B->VSHUFFLE(result[0], result[1], lanes));
    }
} // namespace SwrJit

using namespace SwrJit;

INITIALIZE_PASS_BEGIN(LowerX86, "LowerX86", "LowerX86", false, false)
INITIALIZE_PASS_END(LowerX86, "LowerX86", "LowerX86", false, false)