1 /****************************************************************************
2 * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file lower_x86.cpp
24 *
25 * @brief LLVM pass to lower meta intrinsics to x86
26 *
27 * Notes:
28 *
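 *    This pass replaces calls to the generic meta.intrinsic.* placeholders emitted by the
 *    builder with native x86 intrinsics (or emulation sequences) for the architecture detected
 *    at JIT time. Illustrative sketch (exact names follow the tables below): on AVX, a call such as
 *        %r = call <8 x float> @meta.intrinsic.VRCPPS(<8 x float> %a)
 *    is lowered to the native @llvm.x86.avx.rcp.ps.256 intrinsic, while operations with no native
 *    form on the target fall back to the *_EMU helpers defined in this file.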
29 ******************************************************************************/
30
31 #include "jit_pch.hpp"
32 #include "passes.h"
33 #include "JitManager.h"
34
35 #include <unordered_map>
36
37
38 namespace llvm
39 {
40         // Forward declare the pass initializer
41 void initializeLowerX86Pass(PassRegistry&);
42 }
43
44 namespace SwrJit
45 {
46 using namespace llvm;
47
48 enum TargetArch
49 {
50 AVX = 0,
51 AVX2 = 1,
52 AVX512 = 2
53 };
54
55 enum TargetWidth
56 {
57 W256 = 0,
58 W512 = 1,
59 NUM_WIDTHS = 2
60 };
61
62 struct LowerX86;
63
64 typedef std::function<Instruction*(LowerX86*, TargetArch, TargetWidth, CallInst*)> EmuFunc;
65
66 struct X86Intrinsic
67 {
68 Intrinsic::ID intrin[NUM_WIDTHS];
69 EmuFunc emuFunc;
70 };
71
72         // Map of intrinsics that haven't been moved to the new mechanism yet. If used, these keep the previous
73         // behavior of mapping directly to AVX/AVX2 intrinsics.
74 static std::map<std::string, Intrinsic::ID> intrinsicMap = {
75 {"meta.intrinsic.BEXTR_32", Intrinsic::x86_bmi_bextr_32},
76 {"meta.intrinsic.VPSHUFB", Intrinsic::x86_avx2_pshuf_b},
77 {"meta.intrinsic.VCVTPS2PH", Intrinsic::x86_vcvtps2ph_256},
78 {"meta.intrinsic.VPTESTC", Intrinsic::x86_avx_ptestc_256},
79 {"meta.intrinsic.VPTESTZ", Intrinsic::x86_avx_ptestz_256},
80 {"meta.intrinsic.VFMADDPS", Intrinsic::x86_fma_vfmadd_ps_256},
81 {"meta.intrinsic.VPHADDD", Intrinsic::x86_avx2_phadd_d},
82 {"meta.intrinsic.PDEP32", Intrinsic::x86_bmi_pdep_32},
83 {"meta.intrinsic.RDTSC", Intrinsic::x86_rdtsc},
84 };
85
86 // Forward decls
87 Instruction* NO_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
88 Instruction* VPERM_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
89 Instruction* VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
90 Instruction* VROUND_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
91 Instruction* VHSUB_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
92
93 Instruction* DOUBLE_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst, Intrinsic::ID intrin);
94
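    // Sentinel intrinsic ID: when a table entry below holds DOUBLE, the next smaller SIMD width's
    // intrinsic is double pumped to produce the wider result (see DOUBLE_EMU).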
95 static Intrinsic::ID DOUBLE = (Intrinsic::ID)-1;
96
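    // Per-architecture lowering table, indexed first by TargetArch and then by meta intrinsic name.
    // Each entry supplies the native intrinsic for the 256-wide and 512-wide variants plus an
    // emulation callback used when no native intrinsic is available.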
97 static std::map<std::string, X86Intrinsic> intrinsicMap2[] = {
98 // 256 wide 512 wide
99 { // AVX
100 {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx_rcp_ps_256, DOUBLE}, NO_EMU}},
101 {"meta.intrinsic.VPERMPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
102 {"meta.intrinsic.VPERMD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
103 {"meta.intrinsic.VGATHERPD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
104 {"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
105 {"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
106 {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx_cvt_pd2_ps_256, Intrinsic::not_intrinsic}, NO_EMU}},
107 {"meta.intrinsic.VCVTPH2PS", {{Intrinsic::x86_vcvtph2ps_256, Intrinsic::not_intrinsic}, NO_EMU}},
108 {"meta.intrinsic.VROUND", {{Intrinsic::x86_avx_round_ps_256, DOUBLE}, NO_EMU}},
109 {"meta.intrinsic.VHSUBPS", {{Intrinsic::x86_avx_hsub_ps_256, DOUBLE}, NO_EMU}},
110 },
111 { // AVX2
112 {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx_rcp_ps_256, DOUBLE}, NO_EMU}},
113 {"meta.intrinsic.VPERMPS", {{Intrinsic::x86_avx2_permps, Intrinsic::not_intrinsic}, VPERM_EMU}},
114 {"meta.intrinsic.VPERMD", {{Intrinsic::x86_avx2_permd, Intrinsic::not_intrinsic}, VPERM_EMU}},
115 {"meta.intrinsic.VGATHERPD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
116 {"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
117 {"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
118 {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx_cvt_pd2_ps_256, DOUBLE}, NO_EMU}},
119 {"meta.intrinsic.VCVTPH2PS", {{Intrinsic::x86_vcvtph2ps_256, Intrinsic::not_intrinsic}, NO_EMU}},
120 {"meta.intrinsic.VROUND", {{Intrinsic::x86_avx_round_ps_256, DOUBLE}, NO_EMU}},
121 {"meta.intrinsic.VHSUBPS", {{Intrinsic::x86_avx_hsub_ps_256, DOUBLE}, NO_EMU}},
122 },
123 { // AVX512
124 {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx512_rcp14_ps_256, Intrinsic::x86_avx512_rcp14_ps_512}, NO_EMU}},
125 {"meta.intrinsic.VPERMPS", {{Intrinsic::x86_avx512_mask_permvar_sf_256, Intrinsic::x86_avx512_mask_permvar_sf_512}, NO_EMU}},
126 {"meta.intrinsic.VPERMD", {{Intrinsic::x86_avx512_mask_permvar_si_256, Intrinsic::x86_avx512_mask_permvar_si_512}, NO_EMU}},
127 {"meta.intrinsic.VGATHERPD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
128 {"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
129 {"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
130 {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx512_mask_cvtpd2ps_256, Intrinsic::x86_avx512_mask_cvtpd2ps_512 }, NO_EMU}},
131 {"meta.intrinsic.VCVTPH2PS", {{Intrinsic::x86_avx512_mask_vcvtph2ps_256, Intrinsic::x86_avx512_mask_vcvtph2ps_512 }, NO_EMU}},
132 {"meta.intrinsic.VROUND", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VROUND_EMU}},
133 {"meta.intrinsic.VHSUBPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VHSUB_EMU}},
134 }
135 };
136
137 struct LowerX86 : public FunctionPass
138 {
139 LowerX86(JitManager* pJitMgr = nullptr, Builder* b = nullptr)
140 : FunctionPass(ID), mpJitMgr(pJitMgr), B(b)
141 {
142 initializeLowerX86Pass(*PassRegistry::getPassRegistry());
143
144 // Determine target arch
145 if (mpJitMgr->mArch.AVX512F())
146 {
147 mTarget = AVX512;
148 }
149 else if (mpJitMgr->mArch.AVX2())
150 {
151 mTarget = AVX2;
152 }
153 else if (mpJitMgr->mArch.AVX())
154 {
155 mTarget = AVX;
157 }
158 else
159 {
160 SWR_ASSERT(false, "Unsupported AVX architecture.");
161 mTarget = AVX;
162 }
163 }
164
165         // Try to determine the vector type the intrinsic operates on. This does not work properly
166         // across all intrinsics and will have to be rethought. Something similar to LLVM's
167         // getDeclaration() utility, which maps a set of inputs to a specific typed intrinsic,
168         // is probably needed.
169 void GetRequestedWidthAndType(CallInst* pCallInst, const StringRef intrinName, TargetWidth* pWidth, Type** pTy)
170 {
171 Type* pVecTy = pCallInst->getType();
172
173 // Check for intrinsic specific types
174 // VCVTPD2PS type comes from src, not dst
175 if (intrinName.equals("meta.intrinsic.VCVTPD2PS"))
176 {
177 pVecTy = pCallInst->getOperand(0)->getType();
178 }
179
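            // If the call's return type isn't a vector, fall back to the first vector-typed argument.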
180 if (!pVecTy->isVectorTy())
181 {
182 for (auto& op : pCallInst->arg_operands())
183 {
184 if (op.get()->getType()->isVectorTy())
185 {
186 pVecTy = op.get()->getType();
187 break;
188 }
189 }
190 }
191 SWR_ASSERT(pVecTy->isVectorTy(), "Couldn't determine vector size");
192
193 uint32_t width = cast<VectorType>(pVecTy)->getBitWidth();
194 switch (width)
195 {
196 case 256: *pWidth = W256; break;
197 case 512: *pWidth = W512; break;
198 default: SWR_ASSERT(false, "Unhandled vector width %d", width);
199 *pWidth = W256;
200 }
201
202 *pTy = pVecTy->getScalarType();
203 }
204
205 Value* GetZeroVec(TargetWidth width, Type* pTy)
206 {
207 uint32_t numElem = 0;
208 switch (width)
209 {
210 case W256: numElem = 8; break;
211 case W512: numElem = 16; break;
212 default: SWR_ASSERT(false, "Unhandled vector width type %d\n", width);
213 }
214
215 return ConstantVector::getNullValue(VectorType::get(pTy, numElem));
216 }
217
218 Value* GetMask(TargetWidth width)
219 {
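            // All-lanes-enabled execution mask: 8 mask bits for W256, 16 for W512.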
220 Value* mask;
221 switch (width)
222 {
223 case W256: mask = B->C((uint8_t)-1); break;
224 case W512: mask = B->C((uint16_t)-1); break;
225 default: SWR_ASSERT(false, "Unhandled vector width type %d\n", width);
226 }
227 return mask;
228 }
229
230 // Convert <N x i1> mask to <N x i32> x86 mask
231 Value* VectorMask(Value* vi1Mask)
232 {
233 uint32_t numElem = vi1Mask->getType()->getVectorNumElements();
234 return B->S_EXT(vi1Mask, VectorType::get(B->mInt32Ty, numElem));
235 }
236
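        // Lower one meta intrinsic call using the per-arch table: DOUBLE entries are double pumped
        // through the next smaller width, valid intrinsic IDs are called directly (with the extra
        // source/mask operands on AVX512), and anything else goes through the emulation callback.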
237 Instruction* ProcessIntrinsicAdvanced(CallInst* pCallInst)
238 {
239 Function* pFunc = pCallInst->getCalledFunction();
240 auto& intrinsic = intrinsicMap2[mTarget][pFunc->getName()];
241 TargetWidth vecWidth;
242 Type* pElemTy;
243 GetRequestedWidthAndType(pCallInst, pFunc->getName(), &vecWidth, &pElemTy);
244
245 // Check if there is a native intrinsic for this instruction
246 Intrinsic::ID id = intrinsic.intrin[vecWidth];
247 if (id == DOUBLE)
248 {
249 // Double pump the next smaller SIMD intrinsic
250 SWR_ASSERT(vecWidth != 0, "Cannot double pump smallest SIMD width.");
251 Intrinsic::ID id2 = intrinsic.intrin[vecWidth - 1];
252 SWR_ASSERT(id2 != Intrinsic::not_intrinsic, "Cannot find intrinsic to double pump.");
253 return DOUBLE_EMU(this, mTarget, vecWidth, pCallInst, id2);
254 }
255 else if (id != Intrinsic::not_intrinsic)
256 {
257 Function* pIntrin = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, id);
258 SmallVector<Value*, 8> args;
259 for (auto& arg : pCallInst->arg_operands())
260 {
261 args.push_back(arg.get());
262 }
263
264                 // On AVX512, the masked intrinsics take an extra source operand and mask. Pass a zero source and a full
265                 // mask for now, assuming the intrinsics are consistent and place these last in the argument list.
266 if (mTarget == AVX512)
267 {
268 if (pFunc->getName().equals("meta.intrinsic.VCVTPD2PS")) {
269 args.push_back(GetZeroVec(W256, pCallInst->getType()->getScalarType()));
270 args.push_back(GetMask(W256));
271 // for AVX512 VCVTPD2PS, we also have to add rounding mode
272 args.push_back(B->C(_MM_FROUND_TO_NEAREST_INT |
273 _MM_FROUND_NO_EXC));
274 } else {
275 args.push_back(GetZeroVec(vecWidth, pElemTy));
276 args.push_back(GetMask(vecWidth));
277 }
278 }
279
280 return B->CALLA(pIntrin, args);
281 }
282 else
283 {
284 // No native intrinsic, call emulation function
285 return intrinsic.emuFunc(this, mTarget, vecWidth, pCallInst);
286 }
287
288 SWR_ASSERT(false);
289 return nullptr;
290 }
291
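        // Lower a single meta intrinsic call, preferring the per-arch table and falling back to the
        // legacy direct AVX/AVX2 mapping.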
292 Instruction* ProcessIntrinsic(CallInst* pCallInst)
293 {
294 Function* pFunc = pCallInst->getCalledFunction();
295
296 // Forward to the advanced support if found
297 if (intrinsicMap2[mTarget].find(pFunc->getName()) != intrinsicMap2[mTarget].end())
298 {
299 return ProcessIntrinsicAdvanced(pCallInst);
300 }
301
302 SWR_ASSERT(intrinsicMap.find(pFunc->getName()) != intrinsicMap.end(), "Unimplemented intrinsic %s.", pFunc->getName());
303
304 Intrinsic::ID x86Intrinsic = intrinsicMap[pFunc->getName()];
305 Function* pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, x86Intrinsic);
306
307 SmallVector<Value*, 8> args;
308 for (auto& arg : pCallInst->arg_operands())
309 {
310 args.push_back(arg.get());
311 }
312 return B->CALLA(pX86IntrinFunc, args);
313 }
314
315 //////////////////////////////////////////////////////////////////////////
316         /// @brief LLVM function pass run method.
317         /// @param F - The function this pass operates on.
318 virtual bool runOnFunction(Function& F)
319 {
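            // Collect the meta.intrinsic calls first and erase them after the walk so the
            // instruction iterators stay valid while replacements are generated in place.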
320 std::vector<Instruction*> toRemove;
321
322 for (auto& BB : F.getBasicBlockList())
323 {
324 for (auto& I : BB.getInstList())
325 {
326 if (CallInst* pCallInst = dyn_cast<CallInst>(&I))
327 {
328 Function* pFunc = pCallInst->getCalledFunction();
329 if (pFunc)
330 {
331 if (pFunc->getName().startswith("meta.intrinsic"))
332 {
333 B->IRB()->SetInsertPoint(&I);
334 Instruction* pReplace = ProcessIntrinsic(pCallInst);
335 SWR_ASSERT(pReplace);
336 toRemove.push_back(pCallInst);
337 pCallInst->replaceAllUsesWith(pReplace);
338 }
339 }
340
341 }
342 }
343 }
344
345 for (auto* pInst : toRemove)
346 {
347 pInst->eraseFromParent();
348 }
349
350 JitManager::DumpToFile(&F, "lowerx86");
351
352 return true;
353 }
354
355 virtual void getAnalysisUsage(AnalysisUsage& AU) const
356 {
357 }
358
359 JitManager* JM() { return mpJitMgr; }
360
361 JitManager* mpJitMgr;
362 Builder* B;
363
364 TargetArch mTarget;
365
366 static char ID; ///< Needed by LLVM to generate ID for FunctionPass.
367 };
368
369 char LowerX86::ID = 0; // LLVM uses address of ID as the actual ID.
370
371 FunctionPass* createLowerX86Pass(JitManager* pJitMgr, Builder* b)
372 {
373 return new LowerX86(pJitMgr, b);
374 }
375
376 Instruction* NO_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
377 {
378 SWR_ASSERT(false, "Unimplemented intrinsic emulation.");
379 return nullptr;
380 }
381
382 Instruction* VPERM_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
383 {
384 // Only need vperm emulation for AVX
385 SWR_ASSERT(arch == AVX);
386
387 Builder* B = pThis->B;
388 auto v32A = pCallInst->getArgOperand(0);
389 auto vi32Index = pCallInst->getArgOperand(1);
390
391 Value* v32Result;
392 if (isa<Constant>(vi32Index))
393 {
394 // Can use llvm shuffle vector directly with constant shuffle indices
395 v32Result = B->VSHUFFLE(v32A, v32A, vi32Index);
396 }
397 else
398 {
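                // Variable indices: emulate the dynamic permute with a per-lane extract/insert loop.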
399 v32Result = UndefValue::get(v32A->getType());
400 for (uint32_t l = 0; l < v32A->getType()->getVectorNumElements(); ++l)
401 {
402 auto i32Index = B->VEXTRACT(vi32Index, B->C(l));
403 auto val = B->VEXTRACT(v32A, i32Index);
404 v32Result = B->VINSERT(v32Result, val, B->C(l));
405 }
406 }
407 return cast<Instruction>(v32Result);
408 }
409
410 Instruction* VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
411 {
412 Builder* B = pThis->B;
413 auto vSrc = pCallInst->getArgOperand(0);
414 auto pBase = pCallInst->getArgOperand(1);
415 auto vi32Indices = pCallInst->getArgOperand(2);
416 auto vi1Mask = pCallInst->getArgOperand(3);
417 auto i8Scale = pCallInst->getArgOperand(4);
418
419 pBase = B->POINTER_CAST(pBase, PointerType::get(B->mInt8Ty, 0));
420 uint32_t numElem = vSrc->getType()->getVectorNumElements();
421 auto i32Scale = B->Z_EXT(i8Scale, B->mInt32Ty);
422 auto srcTy = vSrc->getType()->getVectorElementType();
423 Value* v32Gather;
424 if (arch == AVX)
425 {
426             // Full emulation for AVX:
427             // store the source on the stack so inactive lanes have a valid address to load from.
428 auto pStack = B->STACKSAVE();
429 auto pTmp = B->ALLOCA(vSrc->getType());
430 B->STORE(vSrc, pTmp);
431
432 v32Gather = UndefValue::get(vSrc->getType());
433 auto vi32Scale = ConstantVector::getSplat(numElem, cast<ConstantInt>(i32Scale));
434 auto vi32Offsets = B->MUL(vi32Indices, vi32Scale);
435
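            // For each lane, load from the computed address when the mask bit is set; otherwise
            // reload the lane's original value from the stack copy.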
436 for (uint32_t i = 0; i < numElem; ++i)
437 {
438 auto i32Offset = B->VEXTRACT(vi32Offsets, B->C(i));
439 auto pLoadAddress = B->GEP(pBase, i32Offset);
440 pLoadAddress = B->BITCAST(pLoadAddress, PointerType::get(srcTy, 0));
441 auto pMaskedLoadAddress = B->GEP(pTmp, { 0, i });
442 auto i1Mask = B->VEXTRACT(vi1Mask, B->C(i));
443 auto pValidAddress = B->SELECT(i1Mask, pLoadAddress, pMaskedLoadAddress);
444 auto val = B->LOAD(pValidAddress);
445 v32Gather = B->VINSERT(v32Gather, val, B->C(i));
446 }
447
448 B->STACKRESTORE(pStack);
449 }
450 else if (arch == AVX2 || (arch == AVX512 && width == W256))
451 {
452 Function* pX86IntrinFunc;
453 if (srcTy == B->mFP32Ty)
454 {
455 pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx2_gather_d_ps_256);
456 }
457 else if (srcTy == B->mInt32Ty)
458 {
459 pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx2_gather_d_d_256);
460 }
461 else if (srcTy == B->mDoubleTy)
462 {
463 pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx2_gather_d_q_256);
464 }
465 else
466 {
467 SWR_ASSERT(false, "Unsupported vector element type for gather.");
468 }
469
470 if (width == W256)
471 {
472 auto v32Mask = B->BITCAST(pThis->VectorMask(vi1Mask), vSrc->getType());
473 v32Gather = B->CALL(pX86IntrinFunc, { vSrc, pBase, vi32Indices, v32Mask, i8Scale });
474 }
475 else if (width == W512)
476 {
477 // Double pump 4-wide for 64bit elements
478 if (vSrc->getType()->getVectorElementType() == B->mDoubleTy)
479 {
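                    // The 4-wide 64-bit gather takes i64 source and mask vectors: widen the mask to
                    // 64 bits, then split source, indices, and mask into two 4-wide halves.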
480 auto v64Mask = pThis->VectorMask(vi1Mask);
481 v64Mask = B->S_EXT(v64Mask,
482 VectorType::get(B->mInt64Ty, v64Mask->getType()->getVectorNumElements()));
483 v64Mask = B->BITCAST(v64Mask, vSrc->getType());
484
485 Value* src0 = B->VSHUFFLE(vSrc, vSrc, B->C({ 0, 1, 2, 3 }));
486 Value* src1 = B->VSHUFFLE(vSrc, vSrc, B->C({ 4, 5, 6, 7 }));
487
488 Value* indices0 = B->VSHUFFLE(vi32Indices, vi32Indices, B->C({ 0, 1, 2, 3 }));
489 Value* indices1 = B->VSHUFFLE(vi32Indices, vi32Indices, B->C({ 4, 5, 6, 7 }));
490
491 Value* mask0 = B->VSHUFFLE(v64Mask, v64Mask, B->C({ 0, 1, 2, 3 }));
492 Value* mask1 = B->VSHUFFLE(v64Mask, v64Mask, B->C({ 4, 5, 6, 7 }));
493
494 src0 = B->BITCAST(src0, VectorType::get(B->mInt64Ty, src0->getType()->getVectorNumElements()));
495 mask0 = B->BITCAST(mask0, VectorType::get(B->mInt64Ty, mask0->getType()->getVectorNumElements()));
496 Value* gather0 = B->CALL(pX86IntrinFunc, { src0, pBase, indices0, mask0, i8Scale });
497 src1 = B->BITCAST(src1, VectorType::get(B->mInt64Ty, src1->getType()->getVectorNumElements()));
498 mask1 = B->BITCAST(mask1, VectorType::get(B->mInt64Ty, mask1->getType()->getVectorNumElements()));
499 Value* gather1 = B->CALL(pX86IntrinFunc, { src1, pBase, indices1, mask1, i8Scale });
500
501 v32Gather = B->VSHUFFLE(gather0, gather1, B->C({ 0, 1, 2, 3, 4, 5, 6, 7 }));
502 v32Gather = B->BITCAST(v32Gather, vSrc->getType());
503 }
504 else
505 {
506 // Double pump 8-wide for 32bit elements
507 auto v32Mask = pThis->VectorMask(vi1Mask);
508 v32Mask = B->BITCAST(v32Mask, vSrc->getType());
509 Value* src0 = B->EXTRACT_16(vSrc, 0);
510 Value* src1 = B->EXTRACT_16(vSrc, 1);
511
512 Value* indices0 = B->EXTRACT_16(vi32Indices, 0);
513 Value* indices1 = B->EXTRACT_16(vi32Indices, 1);
514
515 Value* mask0 = B->EXTRACT_16(v32Mask, 0);
516 Value* mask1 = B->EXTRACT_16(v32Mask, 1);
517
518 Value* gather0 = B->CALL(pX86IntrinFunc, { src0, pBase, indices0, mask0, i8Scale });
519 Value* gather1 = B->CALL(pX86IntrinFunc, { src1, pBase, indices1, mask1, i8Scale });
520
521 v32Gather = B->JOIN_16(gather0, gather1);
522 }
523 }
524 }
525 else if (arch == AVX512)
526 {
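            // Native 512-wide gathers take an integer bitmask: i16 for 32-bit elements, i8 for 64-bit elements.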
527 Value* iMask;
528 Function* pX86IntrinFunc;
529 if (srcTy == B->mFP32Ty)
530 {
531 pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx512_gather_dps_512);
532 iMask = B->BITCAST(vi1Mask, B->mInt16Ty);
533 }
534 else if (srcTy == B->mInt32Ty)
535 {
536 pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx512_gather_dpi_512);
537 iMask = B->BITCAST(vi1Mask, B->mInt16Ty);
538 }
539 else if (srcTy == B->mDoubleTy)
540 {
541 pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx512_gather_dpd_512);
542 iMask = B->BITCAST(vi1Mask, B->mInt8Ty);
543 }
544 else
545 {
546 SWR_ASSERT(false, "Unsupported vector element type for gather.");
547 }
548
549 auto i32Scale = B->Z_EXT(i8Scale, B->mInt32Ty);
550 v32Gather = B->CALL(pX86IntrinFunc, { vSrc, pBase, vi32Indices, iMask, i32Scale });
551 }
552
553 return cast<Instruction>(v32Gather);
554 }
555
556     // AVX512 has no vroundps (it is only available in kncni), so emulate it with AVX instructions
557 Instruction* VROUND_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
558 {
559 SWR_ASSERT(arch == AVX512);
560
561 auto B = pThis->B;
562 auto vf32Src = pCallInst->getOperand(0);
563 auto i8Round = pCallInst->getOperand(1);
564 auto pfnFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx_round_ps_256);
565
566 if (width == W256)
567 {
568 return cast<Instruction>(B->CALL2(pfnFunc, vf32Src, i8Round));
569 }
570 else if (width == W512)
571 {
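            // Split the 16-wide source into two 8-wide halves, round each with the AVX intrinsic, and rejoin.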
572 auto v8f32SrcLo = B->EXTRACT_16(vf32Src, 0);
573 auto v8f32SrcHi = B->EXTRACT_16(vf32Src, 1);
574
575 auto v8f32ResLo = B->CALL2(pfnFunc, v8f32SrcLo, i8Round);
576 auto v8f32ResHi = B->CALL2(pfnFunc, v8f32SrcHi, i8Round);
577
578 return cast<Instruction>(B->JOIN_16(v8f32ResLo, v8f32ResHi));
579 }
580 else
581 {
582 SWR_ASSERT(false, "Unimplemented vector width.");
583 }
584
585 return nullptr;
586 }
587
588 // No support for hsub in AVX512
589 Instruction* VHSUB_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
590 {
591 SWR_ASSERT(arch == AVX512);
592
593 auto B = pThis->B;
594 auto src0 = pCallInst->getOperand(0);
595 auto src1 = pCallInst->getOperand(1);
596
597 // 256b hsub can just use avx intrinsic
598 if (width == W256)
599 {
600 auto pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx_hsub_ps_256);
601 return cast<Instruction>(B->CALL2(pX86IntrinFunc, src0, src1));
602 }
603 else if (width == W512)
604 {
605 // 512b hsub can be accomplished with shuf/sub combo
606 auto minuend = B->VSHUFFLE(src0, src1, B->C({ 0, 2, 8, 10, 4, 6, 12, 14 }));
607 auto subtrahend = B->VSHUFFLE(src0, src1, B->C({ 1, 3, 9, 11, 5, 7, 13, 15 }));
608 return cast<Instruction>(B->SUB(minuend, subtrahend));
609 }
610 else
611 {
612 SWR_ASSERT(false, "Unimplemented vector width.");
613 return nullptr;
614 }
615 }
616
617     // Double pump the inputs through the supplied 256-wide intrinsic. This blindly extracts the lower and upper 256 bits
618     // of each vector argument, calls the 256-wide intrinsic on each half, then merges the results back to 512 wide.
619 Instruction* DOUBLE_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst, Intrinsic::ID intrin)
620 {
621 auto B = pThis->B;
622 SWR_ASSERT(width == W512);
623 Value* result[2];
624 Function* pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, intrin);
625 for (uint32_t i = 0; i < 2; ++i)
626 {
627 SmallVector<Value*, 8> args;
628 for (auto& arg : pCallInst->arg_operands())
629 {
630 auto argType = arg.get()->getType();
631 if (argType->isVectorTy())
632 {
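                    // Select the lower (i == 0) or upper (i == 1) half of this vector argument.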
633 uint32_t vecWidth = argType->getVectorNumElements();
634 Value *lanes = B->CInc<int>(i*vecWidth/2, vecWidth/2);
635 Value *argToPush = B->VSHUFFLE(arg.get(), B->VUNDEF(argType->getVectorElementType(), vecWidth), lanes);
636 args.push_back(argToPush);
637 }
638 else
639 {
640 args.push_back(arg.get());
641 }
642 }
643 result[i] = B->CALLA(pX86IntrinFunc, args);
644 }
645 uint32_t vecWidth;
646 if (result[0]->getType()->isVectorTy())
647 {
648 assert(result[1]->getType()->isVectorTy());
649 vecWidth = result[0]->getType()->getVectorNumElements() +
650 result[1]->getType()->getVectorNumElements();
651 }
652 else
653 {
654 vecWidth = 2;
655 }
656 Value *lanes = B->CInc<int>(0, vecWidth);
657 return cast<Instruction>(B->VSHUFFLE(result[0], result[1], lanes));
658 }
659
660 }
661
662 using namespace SwrJit;
663
664 INITIALIZE_PASS_BEGIN(LowerX86, "LowerX86", "LowerX86", false, false)
665 INITIALIZE_PASS_END(LowerX86, "LowerX86", "LowerX86", false, false)
666