swr/rast: Handling removed LLVM intrinsics in trunk
[mesa.git] / src / gallium / drivers / swr / rasterizer / jitter / functionpasses / lower_x86.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file lower_x86.cpp
24 *
 25  * @brief LLVM pass to lower meta code to x86
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30
31 #include "jit_pch.hpp"
32 #include "passes.h"
33 #include "JitManager.h"
34
35 #include <unordered_map>
36
37 namespace llvm
38 {
 39     // forward declare the initializer
40 void initializeLowerX86Pass(PassRegistry &);
41 } // namespace llvm
42
43 namespace SwrJit
44 {
45 using namespace llvm;
46
47 enum TargetArch
48 {
49 AVX = 0,
50 AVX2 = 1,
51 AVX512 = 2
52 };
53
54 enum TargetWidth
55 {
56 W256 = 0,
57 W512 = 1,
58 NUM_WIDTHS = 2
59 };
60
61 struct LowerX86;
62
63 typedef std::function<Instruction *(LowerX86 *, TargetArch, TargetWidth, CallInst *)> EmuFunc;
64
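    // Describes how a single meta intrinsic is lowered: the native intrinsic to use at each
    // target width, plus an emulation callback used when no native form is available.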
65 struct X86Intrinsic
66 {
67 Intrinsic::ID intrin[NUM_WIDTHS];
68 EmuFunc emuFunc;
69 };
70
71 // Map of intrinsics that haven't been moved to the new mechanism yet. If used, these get the
72 // previous behavior of mapping directly to avx/avx2 intrinsics.
73 static std::map<std::string, Intrinsic::ID> intrinsicMap = {
74 {"meta.intrinsic.BEXTR_32", Intrinsic::x86_bmi_bextr_32},
75 {"meta.intrinsic.VPSHUFB", Intrinsic::x86_avx2_pshuf_b},
76 {"meta.intrinsic.VCVTPS2PH", Intrinsic::x86_vcvtps2ph_256},
77 {"meta.intrinsic.VPTESTC", Intrinsic::x86_avx_ptestc_256},
78 {"meta.intrinsic.VPTESTZ", Intrinsic::x86_avx_ptestz_256},
79 {"meta.intrinsic.VFMADDPS", Intrinsic::x86_fma_vfmadd_ps_256},
80 {"meta.intrinsic.VPHADDD", Intrinsic::x86_avx2_phadd_d},
81 {"meta.intrinsic.PDEP32", Intrinsic::x86_bmi_pdep_32},
82 {"meta.intrinsic.RDTSC", Intrinsic::x86_rdtsc},
83 };
84
85 // Forward decls
86 Instruction *NO_EMU(LowerX86 *pThis, TargetArch arch, TargetWidth width, CallInst *pCallInst);
87 Instruction *
88 VPERM_EMU(LowerX86 *pThis, TargetArch arch, TargetWidth width, CallInst *pCallInst);
89 Instruction *
90 VGATHER_EMU(LowerX86 *pThis, TargetArch arch, TargetWidth width, CallInst *pCallInst);
91 Instruction *
92 VROUND_EMU(LowerX86 *pThis, TargetArch arch, TargetWidth width, CallInst *pCallInst);
93 Instruction *
94 VHSUB_EMU(LowerX86 *pThis, TargetArch arch, TargetWidth width, CallInst *pCallInst);
95 Instruction *
96 VCONVERT_EMU(LowerX86 *pThis, TargetArch arch, TargetWidth width, CallInst *pCallInst);
97
98 Instruction *DOUBLE_EMU(LowerX86 * pThis,
99 TargetArch arch,
100 TargetWidth width,
101 CallInst * pCallInst,
102 Intrinsic::ID intrin);
103
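    // Sentinel intrinsic ID: a table entry of DOUBLE means "no native intrinsic at this width;
    // double pump the next smaller width's intrinsic instead" (handled via DOUBLE_EMU).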
104 static Intrinsic::ID DOUBLE = (Intrinsic::ID)-1;
105
106 static std::map<std::string, X86Intrinsic> intrinsicMap2[] = {
107 // 256 wide 512 wide
108 {
109 // AVX
110 {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx_rcp_ps_256, DOUBLE}, NO_EMU}},
111 {"meta.intrinsic.VPERMPS",
112 {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
113 {"meta.intrinsic.VPERMD",
114 {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
115 {"meta.intrinsic.VGATHERPD",
116 {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
117 {"meta.intrinsic.VGATHERPS",
118 {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
119 {"meta.intrinsic.VGATHERDD",
120 {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
121 {"meta.intrinsic.VCVTPD2PS",
122 {{Intrinsic::x86_avx_cvt_pd2_ps_256, Intrinsic::not_intrinsic}, NO_EMU}},
123 {"meta.intrinsic.VCVTPH2PS",
124 {{Intrinsic::x86_vcvtph2ps_256, Intrinsic::not_intrinsic}, NO_EMU}},
125 {"meta.intrinsic.VROUND", {{Intrinsic::x86_avx_round_ps_256, DOUBLE}, NO_EMU}},
126 {"meta.intrinsic.VHSUBPS", {{Intrinsic::x86_avx_hsub_ps_256, DOUBLE}, NO_EMU}},
127 },
128 {
129 // AVX2
130 {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx_rcp_ps_256, DOUBLE}, NO_EMU}},
131 {"meta.intrinsic.VPERMPS",
132 {{Intrinsic::x86_avx2_permps, Intrinsic::not_intrinsic}, VPERM_EMU}},
133 {"meta.intrinsic.VPERMD",
134 {{Intrinsic::x86_avx2_permd, Intrinsic::not_intrinsic}, VPERM_EMU}},
135 {"meta.intrinsic.VGATHERPD",
136 {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
137 {"meta.intrinsic.VGATHERPS",
138 {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
139 {"meta.intrinsic.VGATHERDD",
140 {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
141 {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx_cvt_pd2_ps_256, DOUBLE}, NO_EMU}},
142 {"meta.intrinsic.VCVTPH2PS",
143 {{Intrinsic::x86_vcvtph2ps_256, Intrinsic::not_intrinsic}, NO_EMU}},
144 {"meta.intrinsic.VROUND", {{Intrinsic::x86_avx_round_ps_256, DOUBLE}, NO_EMU}},
145 {"meta.intrinsic.VHSUBPS", {{Intrinsic::x86_avx_hsub_ps_256, DOUBLE}, NO_EMU}},
146 },
147 {
148 // AVX512
149 {"meta.intrinsic.VRCPPS",
150 {{Intrinsic::x86_avx512_rcp14_ps_256, Intrinsic::x86_avx512_rcp14_ps_512}, NO_EMU}},
151 #if LLVM_VERSION_MAJOR < 7
152 {"meta.intrinsic.VPERMPS",
153 {{Intrinsic::x86_avx512_mask_permvar_sf_256,
154 Intrinsic::x86_avx512_mask_permvar_sf_512},
155 NO_EMU}},
156 {"meta.intrinsic.VPERMD",
157 {{Intrinsic::x86_avx512_mask_permvar_si_256,
158 Intrinsic::x86_avx512_mask_permvar_si_512},
159 NO_EMU}},
160 #else
161 {"meta.intrinsic.VPERMPS",
162 {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
163 {"meta.intrinsic.VPERMD",
164 {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
165 #endif
166 {"meta.intrinsic.VGATHERPD",
167 {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
168 {"meta.intrinsic.VGATHERPS",
169 {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
170 {"meta.intrinsic.VGATHERDD",
171 {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
172 #if LLVM_VERSION_MAJOR < 7
173 {"meta.intrinsic.VCVTPD2PS",
174 {{Intrinsic::x86_avx512_mask_cvtpd2ps_256, Intrinsic::x86_avx512_mask_cvtpd2ps_512},
175 NO_EMU}},
176 #else
177 {"meta.intrinsic.VCVTPD2PS",
178 {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VCONVERT_EMU}},
179 #endif
180 {"meta.intrinsic.VCVTPH2PS",
181 {{Intrinsic::x86_avx512_mask_vcvtph2ps_256, Intrinsic::x86_avx512_mask_vcvtph2ps_512},
182 NO_EMU}},
183 {"meta.intrinsic.VROUND",
184 {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VROUND_EMU}},
185 {"meta.intrinsic.VHSUBPS",
186 {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VHSUB_EMU}},
187 }};
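    // Example of how the table resolves: on AVX2 a 512-wide "meta.intrinsic.VRCPPS" hits the
    // DOUBLE entry, so it is lowered by double pumping x86_avx_rcp_ps_256 (see DOUBLE_EMU);
    // on AVX512 the same call maps directly to x86_avx512_rcp14_ps_512.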
188
189 struct LowerX86 : public FunctionPass
190 {
191 LowerX86(Builder *b = nullptr) : FunctionPass(ID), B(b)
192 {
193 initializeLowerX86Pass(*PassRegistry::getPassRegistry());
194
195 // Determine target arch
196 if (JM()->mArch.AVX512F())
197 {
198 mTarget = AVX512;
199 }
200 else if (JM()->mArch.AVX2())
201 {
202 mTarget = AVX2;
203 }
204 else if (JM()->mArch.AVX())
205 {
206 mTarget = AVX;
207 }
208 else
209 {
210 SWR_ASSERT(false, "Unsupported AVX architecture.");
211 mTarget = AVX;
212 }
213 }
214
215 // Try to decipher the vector type of the instruction. This does not work properly
216 // across all intrinsics, and will have to be rethought. Probably need something
217         // similar to LLVM's getDeclaration() utility to map a set of inputs to a specific typed
218 // intrinsic.
219 void GetRequestedWidthAndType(CallInst * pCallInst,
220 const StringRef intrinName,
221 TargetWidth * pWidth,
222 Type ** pTy)
223 {
224 Type *pVecTy = pCallInst->getType();
225
226 // Check for intrinsic specific types
227 // VCVTPD2PS type comes from src, not dst
228 if (intrinName.equals("meta.intrinsic.VCVTPD2PS"))
229 {
230 pVecTy = pCallInst->getOperand(0)->getType();
231 }
232
233 if (!pVecTy->isVectorTy())
234 {
235 for (auto &op : pCallInst->arg_operands())
236 {
237 if (op.get()->getType()->isVectorTy())
238 {
239 pVecTy = op.get()->getType();
240 break;
241 }
242 }
243 }
244 SWR_ASSERT(pVecTy->isVectorTy(), "Couldn't determine vector size");
245
246 uint32_t width = cast<VectorType>(pVecTy)->getBitWidth();
247 switch (width)
248 {
249 case 256:
250 *pWidth = W256;
251 break;
252 case 512:
253 *pWidth = W512;
254 break;
255 default:
256 SWR_ASSERT(false, "Unhandled vector width %d", width);
257 *pWidth = W256;
258 }
259
260 *pTy = pVecTy->getScalarType();
261 }
262
263 Value *GetZeroVec(TargetWidth width, Type *pTy)
264 {
265 uint32_t numElem = 0;
266 switch (width)
267 {
268 case W256:
269 numElem = 8;
270 break;
271 case W512:
272 numElem = 16;
273 break;
274 default:
275 SWR_ASSERT(false, "Unhandled vector width type %d\n", width);
276 }
277
278 return ConstantVector::getNullValue(VectorType::get(pTy, numElem));
279 }
280
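        // Builds the "all lanes enabled" execution mask in the integer form the AVX512 masked
        // intrinsics expect: i8 covers the 8 lanes of a 256-bit op, i16 the 16 lanes of a 512-bit op.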
281 Value *GetMask(TargetWidth width)
282 {
283 Value *mask;
284 switch (width)
285 {
286 case W256:
287 mask = B->C((uint8_t)-1);
288 break;
289 case W512:
290 mask = B->C((uint16_t)-1);
291 break;
292 default:
293 SWR_ASSERT(false, "Unhandled vector width type %d\n", width);
294 }
295 return mask;
296 }
297
298 // Convert <N x i1> mask to <N x i32> x86 mask
299 Value *VectorMask(Value *vi1Mask)
300 {
301 uint32_t numElem = vi1Mask->getType()->getVectorNumElements();
302 return B->S_EXT(vi1Mask, VectorType::get(B->mInt32Ty, numElem));
303 }
304
305 Instruction *ProcessIntrinsicAdvanced(CallInst *pCallInst)
306 {
307 Function * pFunc = pCallInst->getCalledFunction();
308 auto & intrinsic = intrinsicMap2[mTarget][pFunc->getName()];
309 TargetWidth vecWidth;
310 Type * pElemTy;
311 GetRequestedWidthAndType(pCallInst, pFunc->getName(), &vecWidth, &pElemTy);
312
313 // Check if there is a native intrinsic for this instruction
314 Intrinsic::ID id = intrinsic.intrin[vecWidth];
315 if (id == DOUBLE)
316 {
317 // Double pump the next smaller SIMD intrinsic
318 SWR_ASSERT(vecWidth != 0, "Cannot double pump smallest SIMD width.");
319 Intrinsic::ID id2 = intrinsic.intrin[vecWidth - 1];
320 SWR_ASSERT(id2 != Intrinsic::not_intrinsic,
321 "Cannot find intrinsic to double pump.");
322 return DOUBLE_EMU(this, mTarget, vecWidth, pCallInst, id2);
323 }
324 else if (id != Intrinsic::not_intrinsic)
325 {
326 Function *pIntrin = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, id);
327 SmallVector<Value *, 8> args;
328 for (auto &arg : pCallInst->arg_operands())
329 {
330 args.push_back(arg.get());
331 }
332
333 // If AVX512, all instructions add a src operand and mask. We'll pass in 0 src and
334                 // full mask for now, assuming the intrinsics are consistent and place the src
335 // operand and mask last in the argument list.
336 if (mTarget == AVX512)
337 {
338 if (pFunc->getName().equals("meta.intrinsic.VCVTPD2PS"))
339 {
340 args.push_back(GetZeroVec(W256, pCallInst->getType()->getScalarType()));
341 args.push_back(GetMask(W256));
342 // for AVX512 VCVTPD2PS, we also have to add rounding mode
343 args.push_back(B->C(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
344 }
345 else
346 {
347 args.push_back(GetZeroVec(vecWidth, pElemTy));
348 args.push_back(GetMask(vecWidth));
349 }
350 }
351
352 return B->CALLA(pIntrin, args);
353 }
354 else
355 {
356 // No native intrinsic, call emulation function
357 return intrinsic.emuFunc(this, mTarget, vecWidth, pCallInst);
358 }
359
360 SWR_ASSERT(false);
361 return nullptr;
362 }
363
364 Instruction *ProcessIntrinsic(CallInst *pCallInst)
365 {
366 Function *pFunc = pCallInst->getCalledFunction();
367
368 // Forward to the advanced support if found
369 if (intrinsicMap2[mTarget].find(pFunc->getName()) != intrinsicMap2[mTarget].end())
370 {
371 return ProcessIntrinsicAdvanced(pCallInst);
372 }
373
374 SWR_ASSERT(intrinsicMap.find(pFunc->getName()) != intrinsicMap.end(),
375 "Unimplemented intrinsic %s.",
376 pFunc->getName());
377
378 Intrinsic::ID x86Intrinsic = intrinsicMap[pFunc->getName()];
379 Function * pX86IntrinFunc =
380 Intrinsic::getDeclaration(B->JM()->mpCurrentModule, x86Intrinsic);
381
382 SmallVector<Value *, 8> args;
383 for (auto &arg : pCallInst->arg_operands())
384 {
385 args.push_back(arg.get());
386 }
387 return B->CALLA(pX86IntrinFunc, args);
388 }
389
390 //////////////////////////////////////////////////////////////////////////
391         /// @brief LLVM function pass run method.
392         /// @param F - The function we're working on with this pass.
393 virtual bool runOnFunction(Function &F)
394 {
395 std::vector<Instruction *> toRemove;
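            // Replacements happen while walking the IR, but the dead meta calls are only erased
            // after the walk so the basic block instruction iterators stay valid.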
396
397 for (auto &BB : F.getBasicBlockList())
398 {
399 for (auto &I : BB.getInstList())
400 {
401 if (CallInst *pCallInst = dyn_cast<CallInst>(&I))
402 {
403 Function *pFunc = pCallInst->getCalledFunction();
404 if (pFunc)
405 {
406 if (pFunc->getName().startswith("meta.intrinsic"))
407 {
408 B->IRB()->SetInsertPoint(&I);
409 Instruction *pReplace = ProcessIntrinsic(pCallInst);
410 SWR_ASSERT(pReplace);
411 toRemove.push_back(pCallInst);
412 pCallInst->replaceAllUsesWith(pReplace);
413 }
414 }
415 }
416 }
417 }
418
419 for (auto *pInst : toRemove)
420 {
421 pInst->eraseFromParent();
422 }
423
424 JitManager::DumpToFile(&F, "lowerx86");
425
426 return true;
427 }
428
429 virtual void getAnalysisUsage(AnalysisUsage &AU) const {}
430
431 JitManager *JM() { return B->JM(); }
432
433 Builder *B;
434
435 TargetArch mTarget;
436
437 static char ID; ///< Needed by LLVM to generate ID for FunctionPass.
438 };
439
440 char LowerX86::ID = 0; // LLVM uses address of ID as the actual ID.
441
442 FunctionPass *createLowerX86Pass(Builder *b) { return new LowerX86(b); }
443
444 Instruction *NO_EMU(LowerX86 *pThis, TargetArch arch, TargetWidth width, CallInst *pCallInst)
445 {
446 SWR_ASSERT(false, "Unimplemented intrinsic emulation.");
447 return nullptr;
448 }
449
450 Instruction *VPERM_EMU(LowerX86 *pThis, TargetArch arch, TargetWidth width, CallInst *pCallInst)
451 {
452 // Only need vperm emulation for AVX
453 SWR_ASSERT(arch == AVX);
454
455 Builder *B = pThis->B;
456 auto v32A = pCallInst->getArgOperand(0);
457 auto vi32Index = pCallInst->getArgOperand(1);
458
459 Value *v32Result;
460 if (isa<Constant>(vi32Index))
461 {
462             // Can use LLVM shufflevector directly with constant shuffle indices
463 v32Result = B->VSHUFFLE(v32A, v32A, vi32Index);
464 }
465 else
466 {
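            // Variable (non-constant) indices: emulate the permute one lane at a time by
            // extracting each index, fetching that element from the source, and inserting it
            // into the result.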
467 v32Result = UndefValue::get(v32A->getType());
468 for (uint32_t l = 0; l < v32A->getType()->getVectorNumElements(); ++l)
469 {
470 auto i32Index = B->VEXTRACT(vi32Index, B->C(l));
471 auto val = B->VEXTRACT(v32A, i32Index);
472 v32Result = B->VINSERT(v32Result, val, B->C(l));
473 }
474 }
475 return cast<Instruction>(v32Result);
476 }
477
478 Instruction *
479 VGATHER_EMU(LowerX86 *pThis, TargetArch arch, TargetWidth width, CallInst *pCallInst)
480 {
481 Builder *B = pThis->B;
482 auto vSrc = pCallInst->getArgOperand(0);
483 auto pBase = pCallInst->getArgOperand(1);
484 auto vi32Indices = pCallInst->getArgOperand(2);
485 auto vi1Mask = pCallInst->getArgOperand(3);
486 auto i8Scale = pCallInst->getArgOperand(4);
487
488 pBase = B->POINTER_CAST(pBase, PointerType::get(B->mInt8Ty, 0));
489 uint32_t numElem = vSrc->getType()->getVectorNumElements();
490 auto i32Scale = B->Z_EXT(i8Scale, B->mInt32Ty);
491 auto srcTy = vSrc->getType()->getVectorElementType();
492 Value * v32Gather;
493 if (arch == AVX)
494 {
495 // Full emulation for AVX
496 // Store source on stack to provide a valid address to load from inactive lanes
497 auto pStack = B->STACKSAVE();
498 auto pTmp = B->ALLOCA(vSrc->getType());
499 B->STORE(vSrc, pTmp);
500
501 v32Gather = UndefValue::get(vSrc->getType());
502 auto vi32Scale = ConstantVector::getSplat(numElem, cast<ConstantInt>(i32Scale));
503 auto vi32Offsets = B->MUL(vi32Indices, vi32Scale);
504
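            // Scalar per-lane gather: inactive lanes load from the stack copy of vSrc instead of
            // the computed address, so they keep their source value and avoid dereferencing a
            // potentially invalid address.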
505 for (uint32_t i = 0; i < numElem; ++i)
506 {
507 auto i32Offset = B->VEXTRACT(vi32Offsets, B->C(i));
508 auto pLoadAddress = B->GEP(pBase, i32Offset);
509 pLoadAddress = B->BITCAST(pLoadAddress, PointerType::get(srcTy, 0));
510 auto pMaskedLoadAddress = B->GEP(pTmp, {0, i});
511 auto i1Mask = B->VEXTRACT(vi1Mask, B->C(i));
512 auto pValidAddress = B->SELECT(i1Mask, pLoadAddress, pMaskedLoadAddress);
513 auto val = B->LOAD(pValidAddress);
514 v32Gather = B->VINSERT(v32Gather, val, B->C(i));
515 }
516
517 B->STACKRESTORE(pStack);
518 }
519 else if (arch == AVX2 || (arch == AVX512 && width == W256))
520 {
521 Function *pX86IntrinFunc;
522 if (srcTy == B->mFP32Ty)
523 {
524 pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
525 Intrinsic::x86_avx2_gather_d_ps_256);
526 }
527 else if (srcTy == B->mInt32Ty)
528 {
529 pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
530 Intrinsic::x86_avx2_gather_d_d_256);
531 }
532 else if (srcTy == B->mDoubleTy)
533 {
534 pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
535 Intrinsic::x86_avx2_gather_d_q_256);
536 }
537 else
538 {
539 SWR_ASSERT(false, "Unsupported vector element type for gather.");
540 }
541
542 if (width == W256)
543 {
544 auto v32Mask = B->BITCAST(pThis->VectorMask(vi1Mask), vSrc->getType());
545 v32Gather = B->CALL(pX86IntrinFunc, {vSrc, pBase, vi32Indices, v32Mask, i8Scale});
546 }
547 else if (width == W512)
548 {
549 // Double pump 4-wide for 64bit elements
550 if (vSrc->getType()->getVectorElementType() == B->mDoubleTy)
551 {
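                // The 4-wide AVX2 gather operates on 64-bit lanes, so the i1 mask is widened to
                // 64 bits per lane, and sources, indices and masks are split into two 4-wide
                // halves for the two gather calls.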
552 auto v64Mask = pThis->VectorMask(vi1Mask);
553 v64Mask = B->S_EXT(
554 v64Mask,
555 VectorType::get(B->mInt64Ty, v64Mask->getType()->getVectorNumElements()));
556 v64Mask = B->BITCAST(v64Mask, vSrc->getType());
557
558 Value *src0 = B->VSHUFFLE(vSrc, vSrc, B->C({0, 1, 2, 3}));
559 Value *src1 = B->VSHUFFLE(vSrc, vSrc, B->C({4, 5, 6, 7}));
560
561 Value *indices0 = B->VSHUFFLE(vi32Indices, vi32Indices, B->C({0, 1, 2, 3}));
562 Value *indices1 = B->VSHUFFLE(vi32Indices, vi32Indices, B->C({4, 5, 6, 7}));
563
564 Value *mask0 = B->VSHUFFLE(v64Mask, v64Mask, B->C({0, 1, 2, 3}));
565 Value *mask1 = B->VSHUFFLE(v64Mask, v64Mask, B->C({4, 5, 6, 7}));
566
567 src0 = B->BITCAST(
568 src0,
569 VectorType::get(B->mInt64Ty, src0->getType()->getVectorNumElements()));
570 mask0 = B->BITCAST(
571 mask0,
572 VectorType::get(B->mInt64Ty, mask0->getType()->getVectorNumElements()));
573 Value *gather0 =
574 B->CALL(pX86IntrinFunc, {src0, pBase, indices0, mask0, i8Scale});
575 src1 = B->BITCAST(
576 src1,
577 VectorType::get(B->mInt64Ty, src1->getType()->getVectorNumElements()));
578 mask1 = B->BITCAST(
579 mask1,
580 VectorType::get(B->mInt64Ty, mask1->getType()->getVectorNumElements()));
581 Value *gather1 =
582 B->CALL(pX86IntrinFunc, {src1, pBase, indices1, mask1, i8Scale});
583
584 v32Gather = B->VSHUFFLE(gather0, gather1, B->C({0, 1, 2, 3, 4, 5, 6, 7}));
585 v32Gather = B->BITCAST(v32Gather, vSrc->getType());
586 }
587 else
588 {
589 // Double pump 8-wide for 32bit elements
590 auto v32Mask = pThis->VectorMask(vi1Mask);
591 v32Mask = B->BITCAST(v32Mask, vSrc->getType());
592 Value *src0 = B->EXTRACT_16(vSrc, 0);
593 Value *src1 = B->EXTRACT_16(vSrc, 1);
594
595 Value *indices0 = B->EXTRACT_16(vi32Indices, 0);
596 Value *indices1 = B->EXTRACT_16(vi32Indices, 1);
597
598 Value *mask0 = B->EXTRACT_16(v32Mask, 0);
599 Value *mask1 = B->EXTRACT_16(v32Mask, 1);
600
601 Value *gather0 =
602 B->CALL(pX86IntrinFunc, {src0, pBase, indices0, mask0, i8Scale});
603 Value *gather1 =
604 B->CALL(pX86IntrinFunc, {src1, pBase, indices1, mask1, i8Scale});
605
606 v32Gather = B->JOIN_16(gather0, gather1);
607 }
608 }
609 }
610 else if (arch == AVX512)
611 {
612 Value * iMask;
613 Function *pX86IntrinFunc;
614 if (srcTy == B->mFP32Ty)
615 {
616 pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
617 Intrinsic::x86_avx512_gather_dps_512);
618 iMask = B->BITCAST(vi1Mask, B->mInt16Ty);
619 }
620 else if (srcTy == B->mInt32Ty)
621 {
622 pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
623 Intrinsic::x86_avx512_gather_dpi_512);
624 iMask = B->BITCAST(vi1Mask, B->mInt16Ty);
625 }
626 else if (srcTy == B->mDoubleTy)
627 {
628 pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
629 Intrinsic::x86_avx512_gather_dpd_512);
630 iMask = B->BITCAST(vi1Mask, B->mInt8Ty);
631 }
632 else
633 {
634 SWR_ASSERT(false, "Unsupported vector element type for gather.");
635 }
636
637 auto i32Scale = B->Z_EXT(i8Scale, B->mInt32Ty);
638 v32Gather = B->CALL(pX86IntrinFunc, {vSrc, pBase, vi32Indices, iMask, i32Scale});
639 }
640
641 return cast<Instruction>(v32Gather);
642 }
643
644     // No support for vroundps in AVX512 (it is available in KNCNI), so emulate with AVX
645     // instructions
646 Instruction *
647 VROUND_EMU(LowerX86 *pThis, TargetArch arch, TargetWidth width, CallInst *pCallInst)
648 {
649 SWR_ASSERT(arch == AVX512);
650
651 auto B = pThis->B;
652 auto vf32Src = pCallInst->getOperand(0);
653 auto i8Round = pCallInst->getOperand(1);
654 auto pfnFunc =
655 Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx_round_ps_256);
656
657 if (width == W256)
658 {
659 return cast<Instruction>(B->CALL2(pfnFunc, vf32Src, i8Round));
660 }
661 else if (width == W512)
662 {
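            // 512-wide: split the 16-element source into two 8-wide halves, round each with the
            // 256-bit AVX intrinsic, then join the halves back together.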
663 auto v8f32SrcLo = B->EXTRACT_16(vf32Src, 0);
664 auto v8f32SrcHi = B->EXTRACT_16(vf32Src, 1);
665
666 auto v8f32ResLo = B->CALL2(pfnFunc, v8f32SrcLo, i8Round);
667 auto v8f32ResHi = B->CALL2(pfnFunc, v8f32SrcHi, i8Round);
668
669 return cast<Instruction>(B->JOIN_16(v8f32ResLo, v8f32ResHi));
670 }
671 else
672 {
673 SWR_ASSERT(false, "Unimplemented vector width.");
674 }
675
676 return nullptr;
677 }
678
679 Instruction *VCONVERT_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
680 {
681 SWR_ASSERT(arch == AVX512);
682
683 auto B = pThis->B;
684 auto vf32Src = pCallInst->getOperand(0);
685
686 if (width == W256)
687 {
688 auto vf32SrcRound = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx_round_ps_256);
689 return cast<Instruction>(B->FP_TRUNC(vf32SrcRound, B->mFP32Ty));
690 }
691 else if (width == W512)
692 {
693 // 512 can use intrinsic
694 auto pfnFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx512_mask_cvtpd2ps_512);
695 return cast<Instruction>(B->CALL(pfnFunc, vf32Src));
696 }
697 else
698 {
699 SWR_ASSERT(false, "Unimplemented vector width.");
700 }
701
702 return nullptr;
703 }
704
705 // No support for hsub in AVX512
706 Instruction *VHSUB_EMU(LowerX86 *pThis, TargetArch arch, TargetWidth width, CallInst *pCallInst)
707 {
708 SWR_ASSERT(arch == AVX512);
709
710 auto B = pThis->B;
711 auto src0 = pCallInst->getOperand(0);
712 auto src1 = pCallInst->getOperand(1);
713
714 // 256b hsub can just use avx intrinsic
715 if (width == W256)
716 {
717 auto pX86IntrinFunc =
718 Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx_hsub_ps_256);
719 return cast<Instruction>(B->CALL2(pX86IntrinFunc, src0, src1));
720 }
721 else if (width == W512)
722 {
723 // 512b hsub can be accomplished with shuf/sub combo
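            // hsub subtracts each odd-indexed element from the even-indexed element beside it, so
            // one shuffle collects the even elements (minuend), the other the odd elements
            // (subtrahend), and a vector subtract finishes the job.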
724 auto minuend = B->VSHUFFLE(src0, src1, B->C({0, 2, 8, 10, 4, 6, 12, 14}));
725 auto subtrahend = B->VSHUFFLE(src0, src1, B->C({1, 3, 9, 11, 5, 7, 13, 15}));
726 return cast<Instruction>(B->SUB(minuend, subtrahend));
727 }
728 else
729 {
730 SWR_ASSERT(false, "Unimplemented vector width.");
731 return nullptr;
732 }
733 }
734
735     // Double pump the 256-wide intrinsic passed in via 'intrin'. This blindly extracts lower and upper 256 from
736     // each vector argument, calls the 256 wide intrinsic on each half, then merges the results to 512 wide
737 Instruction *DOUBLE_EMU(LowerX86 * pThis,
738 TargetArch arch,
739 TargetWidth width,
740 CallInst * pCallInst,
741 Intrinsic::ID intrin)
742 {
743 auto B = pThis->B;
744 SWR_ASSERT(width == W512);
745 Value * result[2];
746 Function *pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, intrin);
747 for (uint32_t i = 0; i < 2; ++i)
748 {
749 SmallVector<Value *, 8> args;
750 for (auto &arg : pCallInst->arg_operands())
751 {
752 auto argType = arg.get()->getType();
753 if (argType->isVectorTy())
754 {
755 uint32_t vecWidth = argType->getVectorNumElements();
756 Value * lanes = B->CInc<int>(i * vecWidth / 2, vecWidth / 2);
757 Value * argToPush = B->VSHUFFLE(
758 arg.get(), B->VUNDEF(argType->getVectorElementType(), vecWidth), lanes);
759 args.push_back(argToPush);
760 }
761 else
762 {
763 args.push_back(arg.get());
764 }
765 }
766 result[i] = B->CALLA(pX86IntrinFunc, args);
767 }
768 uint32_t vecWidth;
769 if (result[0]->getType()->isVectorTy())
770 {
771 assert(result[1]->getType()->isVectorTy());
772 vecWidth = result[0]->getType()->getVectorNumElements() +
773 result[1]->getType()->getVectorNumElements();
774 }
775 else
776 {
777 vecWidth = 2;
778 }
779 Value *lanes = B->CInc<int>(0, vecWidth);
780 return cast<Instruction>(B->VSHUFFLE(result[0], result[1], lanes));
781 }
782
783 } // namespace SwrJit
784
785 using namespace SwrJit;
786
787 INITIALIZE_PASS_BEGIN(LowerX86, "LowerX86", "LowerX86", false, false)
788 INITIALIZE_PASS_END(LowerX86, "LowerX86", "LowerX86", false, false)