1 /****************************************************************************
2 * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file lower_x86.cpp
24 *
25 * @brief llvm pass to lower meta code to x86
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30
31 #include "jit_pch.hpp"
32 #include "passes.h"
33 #include "JitManager.h"
34
35 #include <unordered_map>
36
37 namespace llvm
38 {
39 // forward declare the initializer
40 void initializeLowerX86Pass(PassRegistry &);
41 } // namespace llvm
42
43 namespace SwrJit
44 {
45 using namespace llvm;
46
47 enum TargetArch
48 {
49 AVX = 0,
50 AVX2 = 1,
51 AVX512 = 2
52 };
53
54 enum TargetWidth
55 {
56 W256 = 0,
57 W512 = 1,
58 NUM_WIDTHS = 2
59 };
60
61 struct LowerX86;
62
63 typedef std::function<Instruction *(LowerX86 *, TargetArch, TargetWidth, CallInst *)> EmuFunc;
64
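// Table entry: the native intrinsic ID for each TargetWidth (Intrinsic::not_intrinsic if none),
// plus an emulation callback used when no native intrinsic is available.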
65 struct X86Intrinsic
66 {
67 Intrinsic::ID intrin[NUM_WIDTHS];
68 EmuFunc emuFunc;
69 };
70
71 // Map of intrinsics that haven't been moved to the new mechanism yet. If used, these get the
72 // previous behavior of mapping directly to avx/avx2 intrinsics.
73 static std::map<std::string, Intrinsic::ID> intrinsicMap = {
74 {"meta.intrinsic.BEXTR_32", Intrinsic::x86_bmi_bextr_32},
75 {"meta.intrinsic.VPSHUFB", Intrinsic::x86_avx2_pshuf_b},
76 {"meta.intrinsic.VCVTPS2PH", Intrinsic::x86_vcvtps2ph_256},
77 {"meta.intrinsic.VPTESTC", Intrinsic::x86_avx_ptestc_256},
78 {"meta.intrinsic.VPTESTZ", Intrinsic::x86_avx_ptestz_256},
79 {"meta.intrinsic.VFMADDPS", Intrinsic::x86_fma_vfmadd_ps_256},
80 {"meta.intrinsic.VPHADDD", Intrinsic::x86_avx2_phadd_d},
81 {"meta.intrinsic.PDEP32", Intrinsic::x86_bmi_pdep_32},
82 {"meta.intrinsic.RDTSC", Intrinsic::x86_rdtsc},
83 };
84
85 // Forward decls
86 Instruction *NO_EMU(LowerX86 *pThis, TargetArch arch, TargetWidth width, CallInst *pCallInst);
87 Instruction *
88 VPERM_EMU(LowerX86 *pThis, TargetArch arch, TargetWidth width, CallInst *pCallInst);
89 Instruction *
90 VGATHER_EMU(LowerX86 *pThis, TargetArch arch, TargetWidth width, CallInst *pCallInst);
91 Instruction *
92 VROUND_EMU(LowerX86 *pThis, TargetArch arch, TargetWidth width, CallInst *pCallInst);
93 Instruction *
94 VHSUB_EMU(LowerX86 *pThis, TargetArch arch, TargetWidth width, CallInst *pCallInst);
95
96 Instruction *DOUBLE_EMU(LowerX86 * pThis,
97 TargetArch arch,
98 TargetWidth width,
99 CallInst * pCallInst,
100 Intrinsic::ID intrin);
101
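// Sentinel intrinsic ID: marks table entries that are lowered by double pumping the
// next-narrower native intrinsic (see DOUBLE_EMU).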
102 static Intrinsic::ID DOUBLE = (Intrinsic::ID)-1;
103
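// Per-architecture intrinsic tables, indexed by TargetArch: map a meta intrinsic name to its
// native intrinsics for each TargetWidth plus an emulation fallback.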
104 static std::map<std::string, X86Intrinsic> intrinsicMap2[] = {
105 // 256 wide 512 wide
106 {
107 // AVX
108 {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx_rcp_ps_256, DOUBLE}, NO_EMU}},
109 {"meta.intrinsic.VPERMPS",
110 {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
111 {"meta.intrinsic.VPERMD",
112 {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
113 {"meta.intrinsic.VGATHERPD",
114 {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
115 {"meta.intrinsic.VGATHERPS",
116 {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
117 {"meta.intrinsic.VGATHERDD",
118 {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
119 {"meta.intrinsic.VCVTPD2PS",
120 {{Intrinsic::x86_avx_cvt_pd2_ps_256, Intrinsic::not_intrinsic}, NO_EMU}},
121 {"meta.intrinsic.VCVTPH2PS",
122 {{Intrinsic::x86_vcvtph2ps_256, Intrinsic::not_intrinsic}, NO_EMU}},
123 {"meta.intrinsic.VROUND", {{Intrinsic::x86_avx_round_ps_256, DOUBLE}, NO_EMU}},
124 {"meta.intrinsic.VHSUBPS", {{Intrinsic::x86_avx_hsub_ps_256, DOUBLE}, NO_EMU}},
125 },
126 {
127 // AVX2
128 {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx_rcp_ps_256, DOUBLE}, NO_EMU}},
129 {"meta.intrinsic.VPERMPS",
130 {{Intrinsic::x86_avx2_permps, Intrinsic::not_intrinsic}, VPERM_EMU}},
131 {"meta.intrinsic.VPERMD",
132 {{Intrinsic::x86_avx2_permd, Intrinsic::not_intrinsic}, VPERM_EMU}},
133 {"meta.intrinsic.VGATHERPD",
134 {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
135 {"meta.intrinsic.VGATHERPS",
136 {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
137 {"meta.intrinsic.VGATHERDD",
138 {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
139 {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx_cvt_pd2_ps_256, DOUBLE}, NO_EMU}},
140 {"meta.intrinsic.VCVTPH2PS",
141 {{Intrinsic::x86_vcvtph2ps_256, Intrinsic::not_intrinsic}, NO_EMU}},
142 {"meta.intrinsic.VROUND", {{Intrinsic::x86_avx_round_ps_256, DOUBLE}, NO_EMU}},
143 {"meta.intrinsic.VHSUBPS", {{Intrinsic::x86_avx_hsub_ps_256, DOUBLE}, NO_EMU}},
144 },
145 {
146 // AVX512
147 {"meta.intrinsic.VRCPPS",
148 {{Intrinsic::x86_avx512_rcp14_ps_256, Intrinsic::x86_avx512_rcp14_ps_512}, NO_EMU}},
149 {"meta.intrinsic.VPERMPS",
150 {{Intrinsic::x86_avx512_mask_permvar_sf_256,
151 Intrinsic::x86_avx512_mask_permvar_sf_512},
152 NO_EMU}},
153 {"meta.intrinsic.VPERMD",
154 {{Intrinsic::x86_avx512_mask_permvar_si_256,
155 Intrinsic::x86_avx512_mask_permvar_si_512},
156 NO_EMU}},
157 {"meta.intrinsic.VGATHERPD",
158 {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
159 {"meta.intrinsic.VGATHERPS",
160 {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
161 {"meta.intrinsic.VGATHERDD",
162 {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
163 {"meta.intrinsic.VCVTPD2PS",
164 {{Intrinsic::x86_avx512_mask_cvtpd2ps_256, Intrinsic::x86_avx512_mask_cvtpd2ps_512},
165 NO_EMU}},
166 {"meta.intrinsic.VCVTPH2PS",
167 {{Intrinsic::x86_avx512_mask_vcvtph2ps_256, Intrinsic::x86_avx512_mask_vcvtph2ps_512},
168 NO_EMU}},
169 {"meta.intrinsic.VROUND",
170 {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VROUND_EMU}},
171 {"meta.intrinsic.VHSUBPS",
172 {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VHSUB_EMU}},
173 }};
174
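// Function pass that lowers meta.intrinsic.* calls to native x86 intrinsics or emulation
// sequences for the target architecture detected from the JitManager.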
175 struct LowerX86 : public FunctionPass
176 {
177 LowerX86(Builder *b = nullptr) : FunctionPass(ID), B(b)
178 {
179 initializeLowerX86Pass(*PassRegistry::getPassRegistry());
180
181 // Determine target arch
182 if (JM()->mArch.AVX512F())
183 {
184 mTarget = AVX512;
185 }
186 else if (JM()->mArch.AVX2())
187 {
188 mTarget = AVX2;
189 }
190 else if (JM()->mArch.AVX())
191 {
192 mTarget = AVX;
193 }
194 else
195 {
196 SWR_ASSERT(false, "Unsupported AVX architecture.");
197 mTarget = AVX;
198 }
199 }
200
201 // Try to decipher the vector type of the instruction. This does not work properly
202 // across all intrinsics, and will have to be rethought. Probably need something
203 // similar to llvm's getDeclaration() utility to map a set of inputs to a specific typed
204 // intrinsic.
205 void GetRequestedWidthAndType(CallInst * pCallInst,
206 const StringRef intrinName,
207 TargetWidth * pWidth,
208 Type ** pTy)
209 {
210 Type *pVecTy = pCallInst->getType();
211
212 // Check for intrinsic specific types
213 // VCVTPD2PS type comes from src, not dst
214 if (intrinName.equals("meta.intrinsic.VCVTPD2PS"))
215 {
216 pVecTy = pCallInst->getOperand(0)->getType();
217 }
218
219 if (!pVecTy->isVectorTy())
220 {
221 for (auto &op : pCallInst->arg_operands())
222 {
223 if (op.get()->getType()->isVectorTy())
224 {
225 pVecTy = op.get()->getType();
226 break;
227 }
228 }
229 }
230 SWR_ASSERT(pVecTy->isVectorTy(), "Couldn't determine vector size");
231
232 uint32_t width = cast<VectorType>(pVecTy)->getBitWidth();
233 switch (width)
234 {
235 case 256:
236 *pWidth = W256;
237 break;
238 case 512:
239 *pWidth = W512;
240 break;
241 default:
242 SWR_ASSERT(false, "Unhandled vector width %d", width);
243 *pWidth = W256;
244 }
245
246 *pTy = pVecTy->getScalarType();
247 }
248
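// Build a zero vector of the given width and element type; used as the pass-through src
// operand for masked AVX512 intrinsics.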
249 Value *GetZeroVec(TargetWidth width, Type *pTy)
250 {
251 uint32_t numElem = 0;
252 switch (width)
253 {
254 case W256:
255 numElem = 8;
256 break;
257 case W512:
258 numElem = 16;
259 break;
260 default:
261 SWR_ASSERT(false, "Unhandled vector width type %d\n", width);
262 }
263
264 return ConstantVector::getNullValue(VectorType::get(pTy, numElem));
265 }
266
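// Build an all-ones execution mask for the given width (i8 for 8 lanes, i16 for 16 lanes).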
267 Value *GetMask(TargetWidth width)
268 {
269 Value *mask;
270 switch (width)
271 {
272 case W256:
273 mask = B->C((uint8_t)-1);
274 break;
275 case W512:
276 mask = B->C((uint16_t)-1);
277 break;
278 default:
279 SWR_ASSERT(false, "Unhandled vector width type %d\n", width);
280 }
281 return mask;
282 }
283
284 // Convert <N x i1> mask to <N x i32> x86 mask
285 Value *VectorMask(Value *vi1Mask)
286 {
287 uint32_t numElem = vi1Mask->getType()->getVectorNumElements();
288 return B->S_EXT(vi1Mask, VectorType::get(B->mInt32Ty, numElem));
289 }
290
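// Lower a call using the per-architecture table: double pump when the entry is DOUBLE, call
// the native intrinsic when one exists (appending src and mask operands on AVX512), otherwise
// invoke the emulation function.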
291 Instruction *ProcessIntrinsicAdvanced(CallInst *pCallInst)
292 {
293 Function * pFunc = pCallInst->getCalledFunction();
294 auto & intrinsic = intrinsicMap2[mTarget][pFunc->getName()];
295 TargetWidth vecWidth;
296 Type * pElemTy;
297 GetRequestedWidthAndType(pCallInst, pFunc->getName(), &vecWidth, &pElemTy);
298
299 // Check if there is a native intrinsic for this instruction
300 Intrinsic::ID id = intrinsic.intrin[vecWidth];
301 if (id == DOUBLE)
302 {
303 // Double pump the next smaller SIMD intrinsic
304 SWR_ASSERT(vecWidth != 0, "Cannot double pump smallest SIMD width.");
305 Intrinsic::ID id2 = intrinsic.intrin[vecWidth - 1];
306 SWR_ASSERT(id2 != Intrinsic::not_intrinsic,
307 "Cannot find intrinsic to double pump.");
308 return DOUBLE_EMU(this, mTarget, vecWidth, pCallInst, id2);
309 }
310 else if (id != Intrinsic::not_intrinsic)
311 {
312 Function *pIntrin = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, id);
313 SmallVector<Value *, 8> args;
314 for (auto &arg : pCallInst->arg_operands())
315 {
316 args.push_back(arg.get());
317 }
318
319 // On AVX512, these intrinsics all take an extra src operand and mask. We pass in a zero src
320 // and a full mask for now, assuming the intrinsics are consistent and place the src operand
321 // and mask last in the argument list.
322 if (mTarget == AVX512)
323 {
324 if (pFunc->getName().equals("meta.intrinsic.VCVTPD2PS"))
325 {
326 args.push_back(GetZeroVec(W256, pCallInst->getType()->getScalarType()));
327 args.push_back(GetMask(W256));
328 // for AVX512 VCVTPD2PS, we also have to add rounding mode
329 args.push_back(B->C(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
330 }
331 else
332 {
333 args.push_back(GetZeroVec(vecWidth, pElemTy));
334 args.push_back(GetMask(vecWidth));
335 }
336 }
337
338 return B->CALLA(pIntrin, args);
339 }
340 else
341 {
342 // No native intrinsic, call emulation function
343 return intrinsic.emuFunc(this, mTarget, vecWidth, pCallInst);
344 }
345
346 SWR_ASSERT(false);
347 return nullptr;
348 }
349
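// Lower a meta intrinsic call: prefer the table-driven path above, otherwise fall back to the
// legacy direct mapping in intrinsicMap.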
350 Instruction *ProcessIntrinsic(CallInst *pCallInst)
351 {
352 Function *pFunc = pCallInst->getCalledFunction();
353
354 // Forward to the advanced support if found
355 if (intrinsicMap2[mTarget].find(pFunc->getName()) != intrinsicMap2[mTarget].end())
356 {
357 return ProcessIntrinsicAdvanced(pCallInst);
358 }
359
360 SWR_ASSERT(intrinsicMap.find(pFunc->getName()) != intrinsicMap.end(),
361 "Unimplemented intrinsic %s.",
362 pFunc->getName());
363
364 Intrinsic::ID x86Intrinsic = intrinsicMap[pFunc->getName()];
365 Function * pX86IntrinFunc =
366 Intrinsic::getDeclaration(B->JM()->mpCurrentModule, x86Intrinsic);
367
368 SmallVector<Value *, 8> args;
369 for (auto &arg : pCallInst->arg_operands())
370 {
371 args.push_back(arg.get());
372 }
373 return B->CALLA(pX86IntrinFunc, args);
374 }
375
376 //////////////////////////////////////////////////////////////////////////
377 /// @brief LLVM function pass run method.
378 /// @param F - The function we're working on with this pass.
379 virtual bool runOnFunction(Function &F)
380 {
381 std::vector<Instruction *> toRemove;
382
383 for (auto &BB : F.getBasicBlockList())
384 {
385 for (auto &I : BB.getInstList())
386 {
387 if (CallInst *pCallInst = dyn_cast<CallInst>(&I))
388 {
389 Function *pFunc = pCallInst->getCalledFunction();
390 if (pFunc)
391 {
392 if (pFunc->getName().startswith("meta.intrinsic"))
393 {
394 B->IRB()->SetInsertPoint(&I);
395 Instruction *pReplace = ProcessIntrinsic(pCallInst);
396 SWR_ASSERT(pReplace);
397 toRemove.push_back(pCallInst);
398 pCallInst->replaceAllUsesWith(pReplace);
399 }
400 }
401 }
402 }
403 }
404
405 for (auto *pInst : toRemove)
406 {
407 pInst->eraseFromParent();
408 }
409
410 JitManager::DumpToFile(&F, "lowerx86");
411
412 return true;
413 }
414
415 virtual void getAnalysisUsage(AnalysisUsage &AU) const {}
416
417 JitManager *JM() { return B->JM(); }
418
419 Builder *B;
420
421 TargetArch mTarget;
422
423 static char ID; ///< Needed by LLVM to generate ID for FunctionPass.
424 };
425
426 char LowerX86::ID = 0; // LLVM uses address of ID as the actual ID.
427
428 FunctionPass *createLowerX86Pass(Builder *b) { return new LowerX86(b); }
429
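// Emulation stub for entries that should always resolve to a native intrinsic.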
430 Instruction *NO_EMU(LowerX86 *pThis, TargetArch arch, TargetWidth width, CallInst *pCallInst)
431 {
432 SWR_ASSERT(false, "Unimplemented intrinsic emulation.");
433 return nullptr;
434 }
435
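// Emulate VPERMPS/VPERMD on AVX: use a shufflevector when the indices are constant, otherwise
// extract and insert element by element.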
436 Instruction *VPERM_EMU(LowerX86 *pThis, TargetArch arch, TargetWidth width, CallInst *pCallInst)
437 {
438 // Only need vperm emulation for AVX
439 SWR_ASSERT(arch == AVX);
440
441 Builder *B = pThis->B;
442 auto v32A = pCallInst->getArgOperand(0);
443 auto vi32Index = pCallInst->getArgOperand(1);
444
445 Value *v32Result;
446 if (isa<Constant>(vi32Index))
447 {
448 // Can use llvm shuffle vector directly with constant shuffle indices
449 v32Result = B->VSHUFFLE(v32A, v32A, vi32Index);
450 }
451 else
452 {
453 v32Result = UndefValue::get(v32A->getType());
454 for (uint32_t l = 0; l < v32A->getType()->getVectorNumElements(); ++l)
455 {
456 auto i32Index = B->VEXTRACT(vi32Index, B->C(l));
457 auto val = B->VEXTRACT(v32A, i32Index);
458 v32Result = B->VINSERT(v32Result, val, B->C(l));
459 }
460 }
461 return cast<Instruction>(v32Result);
462 }
463
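// Emulate a masked gather: element-wise loads on AVX; AVX2 dword-index gather intrinsics on
// AVX2 (double pumped at 512-bit width) and on AVX512 at 256-bit width; native 512-bit gather
// intrinsics otherwise on AVX512.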
464 Instruction *
465 VGATHER_EMU(LowerX86 *pThis, TargetArch arch, TargetWidth width, CallInst *pCallInst)
466 {
467 Builder *B = pThis->B;
468 auto vSrc = pCallInst->getArgOperand(0);
469 auto pBase = pCallInst->getArgOperand(1);
470 auto vi32Indices = pCallInst->getArgOperand(2);
471 auto vi1Mask = pCallInst->getArgOperand(3);
472 auto i8Scale = pCallInst->getArgOperand(4);
473
474 pBase = B->POINTER_CAST(pBase, PointerType::get(B->mInt8Ty, 0));
475 uint32_t numElem = vSrc->getType()->getVectorNumElements();
476 auto i32Scale = B->Z_EXT(i8Scale, B->mInt32Ty);
477 auto srcTy = vSrc->getType()->getVectorElementType();
478 Value * v32Gather;
479 if (arch == AVX)
480 {
481 // Full emulation for AVX
482 // Store the source on the stack to provide a valid address to load from for inactive lanes
483 auto pStack = B->STACKSAVE();
484 auto pTmp = B->ALLOCA(vSrc->getType());
485 B->STORE(vSrc, pTmp);
486
487 v32Gather = UndefValue::get(vSrc->getType());
488 auto vi32Scale = ConstantVector::getSplat(numElem, cast<ConstantInt>(i32Scale));
489 auto vi32Offsets = B->MUL(vi32Indices, vi32Scale);
490
491 for (uint32_t i = 0; i < numElem; ++i)
492 {
493 auto i32Offset = B->VEXTRACT(vi32Offsets, B->C(i));
494 auto pLoadAddress = B->GEP(pBase, i32Offset);
495 pLoadAddress = B->BITCAST(pLoadAddress, PointerType::get(srcTy, 0));
496 auto pMaskedLoadAddress = B->GEP(pTmp, {0, i});
497 auto i1Mask = B->VEXTRACT(vi1Mask, B->C(i));
498 auto pValidAddress = B->SELECT(i1Mask, pLoadAddress, pMaskedLoadAddress);
499 auto val = B->LOAD(pValidAddress);
500 v32Gather = B->VINSERT(v32Gather, val, B->C(i));
501 }
502
503 B->STACKRESTORE(pStack);
504 }
505 else if (arch == AVX2 || (arch == AVX512 && width == W256))
506 {
507 Function *pX86IntrinFunc;
508 if (srcTy == B->mFP32Ty)
509 {
510 pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
511 Intrinsic::x86_avx2_gather_d_ps_256);
512 }
513 else if (srcTy == B->mInt32Ty)
514 {
515 pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
516 Intrinsic::x86_avx2_gather_d_d_256);
517 }
518 else if (srcTy == B->mDoubleTy)
519 {
520 pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
521 Intrinsic::x86_avx2_gather_d_q_256);
522 }
523 else
524 {
525 SWR_ASSERT(false, "Unsupported vector element type for gather.");
526 }
527
528 if (width == W256)
529 {
530 auto v32Mask = B->BITCAST(pThis->VectorMask(vi1Mask), vSrc->getType());
531 v32Gather = B->CALL(pX86IntrinFunc, {vSrc, pBase, vi32Indices, v32Mask, i8Scale});
532 }
533 else if (width == W512)
534 {
535 // Double pump 4-wide for 64bit elements
536 if (vSrc->getType()->getVectorElementType() == B->mDoubleTy)
537 {
538 auto v64Mask = pThis->VectorMask(vi1Mask);
539 v64Mask = B->S_EXT(
540 v64Mask,
541 VectorType::get(B->mInt64Ty, v64Mask->getType()->getVectorNumElements()));
542 v64Mask = B->BITCAST(v64Mask, vSrc->getType());
543
544 Value *src0 = B->VSHUFFLE(vSrc, vSrc, B->C({0, 1, 2, 3}));
545 Value *src1 = B->VSHUFFLE(vSrc, vSrc, B->C({4, 5, 6, 7}));
546
547 Value *indices0 = B->VSHUFFLE(vi32Indices, vi32Indices, B->C({0, 1, 2, 3}));
548 Value *indices1 = B->VSHUFFLE(vi32Indices, vi32Indices, B->C({4, 5, 6, 7}));
549
550 Value *mask0 = B->VSHUFFLE(v64Mask, v64Mask, B->C({0, 1, 2, 3}));
551 Value *mask1 = B->VSHUFFLE(v64Mask, v64Mask, B->C({4, 5, 6, 7}));
552
553 src0 = B->BITCAST(
554 src0,
555 VectorType::get(B->mInt64Ty, src0->getType()->getVectorNumElements()));
556 mask0 = B->BITCAST(
557 mask0,
558 VectorType::get(B->mInt64Ty, mask0->getType()->getVectorNumElements()));
559 Value *gather0 =
560 B->CALL(pX86IntrinFunc, {src0, pBase, indices0, mask0, i8Scale});
561 src1 = B->BITCAST(
562 src1,
563 VectorType::get(B->mInt64Ty, src1->getType()->getVectorNumElements()));
564 mask1 = B->BITCAST(
565 mask1,
566 VectorType::get(B->mInt64Ty, mask1->getType()->getVectorNumElements()));
567 Value *gather1 =
568 B->CALL(pX86IntrinFunc, {src1, pBase, indices1, mask1, i8Scale});
569
570 v32Gather = B->VSHUFFLE(gather0, gather1, B->C({0, 1, 2, 3, 4, 5, 6, 7}));
571 v32Gather = B->BITCAST(v32Gather, vSrc->getType());
572 }
573 else
574 {
575 // Double pump 8-wide for 32bit elements
576 auto v32Mask = pThis->VectorMask(vi1Mask);
577 v32Mask = B->BITCAST(v32Mask, vSrc->getType());
578 Value *src0 = B->EXTRACT_16(vSrc, 0);
579 Value *src1 = B->EXTRACT_16(vSrc, 1);
580
581 Value *indices0 = B->EXTRACT_16(vi32Indices, 0);
582 Value *indices1 = B->EXTRACT_16(vi32Indices, 1);
583
584 Value *mask0 = B->EXTRACT_16(v32Mask, 0);
585 Value *mask1 = B->EXTRACT_16(v32Mask, 1);
586
587 Value *gather0 =
588 B->CALL(pX86IntrinFunc, {src0, pBase, indices0, mask0, i8Scale});
589 Value *gather1 =
590 B->CALL(pX86IntrinFunc, {src1, pBase, indices1, mask1, i8Scale});
591
592 v32Gather = B->JOIN_16(gather0, gather1);
593 }
594 }
595 }
596 else if (arch == AVX512)
597 {
598 Value * iMask;
599 Function *pX86IntrinFunc;
600 if (srcTy == B->mFP32Ty)
601 {
602 pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
603 Intrinsic::x86_avx512_gather_dps_512);
604 iMask = B->BITCAST(vi1Mask, B->mInt16Ty);
605 }
606 else if (srcTy == B->mInt32Ty)
607 {
608 pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
609 Intrinsic::x86_avx512_gather_dpi_512);
610 iMask = B->BITCAST(vi1Mask, B->mInt16Ty);
611 }
612 else if (srcTy == B->mDoubleTy)
613 {
614 pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
615 Intrinsic::x86_avx512_gather_dpd_512);
616 iMask = B->BITCAST(vi1Mask, B->mInt8Ty);
617 }
618 else
619 {
620 SWR_ASSERT(false, "Unsupported vector element type for gather.");
621 }
622
623 auto i32Scale = B->Z_EXT(i8Scale, B->mInt32Ty);
624 v32Gather = B->CALL(pX86IntrinFunc, {vSrc, pBase, vi32Indices, iMask, i32Scale});
625 }
626
627 return cast<Instruction>(v32Gather);
628 }
629
630 // No support for vroundps in avx512 (it is available in kncni), so emulate with avx
631 // instructions
632 Instruction *
633 VROUND_EMU(LowerX86 *pThis, TargetArch arch, TargetWidth width, CallInst *pCallInst)
634 {
635 SWR_ASSERT(arch == AVX512);
636
637 auto B = pThis->B;
638 auto vf32Src = pCallInst->getOperand(0);
639 auto i8Round = pCallInst->getOperand(1);
640 auto pfnFunc =
641 Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx_round_ps_256);
642
643 if (width == W256)
644 {
645 return cast<Instruction>(B->CALL2(pfnFunc, vf32Src, i8Round));
646 }
647 else if (width == W512)
648 {
649 auto v8f32SrcLo = B->EXTRACT_16(vf32Src, 0);
650 auto v8f32SrcHi = B->EXTRACT_16(vf32Src, 1);
651
652 auto v8f32ResLo = B->CALL2(pfnFunc, v8f32SrcLo, i8Round);
653 auto v8f32ResHi = B->CALL2(pfnFunc, v8f32SrcHi, i8Round);
654
655 return cast<Instruction>(B->JOIN_16(v8f32ResLo, v8f32ResHi));
656 }
657 else
658 {
659 SWR_ASSERT(false, "Unimplemented vector width.");
660 }
661
662 return nullptr;
663 }
664
665 // No support for hsub in AVX512
666 Instruction *VHSUB_EMU(LowerX86 *pThis, TargetArch arch, TargetWidth width, CallInst *pCallInst)
667 {
668 SWR_ASSERT(arch == AVX512);
669
670 auto B = pThis->B;
671 auto src0 = pCallInst->getOperand(0);
672 auto src1 = pCallInst->getOperand(1);
673
674 // 256b hsub can just use avx intrinsic
675 if (width == W256)
676 {
677 auto pX86IntrinFunc =
678 Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx_hsub_ps_256);
679 return cast<Instruction>(B->CALL2(pX86IntrinFunc, src0, src1));
680 }
681 else if (width == W512)
682 {
683 // 512b hsub can be accomplished with shuf/sub combo
684 auto minuend = B->VSHUFFLE(src0, src1, B->C({0, 2, 8, 10, 4, 6, 12, 14}));
685 auto subtrahend = B->VSHUFFLE(src0, src1, B->C({1, 3, 9, 11, 5, 7, 13, 15}));
686 return cast<Instruction>(B->SUB(minuend, subtrahend));
687 }
688 else
689 {
690 SWR_ASSERT(false, "Unimplemented vector width.");
691 return nullptr;
692 }
693 }
694
695 // Double pump using the 256-wide intrinsic 'intrin': blindly extract the lower and upper 256 bits
696 // of each vector argument, call the 256-wide intrinsic on each half, then merge the results to 512 wide.
697 Instruction *DOUBLE_EMU(LowerX86 * pThis,
698 TargetArch arch,
699 TargetWidth width,
700 CallInst * pCallInst,
701 Intrinsic::ID intrin)
702 {
703 auto B = pThis->B;
704 SWR_ASSERT(width == W512);
705 Value * result[2];
706 Function *pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, intrin);
707 for (uint32_t i = 0; i < 2; ++i)
708 {
709 SmallVector<Value *, 8> args;
710 for (auto &arg : pCallInst->arg_operands())
711 {
712 auto argType = arg.get()->getType();
713 if (argType->isVectorTy())
714 {
715 uint32_t vecWidth = argType->getVectorNumElements();
716 Value * lanes = B->CInc<int>(i * vecWidth / 2, vecWidth / 2);
717 Value * argToPush = B->VSHUFFLE(
718 arg.get(), B->VUNDEF(argType->getVectorElementType(), vecWidth), lanes);
719 args.push_back(argToPush);
720 }
721 else
722 {
723 args.push_back(arg.get());
724 }
725 }
726 result[i] = B->CALLA(pX86IntrinFunc, args);
727 }
728 uint32_t vecWidth;
729 if (result[0]->getType()->isVectorTy())
730 {
731 assert(result[1]->getType()->isVectorTy());
732 vecWidth = result[0]->getType()->getVectorNumElements() +
733 result[1]->getType()->getVectorNumElements();
734 }
735 else
736 {
737 vecWidth = 2;
738 }
739 Value *lanes = B->CInc<int>(0, vecWidth);
740 return cast<Instruction>(B->VSHUFFLE(result[0], result[1], lanes));
741 }
742
743 } // namespace SwrJit
744
745 using namespace SwrJit;
746
747 INITIALIZE_PASS_BEGIN(LowerX86, "LowerX86", "LowerX86", false, false)
748 INITIALIZE_PASS_END(LowerX86, "LowerX86", "LowerX86", false, false)