gallium/swr: Fix vcvtph2ps llvm intrinsic compile error
[mesa.git] src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file lower_x86.cpp
24 *
25 * @brief llvm pass to lower meta code to x86
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30
31 #include "jit_pch.hpp"
32 #include "passes.h"
33 #include "JitManager.h"
34
35 #include "common/simdlib.hpp"
36
37 #include <unordered_map>
38
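// C fallback used by the JIT'd code for scatter stores on targets without a native
// scatter instruction (pre-AVX512). It is registered with the JIT's symbol table in
// the LowerX86 constructor and called from the code emitted by VSCATTER_EMU.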
39 extern "C" void ScatterPS_256(uint8_t*, SIMD256::Integer, SIMD256::Float, uint8_t, uint32_t);
40
41 namespace llvm
42 {
43     // forward declare the initializer
44 void initializeLowerX86Pass(PassRegistry&);
45 } // namespace llvm
46
47 namespace SwrJit
48 {
49 using namespace llvm;
50
51 #if LLVM_VERSION_MAJOR > 10
52 typedef unsigned IntrinsicID;
53 #else
54 typedef Intrinsic::ID IntrinsicID;
55 #endif
56
57 enum TargetArch
58 {
59 AVX = 0,
60 AVX2 = 1,
61 AVX512 = 2
62 };
63
64 enum TargetWidth
65 {
66 W256 = 0,
67 W512 = 1,
68 NUM_WIDTHS = 2
69 };
70
71 struct LowerX86;
72
73 typedef std::function<Instruction*(LowerX86*, TargetArch, TargetWidth, CallInst*)> EmuFunc;
74
75 struct X86Intrinsic
76 {
77 IntrinsicID intrin[NUM_WIDTHS];
78 EmuFunc emuFunc;
79 };
80
81 // Map of intrinsics that haven't been moved to the new mechanism yet. If used, these get the
82 // previous behavior of mapping directly to avx/avx2 intrinsics.
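// These entries are handled by ProcessIntrinsic(), which simply replaces the meta call
// with a call to the listed LLVM intrinsic using the original arguments unchanged,
// e.g. meta.intrinsic.VPSHUFB -> llvm.x86.avx2.pshuf.b.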
83 static std::map<std::string, IntrinsicID> intrinsicMap = {
84 {"meta.intrinsic.BEXTR_32", Intrinsic::x86_bmi_bextr_32},
85 {"meta.intrinsic.VPSHUFB", Intrinsic::x86_avx2_pshuf_b},
86 {"meta.intrinsic.VCVTPS2PH", Intrinsic::x86_vcvtps2ph_256},
87 {"meta.intrinsic.VPTESTC", Intrinsic::x86_avx_ptestc_256},
88 {"meta.intrinsic.VPTESTZ", Intrinsic::x86_avx_ptestz_256},
89 {"meta.intrinsic.VPHADDD", Intrinsic::x86_avx2_phadd_d},
90 {"meta.intrinsic.PDEP32", Intrinsic::x86_bmi_pdep_32},
91 {"meta.intrinsic.RDTSC", Intrinsic::x86_rdtsc},
92 };
93
94 // Forward decls
95 Instruction* NO_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
96 Instruction*
97 VPERM_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
98 Instruction*
99 VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
100 Instruction*
101 VSCATTER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
102 Instruction*
103 VROUND_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
104 Instruction*
105 VHSUB_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
106 Instruction*
107 VCONVERT_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
108
109 Instruction* DOUBLE_EMU(LowerX86* pThis,
110 TargetArch arch,
111 TargetWidth width,
112 CallInst* pCallInst,
113 Intrinsic::ID intrin);
114
115 static Intrinsic::ID DOUBLE = (Intrinsic::ID)-1;
116
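// Per-arch intrinsic table, indexed by TargetArch. Each X86Intrinsic entry holds the
// native intrinsic to use at 256 and 512 bit widths plus an emulation callback; a slot
// of Intrinsic::not_intrinsic means "call the emulation function", and the DOUBLE
// sentinel above means "double pump the next-smaller width" (see DOUBLE_EMU). For
// example, on AVX a 512 wide meta.intrinsic.VRCPPS resolves to DOUBLE and is emitted
// as two x86_avx_rcp_ps_256 calls.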
117 // clang-format off
118 static std::map<std::string, X86Intrinsic> intrinsicMap2[] = {
119 // 256 wide 512 wide
120 {
121 // AVX
122 {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx_rcp_ps_256, DOUBLE}, NO_EMU}},
123 {"meta.intrinsic.VPERMPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
124 {"meta.intrinsic.VPERMD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
125 {"meta.intrinsic.VGATHERPD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
126 {"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
127 {"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
128 {"meta.intrinsic.VSCATTERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VSCATTER_EMU}},
129 {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx_cvt_pd2_ps_256, Intrinsic::not_intrinsic}, NO_EMU}},
130 {"meta.intrinsic.VROUND", {{Intrinsic::x86_avx_round_ps_256, DOUBLE}, NO_EMU}},
131 {"meta.intrinsic.VHSUBPS", {{Intrinsic::x86_avx_hsub_ps_256, DOUBLE}, NO_EMU}},
132 },
133 {
134 // AVX2
135 {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx_rcp_ps_256, DOUBLE}, NO_EMU}},
136 {"meta.intrinsic.VPERMPS", {{Intrinsic::x86_avx2_permps, Intrinsic::not_intrinsic}, VPERM_EMU}},
137 {"meta.intrinsic.VPERMD", {{Intrinsic::x86_avx2_permd, Intrinsic::not_intrinsic}, VPERM_EMU}},
138 {"meta.intrinsic.VGATHERPD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
139 {"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
140 {"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
141 {"meta.intrinsic.VSCATTERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VSCATTER_EMU}},
142 {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx_cvt_pd2_ps_256, DOUBLE}, NO_EMU}},
143 {"meta.intrinsic.VROUND", {{Intrinsic::x86_avx_round_ps_256, DOUBLE}, NO_EMU}},
144 {"meta.intrinsic.VHSUBPS", {{Intrinsic::x86_avx_hsub_ps_256, DOUBLE}, NO_EMU}},
145 },
146 {
147 // AVX512
148 {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx512_rcp14_ps_256, Intrinsic::x86_avx512_rcp14_ps_512}, NO_EMU}},
149 #if LLVM_VERSION_MAJOR < 7
150 {"meta.intrinsic.VPERMPS", {{Intrinsic::x86_avx512_mask_permvar_sf_256, Intrinsic::x86_avx512_mask_permvar_sf_512}, NO_EMU}},
151 {"meta.intrinsic.VPERMD", {{Intrinsic::x86_avx512_mask_permvar_si_256, Intrinsic::x86_avx512_mask_permvar_si_512}, NO_EMU}},
152 #else
153 {"meta.intrinsic.VPERMPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
154 {"meta.intrinsic.VPERMD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
155 #endif
156 {"meta.intrinsic.VGATHERPD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
157 {"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
158 {"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
159 {"meta.intrinsic.VSCATTERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VSCATTER_EMU}},
160 #if LLVM_VERSION_MAJOR < 7
161 {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx512_mask_cvtpd2ps_256, Intrinsic::x86_avx512_mask_cvtpd2ps_512}, NO_EMU}},
162 #else
163 {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VCONVERT_EMU}},
164 #endif
165 {"meta.intrinsic.VROUND", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VROUND_EMU}},
166 {"meta.intrinsic.VHSUBPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VHSUB_EMU}},
167 }};
168 // clang-format on
169
170 struct LowerX86 : public FunctionPass
171 {
172 LowerX86(Builder* b = nullptr) : FunctionPass(ID), B(b)
173 {
174 initializeLowerX86Pass(*PassRegistry::getPassRegistry());
175
176 // Determine target arch
177 if (JM()->mArch.AVX512F())
178 {
179 mTarget = AVX512;
180 }
181 else if (JM()->mArch.AVX2())
182 {
183 mTarget = AVX2;
184 }
185 else if (JM()->mArch.AVX())
186 {
187 mTarget = AVX;
188 }
189 else
190 {
191 SWR_ASSERT(false, "Unsupported AVX architecture.");
192 mTarget = AVX;
193 }
194
195 // Setup scatter function for 256 wide
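// (ScatterPS_256 is a plain C helper; conceptually, for each lane i whose mask bit is
// set it performs *(float*)(pBase + vIndices[i] * scale) = vSrc[i]. This is only a
// sketch of its behavior - the actual implementation lives outside this pass.)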
196 uint32_t curWidth = B->mVWidth;
197 B->SetTargetWidth(8);
198 std::vector<Type*> args = {
199 B->mInt8PtrTy, // pBase
200 B->mSimdInt32Ty, // vIndices
201 B->mSimdFP32Ty, // vSrc
202 B->mInt8Ty, // mask
203 B->mInt32Ty // scale
204 };
205
206 FunctionType* pfnScatterTy = FunctionType::get(B->mVoidTy, args, false);
207 mPfnScatter256 = cast<Function>(
208 #if LLVM_VERSION_MAJOR >= 9
209 B->JM()->mpCurrentModule->getOrInsertFunction("ScatterPS_256", pfnScatterTy).getCallee());
210 #else
211 B->JM()->mpCurrentModule->getOrInsertFunction("ScatterPS_256", pfnScatterTy));
212 #endif
213 if (sys::DynamicLibrary::SearchForAddressOfSymbol("ScatterPS_256") == nullptr)
214 {
215 sys::DynamicLibrary::AddSymbol("ScatterPS_256", (void*)&ScatterPS_256);
216 }
217
218 B->SetTargetWidth(curWidth);
219 }
220
221 // Try to decipher the vector type of the instruction. This does not work properly
222 // across all intrinsics, and will have to be rethought. Probably need something
223 // similar to llvm's getDeclaration() utility to map a set of inputs to a specific typed
224 // intrinsic.
225 void GetRequestedWidthAndType(CallInst* pCallInst,
226 const StringRef intrinName,
227 TargetWidth* pWidth,
228 Type** pTy)
229 {
230 assert(pCallInst);
231 Type* pVecTy = pCallInst->getType();
232
233 // Check for intrinsic specific types
234 // VCVTPD2PS type comes from src, not dst
235 if (intrinName.equals("meta.intrinsic.VCVTPD2PS"))
236 {
237 Value* pOp = pCallInst->getOperand(0);
238 assert(pOp);
239 pVecTy = pOp->getType();
240 }
241
242 if (!pVecTy->isVectorTy())
243 {
244 for (auto& op : pCallInst->arg_operands())
245 {
246 if (op.get()->getType()->isVectorTy())
247 {
248 pVecTy = op.get()->getType();
249 break;
250 }
251 }
252 }
253 SWR_ASSERT(pVecTy->isVectorTy(), "Couldn't determine vector size");
254
255 uint32_t width = cast<VectorType>(pVecTy)->getBitWidth();
256 switch (width)
257 {
258 case 256:
259 *pWidth = W256;
260 break;
261 case 512:
262 *pWidth = W512;
263 break;
264 default:
265 SWR_ASSERT(false, "Unhandled vector width %d", width);
266 *pWidth = W256;
267 }
268
269 *pTy = pVecTy->getScalarType();
270 }
271
272 Value* GetZeroVec(TargetWidth width, Type* pTy)
273 {
274 uint32_t numElem = 0;
275 switch (width)
276 {
277 case W256:
278 numElem = 8;
279 break;
280 case W512:
281 numElem = 16;
282 break;
283 default:
284 SWR_ASSERT(false, "Unhandled vector width type %d\n", width);
285 }
286
287 return ConstantVector::getNullValue(VectorType::get(pTy, numElem));
288 }
289
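// Full execution mask for the AVX512 masked intrinsics: one bit per lane, so an
// all-ones i8 covers the 8 lanes of a 256 bit vector and an all-ones i16 covers the
// 16 lanes of a 512 bit vector.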
290 Value* GetMask(TargetWidth width)
291 {
292 Value* mask;
293 switch (width)
294 {
295 case W256:
296 mask = B->C((uint8_t)-1);
297 break;
298 case W512:
299 mask = B->C((uint16_t)-1);
300 break;
301 default:
302 SWR_ASSERT(false, "Unhandled vector width type %d\n", width);
303 }
304 return mask;
305 }
306
307 // Convert <N x i1> mask to <N x i32> x86 mask
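// Sign-extending each i1 produces 0 or ~0 in the corresponding 32 bit lane, which is
// the form the AVX/AVX2 gather intrinsics expect (they test the element's sign bit).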
308 Value* VectorMask(Value* vi1Mask)
309 {
310 uint32_t numElem = vi1Mask->getType()->getVectorNumElements();
311 return B->S_EXT(vi1Mask, VectorType::get(B->mInt32Ty, numElem));
312 }
313
314 Instruction* ProcessIntrinsicAdvanced(CallInst* pCallInst)
315 {
316 Function* pFunc = pCallInst->getCalledFunction();
317 assert(pFunc);
318
319 auto& intrinsic = intrinsicMap2[mTarget][pFunc->getName().str()];
320 TargetWidth vecWidth;
321 Type* pElemTy;
322 GetRequestedWidthAndType(pCallInst, pFunc->getName(), &vecWidth, &pElemTy);
323
324 // Check if there is a native intrinsic for this instruction
325 IntrinsicID id = intrinsic.intrin[vecWidth];
326 if (id == DOUBLE)
327 {
328 // Double pump the next smaller SIMD intrinsic
329 SWR_ASSERT(vecWidth != 0, "Cannot double pump smallest SIMD width.");
330 Intrinsic::ID id2 = intrinsic.intrin[vecWidth - 1];
331 SWR_ASSERT(id2 != Intrinsic::not_intrinsic,
332 "Cannot find intrinsic to double pump.");
333 return DOUBLE_EMU(this, mTarget, vecWidth, pCallInst, id2);
334 }
335 else if (id != Intrinsic::not_intrinsic)
336 {
337 Function* pIntrin = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, id);
338 SmallVector<Value*, 8> args;
339 for (auto& arg : pCallInst->arg_operands())
340 {
341 args.push_back(arg.get());
342 }
343
344                 // If AVX512, all instructions add a src operand and mask. We'll pass in a 0 src and
345                 // a full mask for now, assuming the intrinsics are consistent and place the src
346                 // operand and mask last in the argument list.
347 if (mTarget == AVX512)
348 {
349 if (pFunc->getName().equals("meta.intrinsic.VCVTPD2PS"))
350 {
351 args.push_back(GetZeroVec(W256, pCallInst->getType()->getScalarType()));
352 args.push_back(GetMask(W256));
353 // for AVX512 VCVTPD2PS, we also have to add rounding mode
354 args.push_back(B->C(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
355 }
356 else
357 {
358 args.push_back(GetZeroVec(vecWidth, pElemTy));
359 args.push_back(GetMask(vecWidth));
360 }
361 }
362
363 return B->CALLA(pIntrin, args);
364 }
365 else
366 {
367 // No native intrinsic, call emulation function
368 return intrinsic.emuFunc(this, mTarget, vecWidth, pCallInst);
369 }
370
371 SWR_ASSERT(false);
372 return nullptr;
373 }
374
375 Instruction* ProcessIntrinsic(CallInst* pCallInst)
376 {
377 Function* pFunc = pCallInst->getCalledFunction();
378 assert(pFunc);
379
380 // Forward to the advanced support if found
381 if (intrinsicMap2[mTarget].find(pFunc->getName().str()) != intrinsicMap2[mTarget].end())
382 {
383 return ProcessIntrinsicAdvanced(pCallInst);
384 }
385
386 SWR_ASSERT(intrinsicMap.find(pFunc->getName().str()) != intrinsicMap.end(),
387 "Unimplemented intrinsic %s.",
388 pFunc->getName().str().c_str());
389
390 Intrinsic::ID x86Intrinsic = intrinsicMap[pFunc->getName().str()];
391 Function* pX86IntrinFunc =
392 Intrinsic::getDeclaration(B->JM()->mpCurrentModule, x86Intrinsic);
393
394 SmallVector<Value*, 8> args;
395 for (auto& arg : pCallInst->arg_operands())
396 {
397 args.push_back(arg.get());
398 }
399 return B->CALLA(pX86IntrinFunc, args);
400 }
401
402 //////////////////////////////////////////////////////////////////////////
403         /// @brief LLVM function pass run method.
404         /// @param F - The function we're working on with this pass.
405 virtual bool runOnFunction(Function& F)
406 {
407 std::vector<Instruction*> toRemove;
408 std::vector<BasicBlock*> bbs;
409
410 // Make temp copy of the basic blocks and instructions, as the intrinsic
411 // replacement code might invalidate the iterators
412 for (auto& b : F.getBasicBlockList())
413 {
414 bbs.push_back(&b);
415 }
416
417 for (auto* BB : bbs)
418 {
419 std::vector<Instruction*> insts;
420 for (auto& i : BB->getInstList())
421 {
422 insts.push_back(&i);
423 }
424
425 for (auto* I : insts)
426 {
427 if (CallInst* pCallInst = dyn_cast<CallInst>(I))
428 {
429 Function* pFunc = pCallInst->getCalledFunction();
430 if (pFunc)
431 {
432 if (pFunc->getName().startswith("meta.intrinsic"))
433 {
434 B->IRB()->SetInsertPoint(I);
435 Instruction* pReplace = ProcessIntrinsic(pCallInst);
436 toRemove.push_back(pCallInst);
437 if (pReplace)
438 {
439 pCallInst->replaceAllUsesWith(pReplace);
440 }
441 }
442 }
443 }
444 }
445 }
446
447 for (auto* pInst : toRemove)
448 {
449 pInst->eraseFromParent();
450 }
451
452 JitManager::DumpToFile(&F, "lowerx86");
453
454 return true;
455 }
456
457 virtual void getAnalysisUsage(AnalysisUsage& AU) const {}
458
459 JitManager* JM() { return B->JM(); }
460 Builder* B;
461 TargetArch mTarget;
462 Function* mPfnScatter256;
463
464 static char ID; ///< Needed by LLVM to generate ID for FunctionPass.
465 };
466
467 char LowerX86::ID = 0; // LLVM uses address of ID as the actual ID.
468
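// The pass is created through the factory below and run like any other LLVM function
// pass. A minimal sketch of how a caller might drive it (pJitMgr, pBuilder and pFunc
// stand in for the caller's objects; the real call sites live elsewhere in the jitter):
//   llvm::legacy::FunctionPassManager passes(pJitMgr->mpCurrentModule);
//   passes.add(createLowerX86Pass(pBuilder));
//   passes.run(*pFunc);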
469 FunctionPass* createLowerX86Pass(Builder* b) { return new LowerX86(b); }
470
471 Instruction* NO_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
472 {
473 SWR_ASSERT(false, "Unimplemented intrinsic emulation.");
474 return nullptr;
475 }
476
477 Instruction* VPERM_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
478 {
479 // Only need vperm emulation for AVX
480 SWR_ASSERT(arch == AVX);
481
482 Builder* B = pThis->B;
483 auto v32A = pCallInst->getArgOperand(0);
484 auto vi32Index = pCallInst->getArgOperand(1);
485
486 Value* v32Result;
487 if (isa<Constant>(vi32Index))
488 {
489 // Can use llvm shuffle vector directly with constant shuffle indices
490 v32Result = B->VSHUFFLE(v32A, v32A, vi32Index);
491 }
492 else
493 {
494 v32Result = UndefValue::get(v32A->getType());
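            // Variable indices: emulate lane by lane, v32Result[l] = v32A[vi32Index[l]]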
495 for (uint32_t l = 0; l < v32A->getType()->getVectorNumElements(); ++l)
496 {
497 auto i32Index = B->VEXTRACT(vi32Index, B->C(l));
498 auto val = B->VEXTRACT(v32A, i32Index);
499 v32Result = B->VINSERT(v32Result, val, B->C(l));
500 }
501 }
502 return cast<Instruction>(v32Result);
503 }
504
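    // Gather emulation. Conceptually each lane computes
    //   dst[i] = mask[i] ? *(T*)(pBase + vi32Indices[i] * scale) : vSrc[i]
    // AVX has no gather instruction, so the loads are emitted one lane at a time through
    // a stack copy of vSrc; AVX2 (and AVX512 at 256 bits) uses the AVX2 gather
    // intrinsics, double pumping them for 512 wide operations; AVX512 at 512 bits uses
    // the native AVX512 gathers with a compressed k-mask.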
505 Instruction*
506 VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
507 {
508 Builder* B = pThis->B;
509 auto vSrc = pCallInst->getArgOperand(0);
510 auto pBase = pCallInst->getArgOperand(1);
511 auto vi32Indices = pCallInst->getArgOperand(2);
512 auto vi1Mask = pCallInst->getArgOperand(3);
513 auto i8Scale = pCallInst->getArgOperand(4);
514
515 pBase = B->POINTER_CAST(pBase, PointerType::get(B->mInt8Ty, 0));
516 uint32_t numElem = vSrc->getType()->getVectorNumElements();
517 auto i32Scale = B->Z_EXT(i8Scale, B->mInt32Ty);
518 auto srcTy = vSrc->getType()->getVectorElementType();
519 Value* v32Gather = nullptr;
520 if (arch == AVX)
521 {
522 // Full emulation for AVX
523             // Store the source on the stack to provide a valid address for inactive lanes to load from
524 auto pStack = B->STACKSAVE();
525 auto pTmp = B->ALLOCA(vSrc->getType());
526 B->STORE(vSrc, pTmp);
527
528 v32Gather = UndefValue::get(vSrc->getType());
529 auto vi32Scale = ConstantVector::getSplat(numElem, cast<ConstantInt>(i32Scale));
530 auto vi32Offsets = B->MUL(vi32Indices, vi32Scale);
531
532 for (uint32_t i = 0; i < numElem; ++i)
533 {
534 auto i32Offset = B->VEXTRACT(vi32Offsets, B->C(i));
535 auto pLoadAddress = B->GEP(pBase, i32Offset);
536 pLoadAddress = B->BITCAST(pLoadAddress, PointerType::get(srcTy, 0));
537 auto pMaskedLoadAddress = B->GEP(pTmp, {0, i});
538 auto i1Mask = B->VEXTRACT(vi1Mask, B->C(i));
539 auto pValidAddress = B->SELECT(i1Mask, pLoadAddress, pMaskedLoadAddress);
540 auto val = B->LOAD(pValidAddress);
541 v32Gather = B->VINSERT(v32Gather, val, B->C(i));
542 }
543
544 B->STACKRESTORE(pStack);
545 }
546 else if (arch == AVX2 || (arch == AVX512 && width == W256))
547 {
548 Function* pX86IntrinFunc = nullptr;
549 if (srcTy == B->mFP32Ty)
550 {
551 pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
552 Intrinsic::x86_avx2_gather_d_ps_256);
553 }
554 else if (srcTy == B->mInt32Ty)
555 {
556 pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
557 Intrinsic::x86_avx2_gather_d_d_256);
558 }
559 else if (srcTy == B->mDoubleTy)
560 {
561 pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
562 Intrinsic::x86_avx2_gather_d_q_256);
563 }
564 else
565 {
566 SWR_ASSERT(false, "Unsupported vector element type for gather.");
567 }
568
569 if (width == W256)
570 {
571 auto v32Mask = B->BITCAST(pThis->VectorMask(vi1Mask), vSrc->getType());
572 v32Gather = B->CALL(pX86IntrinFunc, {vSrc, pBase, vi32Indices, v32Mask, i8Scale});
573 }
574 else if (width == W512)
575 {
576 // Double pump 4-wide for 64bit elements
577 if (vSrc->getType()->getVectorElementType() == B->mDoubleTy)
578 {
579 auto v64Mask = pThis->VectorMask(vi1Mask);
580 v64Mask = B->S_EXT(
581 v64Mask,
582 VectorType::get(B->mInt64Ty, v64Mask->getType()->getVectorNumElements()));
583 v64Mask = B->BITCAST(v64Mask, vSrc->getType());
584
585 Value* src0 = B->VSHUFFLE(vSrc, vSrc, B->C({0, 1, 2, 3}));
586 Value* src1 = B->VSHUFFLE(vSrc, vSrc, B->C({4, 5, 6, 7}));
587
588 Value* indices0 = B->VSHUFFLE(vi32Indices, vi32Indices, B->C({0, 1, 2, 3}));
589 Value* indices1 = B->VSHUFFLE(vi32Indices, vi32Indices, B->C({4, 5, 6, 7}));
590
591 Value* mask0 = B->VSHUFFLE(v64Mask, v64Mask, B->C({0, 1, 2, 3}));
592 Value* mask1 = B->VSHUFFLE(v64Mask, v64Mask, B->C({4, 5, 6, 7}));
593
594 src0 = B->BITCAST(
595 src0,
596 VectorType::get(B->mInt64Ty, src0->getType()->getVectorNumElements()));
597 mask0 = B->BITCAST(
598 mask0,
599 VectorType::get(B->mInt64Ty, mask0->getType()->getVectorNumElements()));
600 Value* gather0 =
601 B->CALL(pX86IntrinFunc, {src0, pBase, indices0, mask0, i8Scale});
602 src1 = B->BITCAST(
603 src1,
604 VectorType::get(B->mInt64Ty, src1->getType()->getVectorNumElements()));
605 mask1 = B->BITCAST(
606 mask1,
607 VectorType::get(B->mInt64Ty, mask1->getType()->getVectorNumElements()));
608 Value* gather1 =
609 B->CALL(pX86IntrinFunc, {src1, pBase, indices1, mask1, i8Scale});
610
611 v32Gather = B->VSHUFFLE(gather0, gather1, B->C({0, 1, 2, 3, 4, 5, 6, 7}));
612 v32Gather = B->BITCAST(v32Gather, vSrc->getType());
613 }
614 else
615 {
616 // Double pump 8-wide for 32bit elements
617 auto v32Mask = pThis->VectorMask(vi1Mask);
618 v32Mask = B->BITCAST(v32Mask, vSrc->getType());
619 Value* src0 = B->EXTRACT_16(vSrc, 0);
620 Value* src1 = B->EXTRACT_16(vSrc, 1);
621
622 Value* indices0 = B->EXTRACT_16(vi32Indices, 0);
623 Value* indices1 = B->EXTRACT_16(vi32Indices, 1);
624
625 Value* mask0 = B->EXTRACT_16(v32Mask, 0);
626 Value* mask1 = B->EXTRACT_16(v32Mask, 1);
627
628 Value* gather0 =
629 B->CALL(pX86IntrinFunc, {src0, pBase, indices0, mask0, i8Scale});
630 Value* gather1 =
631 B->CALL(pX86IntrinFunc, {src1, pBase, indices1, mask1, i8Scale});
632
633 v32Gather = B->JOIN_16(gather0, gather1);
634 }
635 }
636 }
637 else if (arch == AVX512)
638 {
639 Value* iMask = nullptr;
640 Function* pX86IntrinFunc = nullptr;
641 if (srcTy == B->mFP32Ty)
642 {
643 pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
644 Intrinsic::x86_avx512_gather_dps_512);
645 iMask = B->BITCAST(vi1Mask, B->mInt16Ty);
646 }
647 else if (srcTy == B->mInt32Ty)
648 {
649 pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
650 Intrinsic::x86_avx512_gather_dpi_512);
651 iMask = B->BITCAST(vi1Mask, B->mInt16Ty);
652 }
653 else if (srcTy == B->mDoubleTy)
654 {
655 pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
656 Intrinsic::x86_avx512_gather_dpd_512);
657 iMask = B->BITCAST(vi1Mask, B->mInt8Ty);
658 }
659 else
660 {
661 SWR_ASSERT(false, "Unsupported vector element type for gather.");
662 }
663
664 auto i32Scale = B->Z_EXT(i8Scale, B->mInt32Ty);
665 v32Gather = B->CALL(pX86IntrinFunc, {vSrc, pBase, vi32Indices, iMask, i32Scale});
666 }
667
668 return cast<Instruction>(v32Gather);
669 }
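
    // Scatter emulation. Pre-AVX512 targets have no scatter instruction, so stores are
    // routed through the ScatterPS_256 C callback registered in the pass constructor (a
    // 512 wide scatter is split into two 256 wide calls). AVX512 uses the native scatter
    // intrinsics: the 256 wide case goes through the 64 bit-index qps form with
    // zero-extended indices, the 512 wide case uses the dps form with a 16 bit k-mask.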
670 Instruction*
671 VSCATTER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
672 {
673 Builder* B = pThis->B;
674 auto pBase = pCallInst->getArgOperand(0);
675 auto vi1Mask = pCallInst->getArgOperand(1);
676 auto vi32Indices = pCallInst->getArgOperand(2);
677 auto v32Src = pCallInst->getArgOperand(3);
678 auto i32Scale = pCallInst->getArgOperand(4);
679
680 if (arch != AVX512)
681 {
682 // Call into C function to do the scatter. This has significantly better compile perf
683 // compared to jitting scatter loops for every scatter
684 if (width == W256)
685 {
686 auto mask = B->BITCAST(vi1Mask, B->mInt8Ty);
687 B->CALL(pThis->mPfnScatter256, {pBase, vi32Indices, v32Src, mask, i32Scale});
688 }
689 else
690 {
691                 // Need to break up the 512 wide scatter into two 256 wide scatters
692 auto maskLo = B->VSHUFFLE(vi1Mask, vi1Mask, B->C({0, 1, 2, 3, 4, 5, 6, 7}));
693 auto indicesLo =
694 B->VSHUFFLE(vi32Indices, vi32Indices, B->C({0, 1, 2, 3, 4, 5, 6, 7}));
695 auto srcLo = B->VSHUFFLE(v32Src, v32Src, B->C({0, 1, 2, 3, 4, 5, 6, 7}));
696
697 auto mask = B->BITCAST(maskLo, B->mInt8Ty);
698 B->CALL(pThis->mPfnScatter256, {pBase, indicesLo, srcLo, mask, i32Scale});
699
700 auto maskHi = B->VSHUFFLE(vi1Mask, vi1Mask, B->C({8, 9, 10, 11, 12, 13, 14, 15}));
701 auto indicesHi =
702 B->VSHUFFLE(vi32Indices, vi32Indices, B->C({8, 9, 10, 11, 12, 13, 14, 15}));
703 auto srcHi = B->VSHUFFLE(v32Src, v32Src, B->C({8, 9, 10, 11, 12, 13, 14, 15}));
704
705 mask = B->BITCAST(maskHi, B->mInt8Ty);
706 B->CALL(pThis->mPfnScatter256, {pBase, indicesHi, srcHi, mask, i32Scale});
707 }
708 return nullptr;
709 }
710
711 Value* iMask;
712 Function* pX86IntrinFunc;
713 if (width == W256)
714 {
715             // LLVM has no direct intrinsic to scatter 8 elements with 32-bit indices, but we
716             // can use the scatter of 8 elements with 64-bit indices
717 pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
718 Intrinsic::x86_avx512_scatter_qps_512);
719
720 auto vi32IndicesExt = B->Z_EXT(vi32Indices, B->mSimdInt64Ty);
721 iMask = B->BITCAST(vi1Mask, B->mInt8Ty);
722 B->CALL(pX86IntrinFunc, {pBase, iMask, vi32IndicesExt, v32Src, i32Scale});
723 }
724 else if (width == W512)
725 {
726 pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
727 Intrinsic::x86_avx512_scatter_dps_512);
728 iMask = B->BITCAST(vi1Mask, B->mInt16Ty);
729 B->CALL(pX86IntrinFunc, {pBase, iMask, vi32Indices, v32Src, i32Scale});
730 }
731 return nullptr;
732 }
733
734 // No support for vroundps in avx512 (it is available in kncni), so emulate with avx
735 // instructions
736 Instruction*
737 VROUND_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
738 {
739 SWR_ASSERT(arch == AVX512);
740
741 auto B = pThis->B;
742 auto vf32Src = pCallInst->getOperand(0);
743 assert(vf32Src);
744 auto i8Round = pCallInst->getOperand(1);
745 assert(i8Round);
746 auto pfnFunc =
747 Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx_round_ps_256);
748
749 if (width == W256)
750 {
751 return cast<Instruction>(B->CALL2(pfnFunc, vf32Src, i8Round));
752 }
753 else if (width == W512)
754 {
755 auto v8f32SrcLo = B->EXTRACT_16(vf32Src, 0);
756 auto v8f32SrcHi = B->EXTRACT_16(vf32Src, 1);
757
758 auto v8f32ResLo = B->CALL2(pfnFunc, v8f32SrcLo, i8Round);
759 auto v8f32ResHi = B->CALL2(pfnFunc, v8f32SrcHi, i8Round);
760
761 return cast<Instruction>(B->JOIN_16(v8f32ResLo, v8f32ResHi));
762 }
763 else
764 {
765 SWR_ASSERT(false, "Unimplemented vector width.");
766 }
767
768 return nullptr;
769 }
770
771 Instruction*
772 VCONVERT_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
773 {
774 SWR_ASSERT(arch == AVX512);
775
776 auto B = pThis->B;
777 auto vf32Src = pCallInst->getOperand(0);
778
779 if (width == W256)
780 {
781             // A 256 bit cvtpd2ps is just an fptrunc of the <4 x double> source to <4 x float>
782             return cast<Instruction>(B->FP_TRUNC(
783                 vf32Src, VectorType::get(B->mFP32Ty, vf32Src->getType()->getVectorNumElements())));
784 }
785 else if (width == W512)
786 {
787 // 512 can use intrinsic
788 auto pfnFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
789 Intrinsic::x86_avx512_mask_cvtpd2ps_512);
790 return cast<Instruction>(B->CALL(pfnFunc, vf32Src));
791 }
792 else
793 {
794 SWR_ASSERT(false, "Unimplemented vector width.");
795 }
796
797 return nullptr;
798 }
799
800 // No support for hsub in AVX512
801 Instruction* VHSUB_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
802 {
803 SWR_ASSERT(arch == AVX512);
804
805 auto B = pThis->B;
806 auto src0 = pCallInst->getOperand(0);
807 auto src1 = pCallInst->getOperand(1);
808
809 // 256b hsub can just use avx intrinsic
810 if (width == W256)
811 {
812 auto pX86IntrinFunc =
813 Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx_hsub_ps_256);
814 return cast<Instruction>(B->CALL2(pX86IntrinFunc, src0, src1));
815 }
816 else if (width == W512)
817 {
818 // 512b hsub can be accomplished with shuf/sub combo
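            // The even-indexed elements of the two sources form the minuend and the
            // odd-indexed elements the subtrahend, reproducing vhsubps' interleaved
            // output ordering.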
819 auto minuend = B->VSHUFFLE(src0, src1, B->C({0, 2, 8, 10, 4, 6, 12, 14}));
820 auto subtrahend = B->VSHUFFLE(src0, src1, B->C({1, 3, 9, 11, 5, 7, 13, 15}));
821 return cast<Instruction>(B->SUB(minuend, subtrahend));
822 }
823 else
824 {
825 SWR_ASSERT(false, "Unimplemented vector width.");
826 return nullptr;
827 }
828 }
829
830     // Double pump the input using the given 256 wide intrinsic. This blindly extracts the lower and
831     // upper 256 from each vector argument, calls the intrinsic on each half, then merges the results to 512 wide
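    // (e.g. a 512 wide VROUND on AVX2 becomes two x86_avx_round_ps_256 calls whose 8 wide
    // results are shuffled back together into one 16 wide result)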
832 Instruction* DOUBLE_EMU(LowerX86* pThis,
833 TargetArch arch,
834 TargetWidth width,
835 CallInst* pCallInst,
836 Intrinsic::ID intrin)
837 {
838 auto B = pThis->B;
839 SWR_ASSERT(width == W512);
840 Value* result[2];
841 Function* pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, intrin);
842 for (uint32_t i = 0; i < 2; ++i)
843 {
844 SmallVector<Value*, 8> args;
845 for (auto& arg : pCallInst->arg_operands())
846 {
847 auto argType = arg.get()->getType();
848 if (argType->isVectorTy())
849 {
850 uint32_t vecWidth = argType->getVectorNumElements();
851 Value* lanes = B->CInc<int>(i * vecWidth / 2, vecWidth / 2);
852 Value* argToPush = B->VSHUFFLE(
853 arg.get(), B->VUNDEF(argType->getVectorElementType(), vecWidth), lanes);
854 args.push_back(argToPush);
855 }
856 else
857 {
858 args.push_back(arg.get());
859 }
860 }
861 result[i] = B->CALLA(pX86IntrinFunc, args);
862 }
863 uint32_t vecWidth;
864 if (result[0]->getType()->isVectorTy())
865 {
866 assert(result[1]->getType()->isVectorTy());
867 vecWidth = result[0]->getType()->getVectorNumElements() +
868 result[1]->getType()->getVectorNumElements();
869 }
870 else
871 {
872 vecWidth = 2;
873 }
874 Value* lanes = B->CInc<int>(0, vecWidth);
875 return cast<Instruction>(B->VSHUFFLE(result[0], result[1], lanes));
876 }
877
878 } // namespace SwrJit
879
880 using namespace SwrJit;
881
882 INITIALIZE_PASS_BEGIN(LowerX86, "LowerX86", "LowerX86", false, false)
883 INITIALIZE_PASS_END(LowerX86, "LowerX86", "LowerX86", false, false)