gallium/swr: fix gcc warnings
[mesa.git] src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp
/****************************************************************************
 * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * @file lower_x86.cpp
 *
 * @brief llvm pass to lower meta code to x86
 *
 * Notes:
 *
 ******************************************************************************/

#include "jit_pch.hpp"
#include "passes.h"
#include "JitManager.h"

#include "common/simdlib.hpp"

#include <unordered_map>

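// C fallback for 256-wide scatters; on pre-AVX512 targets the jitted code calls this routine
// instead of emitting a native scatter (see VSCATTER_EMU below).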
extern "C" void ScatterPS_256(uint8_t*, SIMD256::Integer, SIMD256::Float, uint8_t, uint32_t);

namespace llvm
{
    // forward declare the initializer
    void initializeLowerX86Pass(PassRegistry&);
} // namespace llvm

namespace SwrJit
{
    using namespace llvm;

    enum TargetArch
    {
        AVX    = 0,
        AVX2   = 1,
        AVX512 = 2
    };

    enum TargetWidth
    {
        W256       = 0,
        W512       = 1,
        NUM_WIDTHS = 2
    };

    struct LowerX86;

    typedef std::function<Instruction*(LowerX86*, TargetArch, TargetWidth, CallInst*)> EmuFunc;

    struct X86Intrinsic
    {
        Intrinsic::ID intrin[NUM_WIDTHS];
        EmuFunc       emuFunc;
    };

    // Map of intrinsics that haven't been moved to the new mechanism yet. If used, these get the
    // previous behavior of mapping directly to avx/avx2 intrinsics.
    static std::map<std::string, Intrinsic::ID> intrinsicMap = {
        {"meta.intrinsic.BEXTR_32", Intrinsic::x86_bmi_bextr_32},
        {"meta.intrinsic.VPSHUFB", Intrinsic::x86_avx2_pshuf_b},
        {"meta.intrinsic.VCVTPS2PH", Intrinsic::x86_vcvtps2ph_256},
        {"meta.intrinsic.VPTESTC", Intrinsic::x86_avx_ptestc_256},
        {"meta.intrinsic.VPTESTZ", Intrinsic::x86_avx_ptestz_256},
        {"meta.intrinsic.VPHADDD", Intrinsic::x86_avx2_phadd_d},
        {"meta.intrinsic.PDEP32", Intrinsic::x86_bmi_pdep_32},
        {"meta.intrinsic.RDTSC", Intrinsic::x86_rdtsc},
    };

    // Forward decls
    Instruction* NO_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
    Instruction*
    VPERM_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
    Instruction*
    VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
    Instruction*
    VSCATTER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
    Instruction*
    VROUND_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
    Instruction*
    VHSUB_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
    Instruction*
    VCONVERT_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);

    Instruction* DOUBLE_EMU(LowerX86*     pThis,
                            TargetArch    arch,
                            TargetWidth   width,
                            CallInst*     pCallInst,
                            Intrinsic::ID intrin);

    static Intrinsic::ID DOUBLE = (Intrinsic::ID)-1;

    // clang-format off
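    // Per-arch intrinsic table: indexed by TargetArch and keyed by meta intrinsic name. Each entry
    // holds the native intrinsic ID for each TargetWidth (not_intrinsic if there is none, DOUBLE to
    // double-pump the next smaller width) plus an emulation callback used when no native mapping
    // exists.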
    static std::map<std::string, X86Intrinsic> intrinsicMap2[] = {
        //                            256 wide                                512 wide
        {
            // AVX
            {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx_rcp_ps_256, DOUBLE}, NO_EMU}},
            {"meta.intrinsic.VPERMPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
            {"meta.intrinsic.VPERMD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
            {"meta.intrinsic.VGATHERPD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
            {"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
            {"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
            {"meta.intrinsic.VSCATTERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VSCATTER_EMU}},
            {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx_cvt_pd2_ps_256, Intrinsic::not_intrinsic}, NO_EMU}},
            {"meta.intrinsic.VCVTPH2PS", {{Intrinsic::x86_vcvtph2ps_256, Intrinsic::not_intrinsic}, NO_EMU}},
            {"meta.intrinsic.VROUND", {{Intrinsic::x86_avx_round_ps_256, DOUBLE}, NO_EMU}},
            {"meta.intrinsic.VHSUBPS", {{Intrinsic::x86_avx_hsub_ps_256, DOUBLE}, NO_EMU}},
        },
        {
            // AVX2
            {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx_rcp_ps_256, DOUBLE}, NO_EMU}},
            {"meta.intrinsic.VPERMPS", {{Intrinsic::x86_avx2_permps, Intrinsic::not_intrinsic}, VPERM_EMU}},
            {"meta.intrinsic.VPERMD", {{Intrinsic::x86_avx2_permd, Intrinsic::not_intrinsic}, VPERM_EMU}},
            {"meta.intrinsic.VGATHERPD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
            {"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
            {"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
            {"meta.intrinsic.VSCATTERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VSCATTER_EMU}},
            {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx_cvt_pd2_ps_256, DOUBLE}, NO_EMU}},
            {"meta.intrinsic.VCVTPH2PS", {{Intrinsic::x86_vcvtph2ps_256, Intrinsic::not_intrinsic}, NO_EMU}},
            {"meta.intrinsic.VROUND", {{Intrinsic::x86_avx_round_ps_256, DOUBLE}, NO_EMU}},
            {"meta.intrinsic.VHSUBPS", {{Intrinsic::x86_avx_hsub_ps_256, DOUBLE}, NO_EMU}},
        },
        {
            // AVX512
            {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx512_rcp14_ps_256, Intrinsic::x86_avx512_rcp14_ps_512}, NO_EMU}},
#if LLVM_VERSION_MAJOR < 7
            {"meta.intrinsic.VPERMPS", {{Intrinsic::x86_avx512_mask_permvar_sf_256, Intrinsic::x86_avx512_mask_permvar_sf_512}, NO_EMU}},
            {"meta.intrinsic.VPERMD", {{Intrinsic::x86_avx512_mask_permvar_si_256, Intrinsic::x86_avx512_mask_permvar_si_512}, NO_EMU}},
#else
            {"meta.intrinsic.VPERMPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
            {"meta.intrinsic.VPERMD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
#endif
            {"meta.intrinsic.VGATHERPD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
            {"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
            {"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
            {"meta.intrinsic.VSCATTERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VSCATTER_EMU}},
#if LLVM_VERSION_MAJOR < 7
            {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx512_mask_cvtpd2ps_256, Intrinsic::x86_avx512_mask_cvtpd2ps_512}, NO_EMU}},
#else
            {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VCONVERT_EMU}},
#endif
            {"meta.intrinsic.VCVTPH2PS", {{Intrinsic::x86_avx512_mask_vcvtph2ps_256, Intrinsic::x86_avx512_mask_vcvtph2ps_512}, NO_EMU}},
            {"meta.intrinsic.VROUND", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VROUND_EMU}},
            {"meta.intrinsic.VHSUBPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VHSUB_EMU}},
        }};
    // clang-format on

    struct LowerX86 : public FunctionPass
    {
        LowerX86(Builder* b = nullptr) : FunctionPass(ID), B(b)
        {
            initializeLowerX86Pass(*PassRegistry::getPassRegistry());

            // Determine target arch
            if (JM()->mArch.AVX512F())
            {
                mTarget = AVX512;
            }
            else if (JM()->mArch.AVX2())
            {
                mTarget = AVX2;
            }
            else if (JM()->mArch.AVX())
            {
                mTarget = AVX;
            }
            else
            {
                SWR_ASSERT(false, "Unsupported AVX architecture.");
                mTarget = AVX;
            }

            // Set up the scatter function for the 256-wide case
            uint32_t curWidth = B->mVWidth;
            B->SetTargetWidth(8);
            std::vector<Type*> args = {
                B->mInt8PtrTy,   // pBase
                B->mSimdInt32Ty, // vIndices
                B->mSimdFP32Ty,  // vSrc
                B->mInt8Ty,      // mask
                B->mInt32Ty      // scale
            };

            FunctionType* pfnScatterTy = FunctionType::get(B->mVoidTy, args, false);
            mPfnScatter256 = cast<Function>(
#if LLVM_VERSION_MAJOR >= 9
                B->JM()->mpCurrentModule->getOrInsertFunction("ScatterPS_256", pfnScatterTy).getCallee());
#else
                B->JM()->mpCurrentModule->getOrInsertFunction("ScatterPS_256", pfnScatterTy));
#endif
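            // Register the C ScatterPS_256 entry point with the JIT's symbol resolver if it isn't
            // already visible, so jitted calls to it can be resolved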
            if (sys::DynamicLibrary::SearchForAddressOfSymbol("ScatterPS_256") == nullptr)
            {
                sys::DynamicLibrary::AddSymbol("ScatterPS_256", (void*)&ScatterPS_256);
            }

            B->SetTargetWidth(curWidth);
        }

        // Try to decipher the vector type of the instruction. This does not work properly
        // across all intrinsics, and will have to be rethought. Probably need something
        // similar to llvm's getDeclaration() utility to map a set of inputs to a specific typed
        // intrinsic.
        void GetRequestedWidthAndType(CallInst*       pCallInst,
                                      const StringRef intrinName,
                                      TargetWidth*    pWidth,
                                      Type**          pTy)
        {
            Type* pVecTy = pCallInst->getType();

            // Check for intrinsic specific types
            // VCVTPD2PS type comes from src, not dst
            if (intrinName.equals("meta.intrinsic.VCVTPD2PS"))
            {
                pVecTy = pCallInst->getOperand(0)->getType();
            }

            if (!pVecTy->isVectorTy())
            {
                for (auto& op : pCallInst->arg_operands())
                {
                    if (op.get()->getType()->isVectorTy())
                    {
                        pVecTy = op.get()->getType();
                        break;
                    }
                }
            }
            SWR_ASSERT(pVecTy->isVectorTy(), "Couldn't determine vector size");

            uint32_t width = cast<VectorType>(pVecTy)->getBitWidth();
            switch (width)
            {
            case 256:
                *pWidth = W256;
                break;
            case 512:
                *pWidth = W512;
                break;
            default:
                SWR_ASSERT(false, "Unhandled vector width %d", width);
                *pWidth = W256;
            }

            *pTy = pVecTy->getScalarType();
        }

        Value* GetZeroVec(TargetWidth width, Type* pTy)
        {
            uint32_t numElem = 0;
            switch (width)
            {
            case W256:
                numElem = 8;
                break;
            case W512:
                numElem = 16;
                break;
            default:
                SWR_ASSERT(false, "Unhandled vector width type %d\n", width);
            }

            return ConstantVector::getNullValue(VectorType::get(pTy, numElem));
        }

        Value* GetMask(TargetWidth width)
        {
            Value* mask = nullptr; // initialized so the assert fallthrough below can't return an indeterminate value
            switch (width)
            {
            case W256:
                mask = B->C((uint8_t)-1);
                break;
            case W512:
                mask = B->C((uint16_t)-1);
                break;
            default:
                SWR_ASSERT(false, "Unhandled vector width type %d\n", width);
            }
            return mask;
        }

        // Convert <N x i1> mask to <N x i32> x86 mask
        Value* VectorMask(Value* vi1Mask)
        {
            uint32_t numElem = vi1Mask->getType()->getVectorNumElements();
            return B->S_EXT(vi1Mask, VectorType::get(B->mInt32Ty, numElem));
        }

        Instruction* ProcessIntrinsicAdvanced(CallInst* pCallInst)
        {
            Function*   pFunc     = pCallInst->getCalledFunction();
            auto&       intrinsic = intrinsicMap2[mTarget][pFunc->getName()];
            TargetWidth vecWidth;
            Type*       pElemTy;
            GetRequestedWidthAndType(pCallInst, pFunc->getName(), &vecWidth, &pElemTy);

            // Check if there is a native intrinsic for this instruction
            Intrinsic::ID id = intrinsic.intrin[vecWidth];
            if (id == DOUBLE)
            {
                // Double pump the next smaller SIMD intrinsic
                SWR_ASSERT(vecWidth != 0, "Cannot double pump smallest SIMD width.");
                Intrinsic::ID id2 = intrinsic.intrin[vecWidth - 1];
                SWR_ASSERT(id2 != Intrinsic::not_intrinsic,
                           "Cannot find intrinsic to double pump.");
                return DOUBLE_EMU(this, mTarget, vecWidth, pCallInst, id2);
            }
            else if (id != Intrinsic::not_intrinsic)
            {
                Function* pIntrin = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, id);
                SmallVector<Value*, 8> args;
                for (auto& arg : pCallInst->arg_operands())
                {
                    args.push_back(arg.get());
                }

                // If AVX512, all instructions add a src operand and mask. We'll pass in a zero src
                // and a full mask for now, assuming the intrinsics are consistent and place the src
                // operand and mask last in the argument list.
                if (mTarget == AVX512)
                {
                    if (pFunc->getName().equals("meta.intrinsic.VCVTPD2PS"))
                    {
                        args.push_back(GetZeroVec(W256, pCallInst->getType()->getScalarType()));
                        args.push_back(GetMask(W256));
                        // for AVX512 VCVTPD2PS, we also have to add rounding mode
                        args.push_back(B->C(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
                    }
                    else
                    {
                        args.push_back(GetZeroVec(vecWidth, pElemTy));
                        args.push_back(GetMask(vecWidth));
                    }
                }

                return B->CALLA(pIntrin, args);
            }
            else
            {
                // No native intrinsic, call emulation function
                return intrinsic.emuFunc(this, mTarget, vecWidth, pCallInst);
            }

            SWR_ASSERT(false);
            return nullptr;
        }

        Instruction* ProcessIntrinsic(CallInst* pCallInst)
        {
            Function* pFunc = pCallInst->getCalledFunction();

            // Forward to the advanced support if found
            if (intrinsicMap2[mTarget].find(pFunc->getName()) != intrinsicMap2[mTarget].end())
            {
                return ProcessIntrinsicAdvanced(pCallInst);
            }

            SWR_ASSERT(intrinsicMap.find(pFunc->getName()) != intrinsicMap.end(),
                       "Unimplemented intrinsic %s.",
                       pFunc->getName());

            Intrinsic::ID x86Intrinsic = intrinsicMap[pFunc->getName()];
            Function*     pX86IntrinFunc =
                Intrinsic::getDeclaration(B->JM()->mpCurrentModule, x86Intrinsic);

            SmallVector<Value*, 8> args;
            for (auto& arg : pCallInst->arg_operands())
            {
                args.push_back(arg.get());
            }
            return B->CALLA(pX86IntrinFunc, args);
        }

        //////////////////////////////////////////////////////////////////////////
        /// @brief LLVM function pass run method.
        /// @param F - The function we're working on with this pass.
        virtual bool runOnFunction(Function& F)
        {
            std::vector<Instruction*> toRemove;
            std::vector<BasicBlock*>  bbs;

            // Make temp copy of the basic blocks and instructions, as the intrinsic
            // replacement code might invalidate the iterators
            for (auto& b : F.getBasicBlockList())
            {
                bbs.push_back(&b);
            }

            for (auto* BB : bbs)
            {
                std::vector<Instruction*> insts;
                for (auto& i : BB->getInstList())
                {
                    insts.push_back(&i);
                }

                for (auto* I : insts)
                {
                    if (CallInst* pCallInst = dyn_cast<CallInst>(I))
                    {
                        Function* pFunc = pCallInst->getCalledFunction();
                        if (pFunc)
                        {
                            if (pFunc->getName().startswith("meta.intrinsic"))
                            {
                                B->IRB()->SetInsertPoint(I);
                                Instruction* pReplace = ProcessIntrinsic(pCallInst);
                                toRemove.push_back(pCallInst);
                                if (pReplace)
                                {
                                    pCallInst->replaceAllUsesWith(pReplace);
                                }
                            }
                        }
                    }
                }
            }

            for (auto* pInst : toRemove)
            {
                pInst->eraseFromParent();
            }

            JitManager::DumpToFile(&F, "lowerx86");

            return true;
        }

        virtual void getAnalysisUsage(AnalysisUsage& AU) const {}

        JitManager* JM() { return B->JM(); }
        Builder*    B;
        TargetArch  mTarget;
        Function*   mPfnScatter256;

        static char ID; ///< Needed by LLVM to generate ID for FunctionPass.
    };

    char LowerX86::ID = 0; // LLVM uses address of ID as the actual ID.

    FunctionPass* createLowerX86Pass(Builder* b) { return new LowerX86(b); }

    Instruction* NO_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
    {
        SWR_ASSERT(false, "Unimplemented intrinsic emulation.");
        return nullptr;
    }

    Instruction* VPERM_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
    {
        // Only need vperm emulation for AVX
        SWR_ASSERT(arch == AVX);

        Builder* B         = pThis->B;
        auto     v32A      = pCallInst->getArgOperand(0);
        auto     vi32Index = pCallInst->getArgOperand(1);

        Value* v32Result;
        if (isa<Constant>(vi32Index))
        {
            // Can use llvm shuffle vector directly with constant shuffle indices
            v32Result = B->VSHUFFLE(v32A, v32A, vi32Index);
        }
        else
        {
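            // Non-constant indices: emulate the permute by extracting each index and inserting the
            // selected source lane one element at a time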
            v32Result = UndefValue::get(v32A->getType());
            for (uint32_t l = 0; l < v32A->getType()->getVectorNumElements(); ++l)
            {
                auto i32Index = B->VEXTRACT(vi32Index, B->C(l));
                auto val      = B->VEXTRACT(v32A, i32Index);
                v32Result     = B->VINSERT(v32Result, val, B->C(l));
            }
        }
        return cast<Instruction>(v32Result);
    }

    Instruction*
    VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
    {
        Builder* B           = pThis->B;
        auto     vSrc        = pCallInst->getArgOperand(0);
        auto     pBase       = pCallInst->getArgOperand(1);
        auto     vi32Indices = pCallInst->getArgOperand(2);
        auto     vi1Mask     = pCallInst->getArgOperand(3);
        auto     i8Scale     = pCallInst->getArgOperand(4);

        pBase              = B->POINTER_CAST(pBase, PointerType::get(B->mInt8Ty, 0));
        uint32_t numElem   = vSrc->getType()->getVectorNumElements();
        auto     i32Scale  = B->Z_EXT(i8Scale, B->mInt32Ty);
        auto     srcTy     = vSrc->getType()->getVectorElementType();
        Value*   v32Gather = nullptr;
        if (arch == AVX)
        {
            // Full emulation for AVX
            // Store source on stack to provide a valid address to load from inactive lanes
            auto pStack = B->STACKSAVE();
            auto pTmp   = B->ALLOCA(vSrc->getType());
            B->STORE(vSrc, pTmp);

            v32Gather        = UndefValue::get(vSrc->getType());
            auto vi32Scale   = ConstantVector::getSplat(numElem, cast<ConstantInt>(i32Scale));
            auto vi32Offsets = B->MUL(vi32Indices, vi32Scale);

            for (uint32_t i = 0; i < numElem; ++i)
            {
                auto i32Offset          = B->VEXTRACT(vi32Offsets, B->C(i));
                auto pLoadAddress       = B->GEP(pBase, i32Offset);
                pLoadAddress            = B->BITCAST(pLoadAddress, PointerType::get(srcTy, 0));
                auto pMaskedLoadAddress = B->GEP(pTmp, {0, i});
                auto i1Mask             = B->VEXTRACT(vi1Mask, B->C(i));
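                // Inactive lanes load from the stack copy of the source instead of the gather
                // address, so they simply keep their original value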
                auto pValidAddress = B->SELECT(i1Mask, pLoadAddress, pMaskedLoadAddress);
                auto val           = B->LOAD(pValidAddress);
                v32Gather          = B->VINSERT(v32Gather, val, B->C(i));
            }

            B->STACKRESTORE(pStack);
        }
        else if (arch == AVX2 || (arch == AVX512 && width == W256))
        {
            Function* pX86IntrinFunc = nullptr;
            if (srcTy == B->mFP32Ty)
            {
                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                           Intrinsic::x86_avx2_gather_d_ps_256);
            }
            else if (srcTy == B->mInt32Ty)
            {
                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                           Intrinsic::x86_avx2_gather_d_d_256);
            }
            else if (srcTy == B->mDoubleTy)
            {
                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                           Intrinsic::x86_avx2_gather_d_q_256);
            }
            else
            {
                SWR_ASSERT(false, "Unsupported vector element type for gather.");
            }

            if (width == W256)
            {
                auto v32Mask = B->BITCAST(pThis->VectorMask(vi1Mask), vSrc->getType());
                v32Gather    = B->CALL(pX86IntrinFunc, {vSrc, pBase, vi32Indices, v32Mask, i8Scale});
            }
            else if (width == W512)
            {
                // Double pump 4-wide for 64bit elements
                if (vSrc->getType()->getVectorElementType() == B->mDoubleTy)
                {
                    auto v64Mask = pThis->VectorMask(vi1Mask);
                    v64Mask      = B->S_EXT(
                        v64Mask,
                        VectorType::get(B->mInt64Ty, v64Mask->getType()->getVectorNumElements()));
                    v64Mask = B->BITCAST(v64Mask, vSrc->getType());

                    Value* src0 = B->VSHUFFLE(vSrc, vSrc, B->C({0, 1, 2, 3}));
                    Value* src1 = B->VSHUFFLE(vSrc, vSrc, B->C({4, 5, 6, 7}));

                    Value* indices0 = B->VSHUFFLE(vi32Indices, vi32Indices, B->C({0, 1, 2, 3}));
                    Value* indices1 = B->VSHUFFLE(vi32Indices, vi32Indices, B->C({4, 5, 6, 7}));

                    Value* mask0 = B->VSHUFFLE(v64Mask, v64Mask, B->C({0, 1, 2, 3}));
                    Value* mask1 = B->VSHUFFLE(v64Mask, v64Mask, B->C({4, 5, 6, 7}));

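                    // The 256-bit avx2 gather for 64-bit elements operates on i64 vectors, so
                    // bitcast each src/mask half before calling it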
                    src0 = B->BITCAST(
                        src0,
                        VectorType::get(B->mInt64Ty, src0->getType()->getVectorNumElements()));
                    mask0 = B->BITCAST(
                        mask0,
                        VectorType::get(B->mInt64Ty, mask0->getType()->getVectorNumElements()));
                    Value* gather0 =
                        B->CALL(pX86IntrinFunc, {src0, pBase, indices0, mask0, i8Scale});
                    src1 = B->BITCAST(
                        src1,
                        VectorType::get(B->mInt64Ty, src1->getType()->getVectorNumElements()));
                    mask1 = B->BITCAST(
                        mask1,
                        VectorType::get(B->mInt64Ty, mask1->getType()->getVectorNumElements()));
                    Value* gather1 =
                        B->CALL(pX86IntrinFunc, {src1, pBase, indices1, mask1, i8Scale});

                    v32Gather = B->VSHUFFLE(gather0, gather1, B->C({0, 1, 2, 3, 4, 5, 6, 7}));
                    v32Gather = B->BITCAST(v32Gather, vSrc->getType());
                }
                else
                {
                    // Double pump 8-wide for 32bit elements
                    auto v32Mask = pThis->VectorMask(vi1Mask);
                    v32Mask      = B->BITCAST(v32Mask, vSrc->getType());
                    Value* src0  = B->EXTRACT_16(vSrc, 0);
                    Value* src1  = B->EXTRACT_16(vSrc, 1);

                    Value* indices0 = B->EXTRACT_16(vi32Indices, 0);
                    Value* indices1 = B->EXTRACT_16(vi32Indices, 1);

                    Value* mask0 = B->EXTRACT_16(v32Mask, 0);
                    Value* mask1 = B->EXTRACT_16(v32Mask, 1);

                    Value* gather0 =
                        B->CALL(pX86IntrinFunc, {src0, pBase, indices0, mask0, i8Scale});
                    Value* gather1 =
                        B->CALL(pX86IntrinFunc, {src1, pBase, indices1, mask1, i8Scale});

                    v32Gather = B->JOIN_16(gather0, gather1);
                }
            }
        }
        else if (arch == AVX512)
        {
            Value*    iMask          = nullptr;
            Function* pX86IntrinFunc = nullptr;
            if (srcTy == B->mFP32Ty)
            {
                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                           Intrinsic::x86_avx512_gather_dps_512);
                iMask          = B->BITCAST(vi1Mask, B->mInt16Ty);
            }
            else if (srcTy == B->mInt32Ty)
            {
                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                           Intrinsic::x86_avx512_gather_dpi_512);
                iMask          = B->BITCAST(vi1Mask, B->mInt16Ty);
            }
            else if (srcTy == B->mDoubleTy)
            {
                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                           Intrinsic::x86_avx512_gather_dpd_512);
                iMask          = B->BITCAST(vi1Mask, B->mInt8Ty);
            }
            else
            {
                SWR_ASSERT(false, "Unsupported vector element type for gather.");
            }

            auto i32Scale = B->Z_EXT(i8Scale, B->mInt32Ty);
            v32Gather     = B->CALL(pX86IntrinFunc, {vSrc, pBase, vi32Indices, iMask, i32Scale});
        }

        return cast<Instruction>(v32Gather);
    }

    Instruction*
    VSCATTER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
    {
        Builder* B           = pThis->B;
        auto     pBase       = pCallInst->getArgOperand(0);
        auto     vi1Mask     = pCallInst->getArgOperand(1);
        auto     vi32Indices = pCallInst->getArgOperand(2);
        auto     v32Src      = pCallInst->getArgOperand(3);
        auto     i32Scale    = pCallInst->getArgOperand(4);

        if (arch != AVX512)
        {
            // Call into C function to do the scatter. This has significantly better compile perf
            // compared to jitting scatter loops for every scatter
            if (width == W256)
            {
                auto mask = B->BITCAST(vi1Mask, B->mInt8Ty);
                B->CALL(pThis->mPfnScatter256, {pBase, vi32Indices, v32Src, mask, i32Scale});
            }
            else
            {
                // Need to break up 512 wide scatter to two 256 wide
                auto maskLo    = B->VSHUFFLE(vi1Mask, vi1Mask, B->C({0, 1, 2, 3, 4, 5, 6, 7}));
                auto indicesLo =
                    B->VSHUFFLE(vi32Indices, vi32Indices, B->C({0, 1, 2, 3, 4, 5, 6, 7}));
                auto srcLo = B->VSHUFFLE(v32Src, v32Src, B->C({0, 1, 2, 3, 4, 5, 6, 7}));

                auto mask = B->BITCAST(maskLo, B->mInt8Ty);
                B->CALL(pThis->mPfnScatter256, {pBase, indicesLo, srcLo, mask, i32Scale});

                auto maskHi    = B->VSHUFFLE(vi1Mask, vi1Mask, B->C({8, 9, 10, 11, 12, 13, 14, 15}));
                auto indicesHi =
                    B->VSHUFFLE(vi32Indices, vi32Indices, B->C({8, 9, 10, 11, 12, 13, 14, 15}));
                auto srcHi = B->VSHUFFLE(v32Src, v32Src, B->C({8, 9, 10, 11, 12, 13, 14, 15}));

                mask = B->BITCAST(maskHi, B->mInt8Ty);
                B->CALL(pThis->mPfnScatter256, {pBase, indicesHi, srcHi, mask, i32Scale});
            }
            return nullptr;
        }

        Value*    iMask;
        Function* pX86IntrinFunc;
        if (width == W256)
        {
            // No direct intrinsic supported in llvm to scatter 8 elem with 32bit indices, but we
            // can use the scatter of 8 elements with 64bit indices
            pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                       Intrinsic::x86_avx512_scatter_qps_512);

            auto vi32IndicesExt = B->Z_EXT(vi32Indices, B->mSimdInt64Ty);
            iMask               = B->BITCAST(vi1Mask, B->mInt8Ty);
            B->CALL(pX86IntrinFunc, {pBase, iMask, vi32IndicesExt, v32Src, i32Scale});
        }
        else if (width == W512)
        {
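            // 16 elements with 32-bit indices map directly onto the avx512 dps scatter, using a
            // 16-bit lane mask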
            pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                       Intrinsic::x86_avx512_scatter_dps_512);
            iMask          = B->BITCAST(vi1Mask, B->mInt16Ty);
            B->CALL(pX86IntrinFunc, {pBase, iMask, vi32Indices, v32Src, i32Scale});
        }
        return nullptr;
    }

    // No support for vroundps in avx512 (it is available in kncni), so emulate with avx
    // instructions
    Instruction*
    VROUND_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
    {
        SWR_ASSERT(arch == AVX512);

        auto B       = pThis->B;
        auto vf32Src = pCallInst->getOperand(0);
        auto i8Round = pCallInst->getOperand(1);
        auto pfnFunc =
            Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx_round_ps_256);

        if (width == W256)
        {
            return cast<Instruction>(B->CALL2(pfnFunc, vf32Src, i8Round));
        }
        else if (width == W512)
        {
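            // Split the 16-wide source into two 8-wide halves, round each with the avx intrinsic,
            // then rejoin the halves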
            auto v8f32SrcLo = B->EXTRACT_16(vf32Src, 0);
            auto v8f32SrcHi = B->EXTRACT_16(vf32Src, 1);

            auto v8f32ResLo = B->CALL2(pfnFunc, v8f32SrcLo, i8Round);
            auto v8f32ResHi = B->CALL2(pfnFunc, v8f32SrcHi, i8Round);

            return cast<Instruction>(B->JOIN_16(v8f32ResLo, v8f32ResHi));
        }
        else
        {
            SWR_ASSERT(false, "Unimplemented vector width.");
        }

        return nullptr;
    }

    Instruction*
    VCONVERT_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
    {
        SWR_ASSERT(arch == AVX512);

        auto B       = pThis->B;
        auto vf32Src = pCallInst->getOperand(0);

        if (width == W256)
        {
            // No 256-wide cvtpd2ps intrinsic is called here; emulate the conversion with an
            // fptrunc of the double-precision source to a float vector of the same element count
            return cast<Instruction>(B->FP_TRUNC(
                vf32Src,
                VectorType::get(B->mFP32Ty, vf32Src->getType()->getVectorNumElements())));
        }
        else if (width == W512)
        {
            // 512 can use intrinsic
            auto pfnFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                     Intrinsic::x86_avx512_mask_cvtpd2ps_512);
            return cast<Instruction>(B->CALL(pfnFunc, vf32Src));
        }
        else
        {
            SWR_ASSERT(false, "Unimplemented vector width.");
        }

        return nullptr;
    }

    // No support for hsub in AVX512
    Instruction* VHSUB_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
    {
        SWR_ASSERT(arch == AVX512);

        auto B    = pThis->B;
        auto src0 = pCallInst->getOperand(0);
        auto src1 = pCallInst->getOperand(1);

        // 256b hsub can just use avx intrinsic
        if (width == W256)
        {
            auto pX86IntrinFunc =
                Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx_hsub_ps_256);
            return cast<Instruction>(B->CALL2(pX86IntrinFunc, src0, src1));
        }
        else if (width == W512)
        {
            // 512b hsub can be accomplished with shuf/sub combo
            auto minuend    = B->VSHUFFLE(src0, src1, B->C({0, 2, 8, 10, 4, 6, 12, 14}));
            auto subtrahend = B->VSHUFFLE(src0, src1, B->C({1, 3, 9, 11, 5, 7, 13, 15}));
            return cast<Instruction>(B->SUB(minuend, subtrahend));
        }
        else
        {
            SWR_ASSERT(false, "Unimplemented vector width.");
            return nullptr;
        }
    }

    // Double pump the input using the provided 256-wide intrinsic. This blindly extracts the lower
    // and upper 256 bits of each vector argument, calls the 256 wide intrinsic on each half, then
    // merges the results back to 512 wide
    Instruction* DOUBLE_EMU(LowerX86*     pThis,
                            TargetArch    arch,
                            TargetWidth   width,
                            CallInst*     pCallInst,
                            Intrinsic::ID intrin)
    {
        auto B = pThis->B;
        SWR_ASSERT(width == W512);
        Value*    result[2];
        Function* pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, intrin);
        for (uint32_t i = 0; i < 2; ++i)
        {
            SmallVector<Value*, 8> args;
            for (auto& arg : pCallInst->arg_operands())
            {
                auto argType = arg.get()->getType();
                if (argType->isVectorTy())
                {
                    uint32_t vecWidth = argType->getVectorNumElements();
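                    // Shuffle lanes select the lower half (i == 0) or the upper half (i == 1) of
                    // this vector argument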
                    Value* lanes     = B->CInc<int>(i * vecWidth / 2, vecWidth / 2);
                    Value* argToPush = B->VSHUFFLE(
                        arg.get(), B->VUNDEF(argType->getVectorElementType(), vecWidth), lanes);
                    args.push_back(argToPush);
                }
                else
                {
                    args.push_back(arg.get());
                }
            }
            result[i] = B->CALLA(pX86IntrinFunc, args);
        }
        uint32_t vecWidth;
        if (result[0]->getType()->isVectorTy())
        {
            assert(result[1]->getType()->isVectorTy());
            vecWidth = result[0]->getType()->getVectorNumElements() +
                       result[1]->getType()->getVectorNumElements();
        }
        else
        {
            vecWidth = 2;
        }
        Value* lanes = B->CInc<int>(0, vecWidth);
        return cast<Instruction>(B->VSHUFFLE(result[0], result[1], lanes));
    }

} // namespace SwrJit

using namespace SwrJit;

INITIALIZE_PASS_BEGIN(LowerX86, "LowerX86", "LowerX86", false, false)
INITIALIZE_PASS_END(LowerX86, "LowerX86", "LowerX86", false, false)