/****************************************************************************
 * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * @file lower_x86.cpp
 *
 * @brief LLVM pass to lower meta code to x86
 *
 * Notes:
 *
 ******************************************************************************/

#include "jit_pch.hpp"
#include "passes.h"
#include "JitManager.h"

#include <unordered_map>


namespace llvm
{
    // forward declare the initializer
    void initializeLowerX86Pass(PassRegistry&);
}

namespace SwrJit
{
    using namespace llvm;

    enum TargetArch
    {
        AVX    = 0,
        AVX2   = 1,
        AVX512 = 2
    };

    enum TargetWidth
    {
        W256       = 0,
        W512       = 1,
        NUM_WIDTHS = 2
    };

    struct LowerX86;

    typedef std::function<Instruction*(LowerX86*, TargetArch, TargetWidth, CallInst*)> EmuFunc;

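    // Lowering descriptor for a meta intrinsic: the native x86 intrinsic to use for each
    // target width (Intrinsic::not_intrinsic when none exists), plus an emulation callback
    // used as the fallback.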
    struct X86Intrinsic
    {
        Intrinsic::ID intrin[NUM_WIDTHS];
        EmuFunc       emuFunc;
    };

    // Map of intrinsics that haven't been moved to the new mechanism yet. If used, these get
    // the previous behavior of mapping directly to avx/avx2 intrinsics.
    static std::map<std::string, Intrinsic::ID> intrinsicMap = {
        {"meta.intrinsic.VGATHERPD",  Intrinsic::x86_avx2_gather_d_pd_256},
        {"meta.intrinsic.VROUND",     Intrinsic::x86_avx_round_ps_256},
        {"meta.intrinsic.BEXTR_32",   Intrinsic::x86_bmi_bextr_32},
        {"meta.intrinsic.VPSHUFB",    Intrinsic::x86_avx2_pshuf_b},
        {"meta.intrinsic.VCVTPD2PS",  Intrinsic::x86_avx_cvt_pd2_ps_256},
        {"meta.intrinsic.VCVTPH2PS",  Intrinsic::x86_vcvtph2ps_256},
        {"meta.intrinsic.VCVTPS2PH",  Intrinsic::x86_vcvtps2ph_256},
        {"meta.intrinsic.VHSUBPS",    Intrinsic::x86_avx_hsub_ps_256},
        {"meta.intrinsic.VPTESTC",    Intrinsic::x86_avx_ptestc_256},
        {"meta.intrinsic.VPTESTZ",    Intrinsic::x86_avx_ptestz_256},
        {"meta.intrinsic.VFMADDPS",   Intrinsic::x86_fma_vfmadd_ps_256},
        {"meta.intrinsic.VMOVMSKPS",  Intrinsic::x86_avx_movmsk_ps_256},
        {"meta.intrinsic.VPHADDD",    Intrinsic::x86_avx2_phadd_d},
        {"meta.intrinsic.PDEP32",     Intrinsic::x86_bmi_pdep_32},
        {"meta.intrinsic.RDTSC",      Intrinsic::x86_rdtsc},
    };

    // Forward decls
    Instruction* NO_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
    Instruction* VPERM_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
    Instruction* VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);

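    // Per-architecture lowering table, indexed by TargetArch. Each entry maps a meta intrinsic
    // name to its 256-wide and 512-wide native intrinsics; widths marked not_intrinsic are
    // routed through the entry's emulation function instead.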
    static std::map<std::string, X86Intrinsic> intrinsicMap2[] = {
        //                                  256 wide                                      512 wide
        {   // AVX
            {"meta.intrinsic.VRCPPS",       {{Intrinsic::x86_avx_rcp_ps_256,              Intrinsic::not_intrinsic},                  NO_EMU}},
            {"meta.intrinsic.VPERMPS",      {{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                  VPERM_EMU}},
            {"meta.intrinsic.VPERMD",       {{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                  VPERM_EMU}},
            {"meta.intrinsic.VGATHERPS",    {{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                  VGATHER_EMU}},
            {"meta.intrinsic.VGATHERPS_16", {{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                  VGATHER_EMU}},
            {"meta.intrinsic.VGATHERDD",    {{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                  VGATHER_EMU}},
            {"meta.intrinsic.VGATHERDD_16", {{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                  VGATHER_EMU}},
        },
        {   // AVX2
            {"meta.intrinsic.VRCPPS",       {{Intrinsic::x86_avx_rcp_ps_256,              Intrinsic::not_intrinsic},                  NO_EMU}},
            {"meta.intrinsic.VPERMPS",      {{Intrinsic::x86_avx2_permps,                 Intrinsic::not_intrinsic},                  VPERM_EMU}},
            {"meta.intrinsic.VPERMD",       {{Intrinsic::x86_avx2_permd,                  Intrinsic::not_intrinsic},                  VPERM_EMU}},
            {"meta.intrinsic.VGATHERPS",    {{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                  VGATHER_EMU}},
            {"meta.intrinsic.VGATHERPS_16", {{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                  VGATHER_EMU}},
            {"meta.intrinsic.VGATHERDD",    {{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                  VGATHER_EMU}},
            {"meta.intrinsic.VGATHERDD_16", {{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                  VGATHER_EMU}},
        },
        {   // AVX512
            {"meta.intrinsic.VRCPPS",       {{Intrinsic::x86_avx512_rcp14_ps_256,         Intrinsic::x86_avx512_rcp14_ps_512},        NO_EMU}},
            {"meta.intrinsic.VPERMPS",      {{Intrinsic::x86_avx512_mask_permvar_sf_256,  Intrinsic::x86_avx512_mask_permvar_sf_512}, NO_EMU}},
            {"meta.intrinsic.VPERMD",       {{Intrinsic::x86_avx512_mask_permvar_si_256,  Intrinsic::x86_avx512_mask_permvar_si_512}, NO_EMU}},
            {"meta.intrinsic.VGATHERPS",    {{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                  VGATHER_EMU}},
            {"meta.intrinsic.VGATHERPS_16", {{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                  VGATHER_EMU}},
            {"meta.intrinsic.VGATHERDD",    {{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                  VGATHER_EMU}},
            {"meta.intrinsic.VGATHERDD_16", {{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                  VGATHER_EMU}},
        }
    };

    struct LowerX86 : public FunctionPass
    {
        LowerX86(JitManager* pJitMgr = nullptr, Builder* b = nullptr)
            : FunctionPass(ID), mpJitMgr(pJitMgr), B(b)
        {
            initializeLowerX86Pass(*PassRegistry::getPassRegistry());

            // Determine target arch
            if (mpJitMgr->mArch.AVX512F())
            {
                mTarget = AVX512;
            }
            else if (mpJitMgr->mArch.AVX2())
            {
                mTarget = AVX2;
            }
            else if (mpJitMgr->mArch.AVX())
            {
                mTarget = AVX;
            }
            else
            {
                SWR_ASSERT(false, "Unsupported AVX architecture.");
                mTarget = AVX;
            }
        }

        // Try to decipher the vector type of the instruction. This does not work properly
        // across all intrinsics, and will have to be rethought. Probably need something
        // similar to llvm's getDeclaration() utility to map a set of inputs to a specific
        // typed intrinsic.
        void GetRequestedWidthAndType(CallInst* pCallInst, TargetWidth* pWidth, Type** pTy)
        {
            Type* pVecTy = pCallInst->getType();
            if (!pVecTy->isVectorTy())
            {
                for (auto& op : pCallInst->arg_operands())
                {
                    if (op.get()->getType()->isVectorTy())
                    {
                        pVecTy = op.get()->getType();
                        break;
                    }
                }
            }
            SWR_ASSERT(pVecTy->isVectorTy(), "Couldn't determine vector size");

            uint32_t width = cast<VectorType>(pVecTy)->getBitWidth();
            switch (width)
            {
            case 256: *pWidth = W256; break;
            case 512: *pWidth = W512; break;
            default:
                SWR_ASSERT(false, "Unhandled vector width %d", width);
                *pWidth = W256;
            }

            *pTy = pVecTy->getScalarType();
        }

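        // Helpers for the extra passthrough-source and mask operands expected by the AVX512
        // masked intrinsic forms.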
        Value* GetZeroVec(TargetWidth width, Type* pTy)
        {
            uint32_t numElem = 0;
            switch (width)
            {
            case W256: numElem = 8; break;
            case W512: numElem = 16; break;
            }

            return ConstantVector::getNullValue(VectorType::get(pTy, numElem));
        }

        Value* GetMask(TargetWidth width)
        {
            Value* mask;
            switch (width)
            {
            case W256: mask = B->C((uint8_t)-1); break;
            case W512: mask = B->C((uint16_t)-1); break;
            }
            return mask;
        }

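        // Lower a meta intrinsic that has an entry in intrinsicMap2: call the native intrinsic
        // for the requested width when one exists, otherwise fall back to the per-arch
        // emulation function.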
        Instruction* ProcessIntrinsicAdvanced(CallInst* pCallInst)
        {
            Function* pFunc = pCallInst->getCalledFunction();
            auto& intrinsic = intrinsicMap2[mTarget][pFunc->getName()];
            TargetWidth vecWidth;
            Type* pElemTy;
            GetRequestedWidthAndType(pCallInst, &vecWidth, &pElemTy);

            // Check if there is a native intrinsic for this instruction
            Intrinsic::ID id = intrinsic.intrin[vecWidth];
            if (id != Intrinsic::not_intrinsic)
            {
                Function* pIntrin = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, id);
                SmallVector<Value*, 8> args;
                for (auto& arg : pCallInst->arg_operands())
                {
                    args.push_back(arg.get());
                }

                // If AVX512, all instructions add a src operand and mask. We'll pass in 0 src and full mask for now
                // Assuming the intrinsics are consistent and place the src operand and mask last in the argument list.
                if (mTarget == AVX512)
                {
                    args.push_back(GetZeroVec(vecWidth, pElemTy));
                    args.push_back(GetMask(vecWidth));
                }

                return B->CALLA(pIntrin, args);
            }
            else
            {
                // No native intrinsic, call emulation function
                return intrinsic.emuFunc(this, mTarget, vecWidth, pCallInst);
            }
        }

        Instruction* ProcessIntrinsic(CallInst* pCallInst)
        {
            Function* pFunc = pCallInst->getCalledFunction();

            // Forward to the advanced support if found
            if (intrinsicMap2[mTarget].find(pFunc->getName()) != intrinsicMap2[mTarget].end())
            {
                return ProcessIntrinsicAdvanced(pCallInst);
            }

            SWR_ASSERT(intrinsicMap.find(pFunc->getName()) != intrinsicMap.end(), "Unimplemented intrinsic %s.", pFunc->getName());

            Intrinsic::ID x86Intrinsic = intrinsicMap[pFunc->getName()];
            Function* pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, x86Intrinsic);

            SmallVector<Value*, 8> args;
            for (auto& arg : pCallInst->arg_operands())
            {
                args.push_back(arg.get());
            }
            return B->CALLA(pX86IntrinFunc, args);
        }

        //////////////////////////////////////////////////////////////////////////
        /// @brief LLVM function pass run method.
        /// @param F - The function we're working on with this pass.
        virtual bool runOnFunction(Function& F)
        {
            std::vector<Instruction*> toRemove;
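            // Replaced calls are queued here and erased after the walk so the instruction
            // iterator is not invalidated mid-traversal.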

            for (auto& BB : F.getBasicBlockList())
            {
                for (auto& I : BB.getInstList())
                {
                    if (CallInst* pCallInst = dyn_cast<CallInst>(&I))
                    {
                        Function* pFunc = pCallInst->getCalledFunction();
                        if (pFunc)
                        {
                            if (pFunc->getName().startswith("meta.intrinsic"))
                            {
                                B->IRB()->SetInsertPoint(&I);
                                Instruction* pReplace = ProcessIntrinsic(pCallInst);
                                SWR_ASSERT(pReplace);
                                toRemove.push_back(pCallInst);
                                pCallInst->replaceAllUsesWith(pReplace);
                            }
                        }
                    }
                }
            }

            for (auto* pInst : toRemove)
            {
                pInst->eraseFromParent();
            }

            JitManager::DumpToFile(&F, "lowerx86");

            return true;
        }

        virtual void getAnalysisUsage(AnalysisUsage& AU) const
        {
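            // This pass requires no other analyses.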
        }

        JitManager* JM() { return mpJitMgr; }

        JitManager* mpJitMgr;
        Builder*    B;

        TargetArch mTarget;

        static char ID; ///< Needed by LLVM to generate ID for FunctionPass.
    };

    char LowerX86::ID = 0; // LLVM uses address of ID as the actual ID.

    FunctionPass* createLowerX86Pass(JitManager* pJitMgr, Builder* b)
    {
        return new LowerX86(pJitMgr, b);
    }

    Instruction* NO_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
    {
        SWR_ASSERT(false, "Unimplemented intrinsic emulation.");
        return nullptr;
    }

    Instruction* VPERM_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
    {
        // Only need vperm emulation for AVX
        SWR_ASSERT(arch == AVX);

        Builder* B = pThis->B;
        auto v32A = pCallInst->getArgOperand(0);
        auto vi32Index = pCallInst->getArgOperand(1);

        Value* v32Result;
        if (isa<Constant>(vi32Index))
        {
            // Can use llvm shuffle vector directly with constant shuffle indices
            v32Result = B->VSHUFFLE(v32A, v32A, vi32Index);
        }
        else
        {
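            // Variable indices: emulate the permute one lane at a time by extracting each
            // index and inserting the selected source element into the result.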
            v32Result = UndefValue::get(v32A->getType());
            for (uint32_t l = 0; l < v32A->getType()->getVectorNumElements(); ++l)
            {
                auto i32Index = B->VEXTRACT(vi32Index, B->C(l));
                auto val = B->VEXTRACT(v32A, i32Index);
                v32Result = B->VINSERT(v32Result, val, B->C(l));
            }
        }
        return cast<Instruction>(v32Result);
    }

    Instruction* VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
    {
        Builder* B = pThis->B;
        auto vSrc = pCallInst->getArgOperand(0);
        auto pBase = pCallInst->getArgOperand(1);
        auto vi32Indices = pCallInst->getArgOperand(2);
        auto vi1Mask = pCallInst->getArgOperand(3);
        auto i8Scale = pCallInst->getArgOperand(4);

        pBase = B->INT_TO_PTR(pBase, PointerType::get(B->mInt8Ty, 0));
        uint32_t numElem = vSrc->getType()->getVectorNumElements();
        auto i32Scale = B->Z_EXT(i8Scale, B->mInt32Ty);
        auto srcTy = vSrc->getType()->getVectorElementType();
        Value* v32Gather;
        if (arch == AVX)
        {
            // Full emulation for AVX
            // Store source on stack to provide a valid address to load from inactive lanes
            auto pStack = B->STACKSAVE();
            auto pTmp = B->ALLOCA(vSrc->getType());
            B->STORE(vSrc, pTmp);

            v32Gather = UndefValue::get(vSrc->getType());
            auto vi32Scale = ConstantVector::getSplat(numElem, cast<ConstantInt>(i32Scale));
            auto vi32Offsets = B->MUL(vi32Indices, vi32Scale);

            for (uint32_t i = 0; i < numElem; ++i)
            {
                auto i32Offset = B->VEXTRACT(vi32Offsets, B->C(i));
                auto pLoadAddress = B->GEP(pBase, i32Offset);
                pLoadAddress = B->BITCAST(pLoadAddress, PointerType::get(srcTy, 0));
                auto pMaskedLoadAddress = B->GEP(pTmp, { 0, i });
                auto i1Mask = B->VEXTRACT(vi1Mask, B->C(i));
                auto pValidAddress = B->SELECT(i1Mask, pLoadAddress, pMaskedLoadAddress);
                auto val = B->LOAD(pValidAddress);
                v32Gather = B->VINSERT(v32Gather, val, B->C(i));
            }

            B->STACKRESTORE(pStack);
        }
        else if (arch == AVX2 || (arch == AVX512 && width == W256))
        {
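            // Use the native AVX2 8-wide gather, selecting the float or dword variant from the
            // source element type. 16-wide requests are double pumped below.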
            Function* pX86IntrinFunc = srcTy == B->mFP32Ty ?
                Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx2_gather_d_ps_256) :
                Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx2_gather_d_d_256);
            if (width == W256)
            {
                auto v32Mask = B->BITCAST(B->VMASK(vi1Mask), vSrc->getType());
                v32Gather = B->CALL(pX86IntrinFunc, { vSrc, pBase, vi32Indices, v32Mask, i8Scale });
            }
            else if (width == W512)
            {
                // Double pump 8-wide
                auto v32Mask = B->BITCAST(B->VMASK_16(vi1Mask), vSrc->getType());
                Value* src0 = B->EXTRACT_16(vSrc, 0);
                Value* src1 = B->EXTRACT_16(vSrc, 1);

                Value* indices0 = B->EXTRACT_16(vi32Indices, 0);
                Value* indices1 = B->EXTRACT_16(vi32Indices, 1);

                Value* mask0 = B->EXTRACT_16(v32Mask, 0);
                Value* mask1 = B->EXTRACT_16(v32Mask, 1);

                Value* gather0 = B->CALL(pX86IntrinFunc, { src0, pBase, indices0, mask0, i8Scale });
                Value* gather1 = B->CALL(pX86IntrinFunc, { src1, pBase, indices1, mask1, i8Scale });

                v32Gather = B->JOIN_16(gather0, gather1);
            }
        }
        else if (arch == AVX512)
        {
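            // Native 16-wide AVX512 gather; the i1 lane mask is reinterpreted as a 16-bit k-mask.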
            auto i16Mask = B->BITCAST(vi1Mask, B->mInt16Ty);

            Function* pX86IntrinFunc = srcTy == B->mFP32Ty ?
                Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx512_gather_dps_512) :
                Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx512_gather_dpi_512);
            v32Gather = B->CALL(pX86IntrinFunc, { vSrc, pBase, vi32Indices, i16Mask, i32Scale });
        }

        return cast<Instruction>(v32Gather);
    }
}

using namespace SwrJit;

INITIALIZE_PASS_BEGIN(LowerX86, "LowerX86", "LowerX86", false, false)
INITIALIZE_PASS_END(LowerX86, "LowerX86", "LowerX86", false, false)