1 /****************************************************************************
2 * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief llvm pass to lower meta code to x86
29 ******************************************************************************/
31 #include "jit_pch.hpp"
33 #include "JitManager.h"
35 #include <unordered_map>
// Forward declare the initializer so the constructor can register this pass
// with LLVM's global PassRegistry (definition generated by INITIALIZE_PASS below).
void initializeLowerX86Pass(PassRegistry&);
// Signature of a software-emulation callback: given the pass instance, the target
// architecture/vector width, and the original meta-intrinsic call, build and return
// the replacement instruction.
typedef std::function<Instruction*(LowerX86*, TargetArch, TargetWidth, CallInst*)> EmuFunc;

// Native intrinsic id per vector width (indexed by TargetWidth); Intrinsic::not_intrinsic
// means "no native mapping for this width — use the emulation callback instead".
// NOTE(review): this member appears to belong to a struct (used below as X86Intrinsic)
// whose header and emuFunc member are not visible in this chunk — confirm against the full file.
Intrinsic::ID intrin[NUM_WIDTHS];
// Map of intrinsics that haven't been moved to the new mechanism yet. If used, these get the
// previous behavior of mapping directly to avx/avx2 intrinsics.
// Keyed by the "meta.intrinsic.*" function name; value is the concrete x86 intrinsic id.
// NOTE(review): the closing "};" of this initializer is not visible in this chunk.
static std::map<std::string, Intrinsic::ID> intrinsicMap = {
    {"meta.intrinsic.VGATHERPD",  Intrinsic::x86_avx2_gather_d_pd_256},
    {"meta.intrinsic.VROUND",     Intrinsic::x86_avx_round_ps_256},
    {"meta.intrinsic.BEXTR_32",   Intrinsic::x86_bmi_bextr_32},
    {"meta.intrinsic.VPSHUFB",    Intrinsic::x86_avx2_pshuf_b},
    {"meta.intrinsic.VCVTPD2PS",  Intrinsic::x86_avx_cvt_pd2_ps_256},
    {"meta.intrinsic.VCVTPH2PS",  Intrinsic::x86_vcvtph2ps_256},
    {"meta.intrinsic.VCVTPS2PH",  Intrinsic::x86_vcvtps2ph_256},
    {"meta.intrinsic.VHSUBPS",    Intrinsic::x86_avx_hsub_ps_256},
    {"meta.intrinsic.VPTESTC",    Intrinsic::x86_avx_ptestc_256},
    {"meta.intrinsic.VPTESTZ",    Intrinsic::x86_avx_ptestz_256},
    {"meta.intrinsic.VFMADDPS",   Intrinsic::x86_fma_vfmadd_ps_256},
    {"meta.intrinsic.VMOVMSKPS",  Intrinsic::x86_avx_movmsk_ps_256},
    {"meta.intrinsic.VPHADDD",    Intrinsic::x86_avx2_phadd_d},
    {"meta.intrinsic.PDEP32",     Intrinsic::x86_bmi_pdep_32},
    {"meta.intrinsic.RDTSC",      Intrinsic::x86_rdtsc},
// Forward declarations of the emulation routines referenced by intrinsicMap2 below
// (definitions appear at the bottom of the file).
// NO_EMU: placeholder for entries that should always resolve to a native intrinsic.
Instruction* NO_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
// VPERM_EMU: software permute for targets without a native VPERMPS/VPERMD.
Instruction* VPERM_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
// VGATHER_EMU: gather family (VGATHERPS/VGATHERDD and 16-wide variants).
Instruction* VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
// Per-architecture table of meta intrinsics: each X86Intrinsic carries native intrinsic
// ids per vector width plus an emulation callback used where no native id exists.
// Indexed elsewhere as intrinsicMap2[mTarget] (see ProcessIntrinsicAdvanced).
// NOTE(review): the braces delimiting the per-arch sub-maps (presumably AVX, AVX2,
// AVX512, matching the repeated key groups below) are not visible in this chunk —
// confirm against the full file; as written, duplicate keys would collapse in one map.
static std::map<std::string, X86Intrinsic> intrinsicMap2[] = {
    // First group — AVX, presumably: mostly emulated, native rcp only.
    {"meta.intrinsic.VRCPPS",       {{Intrinsic::x86_avx_rcp_ps_256,               Intrinsic::not_intrinsic},               NO_EMU}},
    {"meta.intrinsic.VPERMPS",      {{Intrinsic::not_intrinsic,                    Intrinsic::not_intrinsic},               VPERM_EMU}},
    {"meta.intrinsic.VPERMD",       {{Intrinsic::not_intrinsic,                    Intrinsic::not_intrinsic},               VPERM_EMU}},
    {"meta.intrinsic.VGATHERPS",    {{Intrinsic::not_intrinsic,                    Intrinsic::not_intrinsic},               VGATHER_EMU}},
    {"meta.intrinsic.VGATHERPS_16", {{Intrinsic::not_intrinsic,                    Intrinsic::not_intrinsic},               VGATHER_EMU}},
    {"meta.intrinsic.VGATHERDD",    {{Intrinsic::not_intrinsic,                    Intrinsic::not_intrinsic},               VGATHER_EMU}},
    {"meta.intrinsic.VGATHERDD_16", {{Intrinsic::not_intrinsic,                    Intrinsic::not_intrinsic},               VGATHER_EMU}},
    // Second group — AVX2, presumably: native permutes; gathers handled by VGATHER_EMU
    // (which calls avx2 gather intrinsics itself).
    {"meta.intrinsic.VRCPPS",       {{Intrinsic::x86_avx_rcp_ps_256,               Intrinsic::not_intrinsic},               NO_EMU}},
    {"meta.intrinsic.VPERMPS",      {{Intrinsic::x86_avx2_permps,                  Intrinsic::not_intrinsic},               VPERM_EMU}},
    {"meta.intrinsic.VPERMD",       {{Intrinsic::x86_avx2_permd,                   Intrinsic::not_intrinsic},               VPERM_EMU}},
    {"meta.intrinsic.VGATHERPS",    {{Intrinsic::not_intrinsic,                    Intrinsic::not_intrinsic},               VGATHER_EMU}},
    {"meta.intrinsic.VGATHERPS_16", {{Intrinsic::not_intrinsic,                    Intrinsic::not_intrinsic},               VGATHER_EMU}},
    {"meta.intrinsic.VGATHERDD",    {{Intrinsic::not_intrinsic,                    Intrinsic::not_intrinsic},               VGATHER_EMU}},
    {"meta.intrinsic.VGATHERDD_16", {{Intrinsic::not_intrinsic,                    Intrinsic::not_intrinsic},               VGATHER_EMU}},
    // Third group — AVX512, presumably: native 256/512-bit forms where available.
    {"meta.intrinsic.VRCPPS",       {{Intrinsic::x86_avx512_rcp14_ps_256,          Intrinsic::x86_avx512_rcp14_ps_512},     NO_EMU}},
    {"meta.intrinsic.VPERMPS",      {{Intrinsic::x86_avx512_mask_permvar_sf_256,   Intrinsic::x86_avx512_mask_permvar_sf_512}, NO_EMU}},
    {"meta.intrinsic.VPERMD",       {{Intrinsic::x86_avx512_mask_permvar_si_256,   Intrinsic::x86_avx512_mask_permvar_si_512}, NO_EMU}},
    {"meta.intrinsic.VGATHERPS",    {{Intrinsic::not_intrinsic,                    Intrinsic::not_intrinsic},               VGATHER_EMU}},
    {"meta.intrinsic.VGATHERPS_16", {{Intrinsic::not_intrinsic,                    Intrinsic::not_intrinsic},               VGATHER_EMU}},
    {"meta.intrinsic.VGATHERDD",    {{Intrinsic::not_intrinsic,                    Intrinsic::not_intrinsic},               VGATHER_EMU}},
    {"meta.intrinsic.VGATHERDD_16", {{Intrinsic::not_intrinsic,                    Intrinsic::not_intrinsic},               VGATHER_EMU}},
// Function pass that lowers "meta.intrinsic.*" calls to concrete x86 intrinsics
// (or software emulation) appropriate for the JIT's detected target architecture.
struct LowerX86 : public FunctionPass
// Constructor: registers the pass with LLVM and probes the JIT manager's CPU
// feature flags to select the target architecture.
// NOTE(review): the opening braces of the struct/constructor and the bodies of
// the feature-detect branches (presumably assigning mTarget = AVX512/AVX2/AVX,
// with the assert in a final else) are not visible in this chunk — confirm
// against the full file.
LowerX86(JitManager* pJitMgr = nullptr, Builder* b = nullptr)
    : FunctionPass(ID), mpJitMgr(pJitMgr), B(b)
    initializeLowerX86Pass(*PassRegistry::getPassRegistry());

    // Determine target arch
    if (mpJitMgr->mArch.AVX512F())
    else if (mpJitMgr->mArch.AVX2())
    else if (mpJitMgr->mArch.AVX())
    SWR_ASSERT(false, "Unsupported AVX architecture.");
// Try to decipher the vector type of the instruction. This does not work properly
// across all intrinsics, and will have to be rethought. Probably need something
// similar to llvm's getDeclaration() utility to map a set of inputs to a specific typed
// declaration.
/// @brief Determine vector width (W256/W512) and scalar element type for a call.
/// Uses the call's return type, falling back to the first vector-typed argument.
/// @param pCallInst - meta intrinsic call being lowered
/// @param pWidth    - [out] detected width enum
/// @param pTy       - [out] scalar element type of the vector
void GetRequestedWidthAndType(CallInst* pCallInst, TargetWidth* pWidth, Type** pTy)
    Type* pVecTy = pCallInst->getType();
    // Return type is not a vector: scan the operands for one instead.
    if (!pVecTy->isVectorTy())
    for (auto& op : pCallInst->arg_operands())
    if (op.get()->getType()->isVectorTy())
    pVecTy = op.get()->getType();
    SWR_ASSERT(pVecTy->isVectorTy(), "Couldn't determine vector size");
    uint32_t width = cast<VectorType>(pVecTy)->getBitWidth();
    // NOTE(review): the switch(width) header and surrounding braces are not
    // visible in this chunk — confirm against the full file.
    case 256: *pWidth = W256; break;
    case 512: *pWidth = W512; break;
    default: SWR_ASSERT(false, "Unhandled vector width %d", width);
    *pTy = pVecTy->getScalarType();
/// @brief Build a zero vector of the requested width and element type; used as
/// the pass-through source operand for masked AVX512 intrinsics.
/// NOTE(review): the switch(width) header/braces are not visible in this chunk.
Value* GetZeroVec(TargetWidth width, Type* pTy)
    uint32_t numElem = 0;
    case W256: numElem = 8; break;   // 8 lanes for 256-bit
    case W512: numElem = 16; break;  // 16 lanes for 512-bit
    return ConstantVector::getNullValue(VectorType::get(pTy, numElem));
/// @brief Build an all-ones execution mask constant for the given width
/// (8-bit mask for 8 lanes, 16-bit mask for 16 lanes), for AVX512 masked ops.
/// NOTE(review): the declaration of `mask`, the switch(width) header, and the
/// trailing `return mask;` are not visible in this chunk — confirm against the
/// full file.
Value* GetMask(TargetWidth width)
    case W256: mask = B->C((uint8_t)-1); break;   // 0xFF: all 8 lanes active
    case W512: mask = B->C((uint16_t)-1); break;  // 0xFFFF: all 16 lanes active
/// @brief Lower one meta intrinsic via the per-arch table (intrinsicMap2):
/// call the native intrinsic for the detected width when one exists, otherwise
/// dispatch to the table's emulation callback.
/// @param pCallInst - the meta intrinsic call to replace
/// @return replacement instruction (caller rewires uses and erases original)
/// NOTE(review): the declaration of pElemTy (presumably `Type* pElemTy;`) and
/// several braces/else scaffolding lines are not visible in this chunk.
Instruction* ProcessIntrinsicAdvanced(CallInst* pCallInst)
    Function* pFunc = pCallInst->getCalledFunction();
    auto& intrinsic = intrinsicMap2[mTarget][pFunc->getName()];
    TargetWidth vecWidth;
    GetRequestedWidthAndType(pCallInst, &vecWidth, &pElemTy);

    // Check if there is a native intrinsic for this instruction
    Intrinsic::ID id = intrinsic.intrin[vecWidth];
    if (id != Intrinsic::not_intrinsic)
    Function* pIntrin = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, id);
    SmallVector<Value*, 8> args;
    for (auto& arg : pCallInst->arg_operands())
    args.push_back(arg.get());

    // If AVX512, all instructions add a src operand and mask. We'll pass in 0 src and full mask for now
    // Assuming the intrinsics are consistent and place the src operand and mask last in the argument list.
    if (mTarget == AVX512)
    args.push_back(GetZeroVec(vecWidth, pElemTy));
    args.push_back(GetMask(vecWidth));
    return B->CALLA(pIntrin, args);

    // No native intrinsic, call emulation function
    return intrinsic.emuFunc(this, mTarget, vecWidth, pCallInst);
/// @brief Entry point for lowering a single meta intrinsic call: prefer the
/// per-arch table (intrinsicMap2), else fall back to the legacy direct
/// name -> avx/avx2 intrinsic map (intrinsicMap).
/// @param pCallInst - the meta intrinsic call to replace
/// @return replacement instruction
Instruction* ProcessIntrinsic(CallInst* pCallInst)
    Function* pFunc = pCallInst->getCalledFunction();

    // Forward to the advanced support if found
    if (intrinsicMap2[mTarget].find(pFunc->getName()) != intrinsicMap2[mTarget].end())
    return ProcessIntrinsicAdvanced(pCallInst);

    // Legacy path: must have a direct mapping, otherwise the intrinsic is unimplemented.
    SWR_ASSERT(intrinsicMap.find(pFunc->getName()) != intrinsicMap.end(), "Unimplemented intrinsic %s.", pFunc->getName());
    Intrinsic::ID x86Intrinsic = intrinsicMap[pFunc->getName()];
    Function* pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, x86Intrinsic);

    // Forward the original arguments unchanged to the native intrinsic.
    SmallVector<Value*, 8> args;
    for (auto& arg : pCallInst->arg_operands())
    args.push_back(arg.get());
    return B->CALLA(pX86IntrinFunc, args);
//////////////////////////////////////////////////////////////////////////
/// @brief LLVM function pass run method.
/// @param F - The function we're working on with this pass.
/// Walks every instruction, replaces calls to "meta.intrinsic.*" functions
/// with lowered equivalents, then erases the originals (deferred via toRemove
/// so iteration isn't invalidated).
/// NOTE(review): braces, a null-check on pFunc, and the function's return
/// statement are not visible in this chunk — confirm against the full file.
virtual bool runOnFunction(Function& F)
    std::vector<Instruction*> toRemove;

    for (auto& BB : F.getBasicBlockList())
    for (auto& I : BB.getInstList())
    if (CallInst* pCallInst = dyn_cast<CallInst>(&I))
    Function* pFunc = pCallInst->getCalledFunction();
    if (pFunc->getName().startswith("meta.intrinsic"))
    // Emit the replacement at the site of the original call.
    B->IRB()->SetInsertPoint(&I);
    Instruction* pReplace = ProcessIntrinsic(pCallInst);
    SWR_ASSERT(pReplace);
    toRemove.push_back(pCallInst);
    pCallInst->replaceAllUsesWith(pReplace);

    // Erase the now-unused meta intrinsic calls after iteration completes.
    for (auto* pInst : toRemove)
    pInst->eraseFromParent();

    JitManager::DumpToFile(&F, "lowerx86");
/// @brief Declare analysis dependencies; this pass requires none.
/// NOTE(review): the (presumably empty) body braces are not visible in this chunk.
virtual void getAnalysisUsage(AnalysisUsage& AU) const
/// @brief Accessor for the owning JIT manager.
JitManager* JM() { return mpJitMgr; }

JitManager* mpJitMgr;  ///< Non-owning pointer to the JIT manager (supplies arch info, current module).

// NOTE(review): declarations of B (Builder*) and mTarget (TargetArch), both
// used throughout this struct, are not visible in this chunk — confirm against
// the full file.
static char ID; ///< Needed by LLVM to generate ID for FunctionPass.
// Out-of-class definition of the pass identifier.
char LowerX86::ID = 0; // LLVM uses address of ID as the actual ID.
330 FunctionPass
* createLowerX86Pass(JitManager
* pJitMgr
, Builder
* b
)
332 return new LowerX86(pJitMgr
, b
);
335 Instruction
* NO_EMU(LowerX86
* pThis
, TargetArch arch
, TargetWidth width
, CallInst
* pCallInst
)
337 SWR_ASSERT(false, "Unimplemented intrinsic emulation.");
//////////////////////////////////////////////////////////////////////////
/// @brief Software emulation of VPERMPS/VPERMD for AVX (no native cross-lane
/// variable permute). Constant index vectors lower to an LLVM shufflevector;
/// dynamic indices are handled with a per-lane extract/insert loop.
/// NOTE(review): the declaration of v32Result (presumably `Value* v32Result;`),
/// the `else` keyword, and surrounding braces are not visible in this chunk.
Instruction* VPERM_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
    // Only need vperm emulation for AVX
    SWR_ASSERT(arch == AVX);

    Builder* B = pThis->B;
    auto v32A = pCallInst->getArgOperand(0);       // source vector
    auto vi32Index = pCallInst->getArgOperand(1);  // per-lane indices

    if (isa<Constant>(vi32Index))
    // Can use llvm shuffle vector directly with constant shuffle indices
    v32Result = B->VSHUFFLE(v32A, v32A, vi32Index);

    // Dynamic indices: gather one lane at a time.
    v32Result = UndefValue::get(v32A->getType());
    for (uint32_t l = 0; l < v32A->getType()->getVectorNumElements(); ++l)
    auto i32Index = B->VEXTRACT(vi32Index, B->C(l));
    auto val = B->VEXTRACT(v32A, i32Index);
    v32Result = B->VINSERT(v32Result, val, B->C(l));

    return cast<Instruction>(v32Result);
//////////////////////////////////////////////////////////////////////////
/// @brief Lower a meta gather (src, base, indices, mask, scale) per target:
///   - AVX:           full software emulation, one masked load per lane;
///   - AVX2 / AVX512+W256: native avx2 gather (double-pumped for 16-wide);
///   - AVX512+W512:   native avx512 gather with a 16-bit mask.
/// NOTE(review): the declaration of v32Gather (presumably `Value* v32Gather;`),
/// the leading `if (arch == AVX)` header, the `if (width == W256)` nesting
/// inside the AVX2 branch, and various braces are not visible in this chunk —
/// confirm against the full file.
Instruction* VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
    Builder* B = pThis->B;
    auto vSrc = pCallInst->getArgOperand(0);        // pass-through values for inactive lanes
    auto pBase = pCallInst->getArgOperand(1);       // base address (as integer)
    auto vi32Indices = pCallInst->getArgOperand(2); // per-lane element indices
    auto vi1Mask = pCallInst->getArgOperand(3);     // per-lane active mask (i1 vector)
    auto i8Scale = pCallInst->getArgOperand(4);     // byte scale applied to indices

    pBase = B->INT_TO_PTR(pBase, PointerType::get(B->mInt8Ty, 0));
    uint32_t numElem = vSrc->getType()->getVectorNumElements();
    auto i32Scale = B->Z_EXT(i8Scale, B->mInt32Ty);
    auto srcTy = vSrc->getType()->getVectorElementType();

    // Full emulation for AVX
    // Store source on stack to provide a valid address to load from inactive lanes
    auto pStack = B->STACKSAVE();
    auto pTmp = B->ALLOCA(vSrc->getType());
    B->STORE(vSrc, pTmp);

    v32Gather = UndefValue::get(vSrc->getType());
    auto vi32Scale = ConstantVector::getSplat(numElem, cast<ConstantInt>(i32Scale));
    auto vi32Offsets = B->MUL(vi32Indices, vi32Scale);

    // Per-lane: load from base+offset when the lane is active, otherwise
    // reload the original source value from the stack copy.
    for (uint32_t i = 0; i < numElem; ++i)
    auto i32Offset = B->VEXTRACT(vi32Offsets, B->C(i));
    auto pLoadAddress = B->GEP(pBase, i32Offset);
    pLoadAddress = B->BITCAST(pLoadAddress, PointerType::get(srcTy, 0));
    auto pMaskedLoadAddress = B->GEP(pTmp, { 0, i });
    auto i1Mask = B->VEXTRACT(vi1Mask, B->C(i));
    auto pValidAddress = B->SELECT(i1Mask, pLoadAddress, pMaskedLoadAddress);
    auto val = B->LOAD(pValidAddress);
    v32Gather = B->VINSERT(v32Gather, val, B->C(i));

    B->STACKRESTORE(pStack);

    else if (arch == AVX2 || (arch == AVX512 && width == W256))
    // Choose float vs int gather based on the element type.
    Function* pX86IntrinFunc = srcTy == B->mFP32Ty ? Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx2_gather_d_ps_256) :
        Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx2_gather_d_d_256);

    // avx2 gathers take a full-width vector mask rather than an i1 vector.
    auto v32Mask = B->BITCAST(B->VMASK(vi1Mask), vSrc->getType());
    v32Gather = B->CALL(pX86IntrinFunc, { vSrc, pBase, vi32Indices, v32Mask, i8Scale });

    else if (width == W512)
    // Double pump 8-wide
    auto v32Mask = B->BITCAST(B->VMASK_16(vi1Mask), vSrc->getType());
    Value* src0 = B->EXTRACT_16(vSrc, 0);
    Value* src1 = B->EXTRACT_16(vSrc, 1);

    Value* indices0 = B->EXTRACT_16(vi32Indices, 0);
    Value* indices1 = B->EXTRACT_16(vi32Indices, 1);

    Value* mask0 = B->EXTRACT_16(v32Mask, 0);
    Value* mask1 = B->EXTRACT_16(v32Mask, 1);

    Value* gather0 = B->CALL(pX86IntrinFunc, { src0, pBase, indices0, mask0, i8Scale });
    Value* gather1 = B->CALL(pX86IntrinFunc, { src1, pBase, indices1, mask1, i8Scale });

    v32Gather = B->JOIN_16(gather0, gather1);

    else if (arch == AVX512)
    // avx512 gathers take a compact 16-bit lane mask.
    auto i16Mask = B->BITCAST(vi1Mask, B->mInt16Ty);

    Function* pX86IntrinFunc = srcTy == B->mFP32Ty ? Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx512_gather_dps_512) :
        Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx512_gather_dpi_512);
    auto i32Scale = B->Z_EXT(i8Scale, B->mInt32Ty);
    v32Gather = B->CALL(pX86IntrinFunc, { vSrc, pBase, vi32Indices, i16Mask, i32Scale });

    return cast<Instruction>(v32Gather);
// Pull the pass type into scope for the registration macros below.
using namespace SwrJit;

// Register the pass with LLVM: argument/name "LowerX86"; not CFG-only, not an
// analysis pass. These macros also define initializeLowerX86Pass().
INITIALIZE_PASS_BEGIN(LowerX86, "LowerX86", "LowerX86", false, false)
INITIALIZE_PASS_END(LowerX86, "LowerX86", "LowerX86", false, false)