/****************************************************************************
 * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * @brief llvm pass to lower meta code to x86
 *
 ******************************************************************************/
#include "jit_pch.hpp"
#include "JitManager.h"

#include "common/simdlib.hpp"

#include <unordered_map>

extern "C" void ScatterPS_256(uint8_t*, SIMD256::Integer, SIMD256::Float, uint8_t, uint32_t);
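// ScatterPS_256 is a C helper that lives in the host driver binary rather than in the JIT'd
// module. Based on its signature and its use in VSCATTER_EMU below, it performs an 8-wide
// masked scatter of the float source to pBase + index * scale for each lane enabled in the
// 8-bit mask.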
// forward declare the initializer
void initializeLowerX86Pass(PassRegistry&);
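// Scaffolding sketch: the declarations below (namespace, target enums, IntrinsicID alias) are
// what the rest of this pass assumes. The names (AVX, AVX2, AVX512, W256, W512, NUM_WIDTHS)
// are the ones referenced throughout the file; the exact enumerator values are an assumption.
namespace SwrJit
{
using namespace llvm;

enum TargetArch
{
    AVX    = 0,
    AVX2   = 1,
    AVX512 = 2,
};

enum TargetWidth
{
    W256       = 0,
    W512       = 1,
    NUM_WIDTHS = 2,
};

struct LowerX86; // defined further below

using IntrinsicID = Intrinsic::ID;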
typedef std::function<Instruction*(LowerX86*, TargetArch, TargetWidth, CallInst*)> EmuFunc;
struct X86Intrinsic
{
    IntrinsicID intrin[NUM_WIDTHS];
    EmuFunc     emuFunc;
};
// Map of intrinsics that haven't been moved to the new mechanism yet. If used, these get the
// previous behavior of mapping directly to avx/avx2 intrinsics.
using intrinsicMap_t = std::map<std::string, IntrinsicID>;
static intrinsicMap_t& getIntrinsicMap() {
    static std::map<std::string, IntrinsicID> intrinsicMap = {
        {"meta.intrinsic.BEXTR_32", Intrinsic::x86_bmi_bextr_32},
        {"meta.intrinsic.VPSHUFB", Intrinsic::x86_avx2_pshuf_b},
        {"meta.intrinsic.VCVTPS2PH", Intrinsic::x86_vcvtps2ph_256},
        {"meta.intrinsic.VPTESTC", Intrinsic::x86_avx_ptestc_256},
        {"meta.intrinsic.VPTESTZ", Intrinsic::x86_avx_ptestz_256},
        {"meta.intrinsic.VPHADDD", Intrinsic::x86_avx2_phadd_d},
        {"meta.intrinsic.PDEP32", Intrinsic::x86_bmi_pdep_32},
        {"meta.intrinsic.RDTSC", Intrinsic::x86_rdtsc},
    };
    return intrinsicMap;
}
Instruction* NO_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
Instruction*
VPERM_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
Instruction*
VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
Instruction*
VSCATTER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
Instruction*
VROUND_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
Instruction*
VHSUB_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
Instruction*
VCONVERT_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);

Instruction* DOUBLE_EMU(LowerX86*     pThis,
                        TargetArch    arch,
                        TargetWidth   width,
                        CallInst*     pCallInst,
                        Intrinsic::ID intrin);

static Intrinsic::ID DOUBLE = (Intrinsic::ID)-1;
using intrinsicMapAdvanced_t = std::vector<std::map<std::string, X86Intrinsic>>;

static intrinsicMapAdvanced_t& getIntrinsicMapAdvanced()
{
    static intrinsicMapAdvanced_t intrinsicMapAdvanced = {
        {
            // AVX
            {"meta.intrinsic.VRCPPS",     {{Intrinsic::x86_avx_rcp_ps_256, DOUBLE}, NO_EMU}},
            {"meta.intrinsic.VPERMPS",    {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
            {"meta.intrinsic.VPERMD",     {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
            {"meta.intrinsic.VGATHERPD",  {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
            {"meta.intrinsic.VGATHERPS",  {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
            {"meta.intrinsic.VGATHERDD",  {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
            {"meta.intrinsic.VSCATTERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VSCATTER_EMU}},
            {"meta.intrinsic.VCVTPD2PS",  {{Intrinsic::x86_avx_cvt_pd2_ps_256, Intrinsic::not_intrinsic}, NO_EMU}},
            {"meta.intrinsic.VROUND",     {{Intrinsic::x86_avx_round_ps_256, DOUBLE}, NO_EMU}},
            {"meta.intrinsic.VHSUBPS",    {{Intrinsic::x86_avx_hsub_ps_256, DOUBLE}, NO_EMU}},
        },
        {
            // AVX2
            {"meta.intrinsic.VRCPPS",     {{Intrinsic::x86_avx_rcp_ps_256, DOUBLE}, NO_EMU}},
            {"meta.intrinsic.VPERMPS",    {{Intrinsic::x86_avx2_permps, Intrinsic::not_intrinsic}, VPERM_EMU}},
            {"meta.intrinsic.VPERMD",     {{Intrinsic::x86_avx2_permd, Intrinsic::not_intrinsic}, VPERM_EMU}},
            {"meta.intrinsic.VGATHERPD",  {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
            {"meta.intrinsic.VGATHERPS",  {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
            {"meta.intrinsic.VGATHERDD",  {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
            {"meta.intrinsic.VSCATTERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VSCATTER_EMU}},
            {"meta.intrinsic.VCVTPD2PS",  {{Intrinsic::x86_avx_cvt_pd2_ps_256, DOUBLE}, NO_EMU}},
            {"meta.intrinsic.VROUND",     {{Intrinsic::x86_avx_round_ps_256, DOUBLE}, NO_EMU}},
            {"meta.intrinsic.VHSUBPS",    {{Intrinsic::x86_avx_hsub_ps_256, DOUBLE}, NO_EMU}},
        },
        {
            // AVX512
            {"meta.intrinsic.VRCPPS",     {{Intrinsic::x86_avx512_rcp14_ps_256, Intrinsic::x86_avx512_rcp14_ps_512}, NO_EMU}},
#if LLVM_VERSION_MAJOR < 7
            {"meta.intrinsic.VPERMPS",    {{Intrinsic::x86_avx512_mask_permvar_sf_256, Intrinsic::x86_avx512_mask_permvar_sf_512}, NO_EMU}},
            {"meta.intrinsic.VPERMD",     {{Intrinsic::x86_avx512_mask_permvar_si_256, Intrinsic::x86_avx512_mask_permvar_si_512}, NO_EMU}},
#else
            {"meta.intrinsic.VPERMPS",    {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
            {"meta.intrinsic.VPERMD",     {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
#endif
            {"meta.intrinsic.VGATHERPD",  {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
            {"meta.intrinsic.VGATHERPS",  {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
            {"meta.intrinsic.VGATHERDD",  {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
            {"meta.intrinsic.VSCATTERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VSCATTER_EMU}},
#if LLVM_VERSION_MAJOR < 7
            {"meta.intrinsic.VCVTPD2PS",  {{Intrinsic::x86_avx512_mask_cvtpd2ps_256, Intrinsic::x86_avx512_mask_cvtpd2ps_512}, NO_EMU}},
#else
            {"meta.intrinsic.VCVTPD2PS",  {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VCONVERT_EMU}},
#endif
            {"meta.intrinsic.VROUND",     {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VROUND_EMU}},
            {"meta.intrinsic.VHSUBPS",    {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VHSUB_EMU}},
        }};

    return intrinsicMapAdvanced;
}
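// The table above is indexed first by TargetArch (AVX, AVX2, AVX512) and then by meta
// intrinsic name. Each X86Intrinsic entry holds the native intrinsic to use at 256- and
// 512-bit widths plus an emulation callback: Intrinsic::not_intrinsic routes the call to the
// callback, and the DOUBLE sentinel means "double pump the next smaller width" (see
// ProcessIntrinsicAdvanced and DOUBLE_EMU below).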
static uint32_t getBitWidth(VectorType* pVTy)
{
#if LLVM_VERSION_MAJOR >= 11
    return pVTy->getNumElements() * pVTy->getElementType()->getPrimitiveSizeInBits();
#else
    return pVTy->getBitWidth();
#endif
}
struct LowerX86 : public FunctionPass
{
    LowerX86(Builder* b = nullptr) : FunctionPass(ID), B(b)
    {
        initializeLowerX86Pass(*PassRegistry::getPassRegistry());

        // Determine target arch
        if (JM()->mArch.AVX512F())
        {
            mTarget = AVX512;
        }
        else if (JM()->mArch.AVX2())
        {
            mTarget = AVX2;
        }
        else if (JM()->mArch.AVX())
        {
            mTarget = AVX;
        }
        else
        {
            SWR_ASSERT(false, "Unsupported AVX architecture.");
        }

        // Setup scatter function for 256 wide
        uint32_t curWidth = B->mVWidth;
        B->SetTargetWidth(8);
        std::vector<Type*> args = {
            B->mInt8PtrTy,   // pBase
            B->mSimdInt32Ty, // vIndices
            B->mSimdFP32Ty,  // vSrc
            B->mInt8Ty,      // mask
            B->mInt32Ty,     // scale
        };

        FunctionType* pfnScatterTy = FunctionType::get(B->mVoidTy, args, false);
        mPfnScatter256 = cast<Function>(
#if LLVM_VERSION_MAJOR >= 9
            B->JM()->mpCurrentModule->getOrInsertFunction("ScatterPS_256", pfnScatterTy).getCallee());
#else
            B->JM()->mpCurrentModule->getOrInsertFunction("ScatterPS_256", pfnScatterTy));
#endif

        if (sys::DynamicLibrary::SearchForAddressOfSymbol("ScatterPS_256") == nullptr)
        {
            sys::DynamicLibrary::AddSymbol("ScatterPS_256", (void*)&ScatterPS_256);
        }

        B->SetTargetWidth(curWidth);
    }
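    // Note: registering ScatterPS_256 with sys::DynamicLibrary above lets the JIT resolve the
    // external call emitted by VSCATTER_EMU to the host function when the module is linked.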
    // Try to decipher the vector type of the instruction. This does not work properly
    // across all intrinsics, and will have to be rethought. Probably need something
    // similar to llvm's getDeclaration() utility to map a set of inputs to a specific typed
    // intrinsic.
    void GetRequestedWidthAndType(CallInst*       pCallInst,
                                  const StringRef intrinName,
                                  TargetWidth*    pWidth,
                                  Type**          pTy)
    {
        Type* pVecTy = pCallInst->getType();

        // Check for intrinsic specific types
        // VCVTPD2PS type comes from src, not dst
        if (intrinName.equals("meta.intrinsic.VCVTPD2PS"))
        {
            Value* pOp = pCallInst->getOperand(0);
            pVecTy = pOp->getType();
        }

        if (!pVecTy->isVectorTy())
        {
            for (auto& op : pCallInst->arg_operands())
            {
                if (op.get()->getType()->isVectorTy())
                {
                    pVecTy = op.get()->getType();
                    break;
                }
            }
        }
        SWR_ASSERT(pVecTy->isVectorTy(), "Couldn't determine vector size");

        uint32_t width = getBitWidth(cast<VectorType>(pVecTy));
        switch (width)
        {
        case 256:
            *pWidth = W256;
            break;
        case 512:
            *pWidth = W512;
            break;
        default:
            SWR_ASSERT(false, "Unhandled vector width %d", width);
        }

        *pTy = pVecTy->getScalarType();
    }
    Value* GetZeroVec(TargetWidth width, Type* pTy)
    {
        uint32_t numElem = 0;
        switch (width)
        {
        case W256:
            numElem = 8;
            break;
        case W512:
            numElem = 16;
            break;
        default:
            SWR_ASSERT(false, "Unhandled vector width type %d\n", width);
        }

        return ConstantVector::getNullValue(getVectorType(pTy, numElem));
    }
    Value* GetMask(TargetWidth width)
    {
        Value* mask = nullptr;
        switch (width)
        {
        case W256:
            mask = B->C((uint8_t)-1);
            break;
        case W512:
            mask = B->C((uint16_t)-1);
            break;
        default:
            SWR_ASSERT(false, "Unhandled vector width type %d\n", width);
        }
        return mask;
    }
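    // AVX512 masked intrinsics take an integer bitmask with one bit per lane, so a full mask
    // is (uint8_t)-1 for the 8 lanes of a 256-bit op and (uint16_t)-1 for the 16 lanes of a
    // 512-bit op.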
    // Convert <N x i1> mask to <N x i32> x86 mask
    Value* VectorMask(Value* vi1Mask)
    {
#if LLVM_VERSION_MAJOR >= 11
        uint32_t numElem = cast<VectorType>(vi1Mask->getType())->getNumElements();
#else
        uint32_t numElem = vi1Mask->getType()->getVectorNumElements();
#endif
        return B->S_EXT(vi1Mask, getVectorType(B->mInt32Ty, numElem));
    }
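    // The AVX/AVX2 gather instructions use the most significant bit of each 32-bit mask
    // element as the per-lane predicate, which is why the <N x i1> mask is sign extended to
    // all-ones / all-zeros elements here.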
    Instruction* ProcessIntrinsicAdvanced(CallInst* pCallInst)
    {
        Function* pFunc = pCallInst->getCalledFunction();

        auto&       intrinsic = getIntrinsicMapAdvanced()[mTarget][pFunc->getName().str()];
        TargetWidth vecWidth;
        Type*       pElemTy;
        GetRequestedWidthAndType(pCallInst, pFunc->getName(), &vecWidth, &pElemTy);

        // Check if there is a native intrinsic for this instruction
        IntrinsicID id = intrinsic.intrin[vecWidth];
        if (id == DOUBLE)
        {
            // Double pump the next smaller SIMD intrinsic
            SWR_ASSERT(vecWidth != 0, "Cannot double pump smallest SIMD width.");
            Intrinsic::ID id2 = intrinsic.intrin[vecWidth - 1];
            SWR_ASSERT(id2 != Intrinsic::not_intrinsic,
                       "Cannot find intrinsic to double pump.");
            return DOUBLE_EMU(this, mTarget, vecWidth, pCallInst, id2);
        }
        else if (id != Intrinsic::not_intrinsic)
        {
            Function* pIntrin = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, id);
            SmallVector<Value*, 8> args;
            for (auto& arg : pCallInst->arg_operands())
            {
                args.push_back(arg.get());
            }

            // If AVX512, all instructions add a src operand and mask. We'll pass in 0 src and
            // full mask for now, assuming the intrinsics are consistent and place the src
            // operand and mask last in the argument list.
            if (mTarget == AVX512)
            {
                if (pFunc->getName().equals("meta.intrinsic.VCVTPD2PS"))
                {
                    args.push_back(GetZeroVec(W256, pCallInst->getType()->getScalarType()));
                    args.push_back(GetMask(W256));
                    // for AVX512 VCVTPD2PS, we also have to add rounding mode
                    args.push_back(B->C(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
                }
                else
                {
                    args.push_back(GetZeroVec(vecWidth, pElemTy));
                    args.push_back(GetMask(vecWidth));
                }
            }

            return B->CALLA(pIntrin, args);
        }
        else
        {
            // No native intrinsic, call emulation function
            return intrinsic.emuFunc(this, mTarget, vecWidth, pCallInst);
        }
    }
    Instruction* ProcessIntrinsic(CallInst* pCallInst)
    {
        Function* pFunc = pCallInst->getCalledFunction();

        // Forward to the advanced support if found
        if (getIntrinsicMapAdvanced()[mTarget].find(pFunc->getName().str()) !=
            getIntrinsicMapAdvanced()[mTarget].end())
        {
            return ProcessIntrinsicAdvanced(pCallInst);
        }

        SWR_ASSERT(getIntrinsicMap().find(pFunc->getName().str()) != getIntrinsicMap().end(),
                   "Unimplemented intrinsic %s.",
                   pFunc->getName().str().c_str());

        Intrinsic::ID x86Intrinsic = getIntrinsicMap()[pFunc->getName().str()];
        Function*     pX86IntrinFunc =
            Intrinsic::getDeclaration(B->JM()->mpCurrentModule, x86Intrinsic);

        SmallVector<Value*, 8> args;
        for (auto& arg : pCallInst->arg_operands())
        {
            args.push_back(arg.get());
        }
        return B->CALLA(pX86IntrinFunc, args);
    }
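    // Illustrative example (hypothetical IR, for exposition only): on an AVX2 target a call
    // such as
    //     %r = call <8 x float> @"meta.intrinsic.VRCPPS"(<8 x float> %a)
    // is rewritten via the advanced table into
    //     %r = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %a)
    // while names only present in getIntrinsicMap() (e.g. meta.intrinsic.VPSHUFB) are mapped
    // directly to their avx/avx2 intrinsic here.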
    //////////////////////////////////////////////////////////////////////////
    /// @brief LLVM function pass run method.
    /// @param F - The function we're working on with this pass.
    virtual bool runOnFunction(Function& F)
    {
        std::vector<Instruction*> toRemove;
        std::vector<BasicBlock*>  bbs;

        // Make temp copy of the basic blocks and instructions, as the intrinsic
        // replacement code might invalidate the iterators
        for (auto& b : F.getBasicBlockList())
        {
            bbs.push_back(&b);
        }

        for (auto* BB : bbs)
        {
            std::vector<Instruction*> insts;
            for (auto& i : BB->getInstList())
            {
                insts.push_back(&i);
            }

            for (auto* I : insts)
            {
                if (CallInst* pCallInst = dyn_cast<CallInst>(I))
                {
                    Function* pFunc = pCallInst->getCalledFunction();
                    if (pFunc)
                    {
                        if (pFunc->getName().startswith("meta.intrinsic"))
                        {
                            B->IRB()->SetInsertPoint(I);
                            Instruction* pReplace = ProcessIntrinsic(pCallInst);
                            toRemove.push_back(pCallInst);
                            if (pReplace)
                            {
                                pCallInst->replaceAllUsesWith(pReplace);
                            }
                        }
                    }
                }
            }
        }

        for (auto* pInst : toRemove)
        {
            pInst->eraseFromParent();
        }

        JitManager::DumpToFile(&F, "lowerx86");

        return true;
    }
    virtual void getAnalysisUsage(AnalysisUsage& AU) const {}

    JitManager* JM() { return B->JM(); }

    Builder*   B;
    TargetArch mTarget;
    Function*  mPfnScatter256;

    static char ID; ///< Needed by LLVM to generate ID for FunctionPass.
};

char LowerX86::ID = 0; // LLVM uses address of ID as the actual ID.
FunctionPass* createLowerX86Pass(Builder* b) { return new LowerX86(b); }

Instruction* NO_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
{
    SWR_ASSERT(false, "Unimplemented intrinsic emulation.");
    return nullptr;
}
Instruction* VPERM_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
{
    // Only need vperm emulation for AVX
    SWR_ASSERT(arch == AVX);

    Builder* B = pThis->B;
    auto     v32A = pCallInst->getArgOperand(0);
    auto     vi32Index = pCallInst->getArgOperand(1);

    Value* v32Result;
    if (isa<Constant>(vi32Index))
    {
        // Can use llvm shuffle vector directly with constant shuffle indices
        v32Result = B->VSHUFFLE(v32A, v32A, vi32Index);
    }
    else
    {
        v32Result = UndefValue::get(v32A->getType());
#if LLVM_VERSION_MAJOR >= 11
        uint32_t numElem = cast<VectorType>(v32A->getType())->getNumElements();
#else
        uint32_t numElem = v32A->getType()->getVectorNumElements();
#endif
        for (uint32_t l = 0; l < numElem; ++l)
        {
            auto i32Index = B->VEXTRACT(vi32Index, B->C(l));
            auto val      = B->VEXTRACT(v32A, i32Index);
            v32Result     = B->VINSERT(v32Result, val, B->C(l));
        }
    }

    return cast<Instruction>(v32Result);
}
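// Note: AVX1 has no cross-lane variable permute (vpermps/vpermd are AVX2 instructions), which
// is why the emulation above either folds a constant index vector into a plain shufflevector
// or scalarizes the permute with per-lane extract/insert.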
Instruction*
VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
{
    Builder* B = pThis->B;
    auto     vSrc = pCallInst->getArgOperand(0);
    auto     pBase = pCallInst->getArgOperand(1);
    auto     vi32Indices = pCallInst->getArgOperand(2);
    auto     vi1Mask = pCallInst->getArgOperand(3);
    auto     i8Scale = pCallInst->getArgOperand(4);

    pBase = B->POINTER_CAST(pBase, PointerType::get(B->mInt8Ty, 0));
#if LLVM_VERSION_MAJOR >= 11
    VectorType* pVectorType = cast<VectorType>(vSrc->getType());
    uint32_t    numElem = pVectorType->getNumElements();
    auto        srcTy = pVectorType->getElementType();
#else
    uint32_t numElem = vSrc->getType()->getVectorNumElements();
    auto     srcTy = vSrc->getType()->getVectorElementType();
#endif
    auto i32Scale = B->Z_EXT(i8Scale, B->mInt32Ty);

    Value* v32Gather = nullptr;
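    // Lane-by-lane semantics being lowered: for each lane i,
    //     dst[i] = vi1Mask[i] ? *(srcTy*)(pBase + vi32Indices[i] * scale) : vSrc[i]
    // The per-arch paths below only differ in how that is expressed.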
    if (arch == AVX)
    {
        // Full emulation for AVX
        // Store source on stack to provide a valid address to load from inactive lanes
        auto pStack = B->STACKSAVE();
        auto pTmp   = B->ALLOCA(vSrc->getType());
        B->STORE(vSrc, pTmp);

        v32Gather = UndefValue::get(vSrc->getType());
#if LLVM_VERSION_MAJOR > 10
        auto vi32Scale = ConstantVector::getSplat(ElementCount::get(numElem, false),
                                                  cast<ConstantInt>(i32Scale));
#else
        auto vi32Scale = ConstantVector::getSplat(numElem, cast<ConstantInt>(i32Scale));
#endif
        auto vi32Offsets = B->MUL(vi32Indices, vi32Scale);

        for (uint32_t i = 0; i < numElem; ++i)
        {
            auto i32Offset          = B->VEXTRACT(vi32Offsets, B->C(i));
            auto pLoadAddress       = B->GEP(pBase, i32Offset);
            pLoadAddress            = B->BITCAST(pLoadAddress, PointerType::get(srcTy, 0));
            auto pMaskedLoadAddress = B->GEP(pTmp, {0, i});
            auto i1Mask             = B->VEXTRACT(vi1Mask, B->C(i));
            auto pValidAddress      = B->SELECT(i1Mask, pLoadAddress, pMaskedLoadAddress);
            auto val                = B->LOAD(pValidAddress);
            v32Gather               = B->VINSERT(v32Gather, val, B->C(i));
        }

        B->STACKRESTORE(pStack);
    }
    else if (arch == AVX2 || (arch == AVX512 && width == W256))
    {
        Function* pX86IntrinFunc = nullptr;
        if (srcTy == B->mFP32Ty)
        {
            pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                       Intrinsic::x86_avx2_gather_d_ps_256);
        }
        else if (srcTy == B->mInt32Ty)
        {
            pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                       Intrinsic::x86_avx2_gather_d_d_256);
        }
        else if (srcTy == B->mDoubleTy)
        {
            pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                       Intrinsic::x86_avx2_gather_d_q_256);
        }
        else
        {
            SWR_ASSERT(false, "Unsupported vector element type for gather.");
        }

        if (width == W256)
        {
            auto v32Mask = B->BITCAST(pThis->VectorMask(vi1Mask), vSrc->getType());
            v32Gather    = B->CALL(pX86IntrinFunc, {vSrc, pBase, vi32Indices, v32Mask, i8Scale});
        }
        else if (width == W512)
        {
            // Double pump 4-wide for 64bit elements
#if LLVM_VERSION_MAJOR >= 11
            if (cast<VectorType>(vSrc->getType())->getElementType() == B->mDoubleTy)
#else
            if (vSrc->getType()->getVectorElementType() == B->mDoubleTy)
#endif
            {
                auto v64Mask = pThis->VectorMask(vi1Mask);
#if LLVM_VERSION_MAJOR >= 11
                uint32_t numElem = cast<VectorType>(v64Mask->getType())->getNumElements();
#else
                uint32_t numElem = v64Mask->getType()->getVectorNumElements();
#endif
                v64Mask = B->S_EXT(v64Mask, getVectorType(B->mInt64Ty, numElem));
                v64Mask = B->BITCAST(v64Mask, vSrc->getType());

                Value* src0 = B->VSHUFFLE(vSrc, vSrc, B->C({0, 1, 2, 3}));
                Value* src1 = B->VSHUFFLE(vSrc, vSrc, B->C({4, 5, 6, 7}));

                Value* indices0 = B->VSHUFFLE(vi32Indices, vi32Indices, B->C({0, 1, 2, 3}));
                Value* indices1 = B->VSHUFFLE(vi32Indices, vi32Indices, B->C({4, 5, 6, 7}));

                Value* mask0 = B->VSHUFFLE(v64Mask, v64Mask, B->C({0, 1, 2, 3}));
                Value* mask1 = B->VSHUFFLE(v64Mask, v64Mask, B->C({4, 5, 6, 7}));

#if LLVM_VERSION_MAJOR >= 11
                uint32_t numElemSrc0  = cast<VectorType>(src0->getType())->getNumElements();
                uint32_t numElemMask0 = cast<VectorType>(mask0->getType())->getNumElements();
                uint32_t numElemSrc1  = cast<VectorType>(src1->getType())->getNumElements();
                uint32_t numElemMask1 = cast<VectorType>(mask1->getType())->getNumElements();
#else
                uint32_t numElemSrc0  = src0->getType()->getVectorNumElements();
                uint32_t numElemMask0 = mask0->getType()->getVectorNumElements();
                uint32_t numElemSrc1  = src1->getType()->getVectorNumElements();
                uint32_t numElemMask1 = mask1->getType()->getVectorNumElements();
#endif
                src0  = B->BITCAST(src0, getVectorType(B->mInt64Ty, numElemSrc0));
                mask0 = B->BITCAST(mask0, getVectorType(B->mInt64Ty, numElemMask0));
                Value* gather0 =
                    B->CALL(pX86IntrinFunc, {src0, pBase, indices0, mask0, i8Scale});
                src1  = B->BITCAST(src1, getVectorType(B->mInt64Ty, numElemSrc1));
                mask1 = B->BITCAST(mask1, getVectorType(B->mInt64Ty, numElemMask1));
                Value* gather1 =
                    B->CALL(pX86IntrinFunc, {src1, pBase, indices1, mask1, i8Scale});

                v32Gather = B->VSHUFFLE(gather0, gather1, B->C({0, 1, 2, 3, 4, 5, 6, 7}));
                v32Gather = B->BITCAST(v32Gather, vSrc->getType());
            }
            else
            {
                // Double pump 8-wide for 32bit elements
                auto v32Mask = pThis->VectorMask(vi1Mask);
                v32Mask      = B->BITCAST(v32Mask, vSrc->getType());
                Value* src0  = B->EXTRACT_16(vSrc, 0);
                Value* src1  = B->EXTRACT_16(vSrc, 1);

                Value* indices0 = B->EXTRACT_16(vi32Indices, 0);
                Value* indices1 = B->EXTRACT_16(vi32Indices, 1);

                Value* mask0 = B->EXTRACT_16(v32Mask, 0);
                Value* mask1 = B->EXTRACT_16(v32Mask, 1);

                Value* gather0 =
                    B->CALL(pX86IntrinFunc, {src0, pBase, indices0, mask0, i8Scale});
                Value* gather1 =
                    B->CALL(pX86IntrinFunc, {src1, pBase, indices1, mask1, i8Scale});

                v32Gather = B->JOIN_16(gather0, gather1);
            }
        }
    }
    else if (arch == AVX512)
    {
        Value*    iMask = nullptr;
        Function* pX86IntrinFunc = nullptr;
        if (srcTy == B->mFP32Ty)
        {
            pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                       Intrinsic::x86_avx512_gather_dps_512);
            iMask          = B->BITCAST(vi1Mask, B->mInt16Ty);
        }
        else if (srcTy == B->mInt32Ty)
        {
            pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                       Intrinsic::x86_avx512_gather_dpi_512);
            iMask          = B->BITCAST(vi1Mask, B->mInt16Ty);
        }
        else if (srcTy == B->mDoubleTy)
        {
            pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                       Intrinsic::x86_avx512_gather_dpd_512);
            iMask          = B->BITCAST(vi1Mask, B->mInt8Ty);
        }
        else
        {
            SWR_ASSERT(false, "Unsupported vector element type for gather.");
        }

        auto i32Scale = B->Z_EXT(i8Scale, B->mInt32Ty);
        v32Gather     = B->CALL(pX86IntrinFunc, {vSrc, pBase, vi32Indices, iMask, i32Scale});
    }

    return cast<Instruction>(v32Gather);
}
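// Scatter counterpart of the gather above: for each lane i with vi1Mask[i] set, store
// v32Src[i] to pBase + vi32Indices[i] * scale. Pre-AVX512 targets call the ScatterPS_256 C
// helper declared at the top of the file; AVX512 uses the native scatter intrinsics.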
Instruction*
VSCATTER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
{
    Builder* B = pThis->B;
    auto     pBase = pCallInst->getArgOperand(0);
    auto     vi1Mask = pCallInst->getArgOperand(1);
    auto     vi32Indices = pCallInst->getArgOperand(2);
    auto     v32Src = pCallInst->getArgOperand(3);
    auto     i32Scale = pCallInst->getArgOperand(4);

    if (arch != AVX512)
    {
        // Call into C function to do the scatter. This has significantly better compile perf
        // compared to jitting scatter loops for every scatter
        if (width == W256)
        {
            auto mask = B->BITCAST(vi1Mask, B->mInt8Ty);
            B->CALL(pThis->mPfnScatter256, {pBase, vi32Indices, v32Src, mask, i32Scale});
        }
        else
        {
            // Need to break up 512 wide scatter to two 256 wide
            auto maskLo = B->VSHUFFLE(vi1Mask, vi1Mask, B->C({0, 1, 2, 3, 4, 5, 6, 7}));
            auto indicesLo =
                B->VSHUFFLE(vi32Indices, vi32Indices, B->C({0, 1, 2, 3, 4, 5, 6, 7}));
            auto srcLo = B->VSHUFFLE(v32Src, v32Src, B->C({0, 1, 2, 3, 4, 5, 6, 7}));

            auto mask = B->BITCAST(maskLo, B->mInt8Ty);
            B->CALL(pThis->mPfnScatter256, {pBase, indicesLo, srcLo, mask, i32Scale});

            auto maskHi = B->VSHUFFLE(vi1Mask, vi1Mask, B->C({8, 9, 10, 11, 12, 13, 14, 15}));
            auto indicesHi =
                B->VSHUFFLE(vi32Indices, vi32Indices, B->C({8, 9, 10, 11, 12, 13, 14, 15}));
            auto srcHi = B->VSHUFFLE(v32Src, v32Src, B->C({8, 9, 10, 11, 12, 13, 14, 15}));

            mask = B->BITCAST(maskHi, B->mInt8Ty);
            B->CALL(pThis->mPfnScatter256, {pBase, indicesHi, srcHi, mask, i32Scale});
        }
        return nullptr;
    }

    Function* pX86IntrinFunc;
    Value*    iMask;
    if (width == W256)
    {
        // No direct intrinsic supported in llvm to scatter 8 elem with 32bit indices, but we
        // can use the scatter of 8 elements with 64bit indices
        pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                   Intrinsic::x86_avx512_scatter_qps_512);

        auto vi32IndicesExt = B->Z_EXT(vi32Indices, B->mSimdInt64Ty);
        iMask               = B->BITCAST(vi1Mask, B->mInt8Ty);
        B->CALL(pX86IntrinFunc, {pBase, iMask, vi32IndicesExt, v32Src, i32Scale});
    }
    else if (width == W512)
    {
        pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                   Intrinsic::x86_avx512_scatter_dps_512);
        iMask          = B->BITCAST(vi1Mask, B->mInt16Ty);
        B->CALL(pX86IntrinFunc, {pBase, iMask, vi32Indices, v32Src, i32Scale});
    }
    return nullptr;
}
// No support for vroundps in avx512 (it is available in kncni), so emulate with avx
// instructions
Instruction*
VROUND_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
{
    SWR_ASSERT(arch == AVX512);

    auto B = pThis->B;
    auto vf32Src = pCallInst->getOperand(0);
    auto i8Round = pCallInst->getOperand(1);
    auto pfnFunc =
        Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx_round_ps_256);

    if (width == W256)
    {
        return cast<Instruction>(B->CALL2(pfnFunc, vf32Src, i8Round));
    }
    else if (width == W512)
    {
        auto v8f32SrcLo = B->EXTRACT_16(vf32Src, 0);
        auto v8f32SrcHi = B->EXTRACT_16(vf32Src, 1);

        auto v8f32ResLo = B->CALL2(pfnFunc, v8f32SrcLo, i8Round);
        auto v8f32ResHi = B->CALL2(pfnFunc, v8f32SrcHi, i8Round);

        return cast<Instruction>(B->JOIN_16(v8f32ResLo, v8f32ResHi));
    }
    else
    {
        SWR_ASSERT(false, "Unimplemented vector width.");
    }

    return nullptr;
}
Instruction*
VCONVERT_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
{
    SWR_ASSERT(arch == AVX512);

    auto B = pThis->B;
    auto vf32Src = pCallInst->getOperand(0);

    if (width == W256)
    {
        auto vf32SrcRound = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                      Intrinsic::x86_avx_round_ps_256);
        return cast<Instruction>(B->FP_TRUNC(vf32SrcRound, B->mFP32Ty));
    }
    else if (width == W512)
    {
        // 512 can use intrinsic
        auto pfnFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
                                                 Intrinsic::x86_avx512_mask_cvtpd2ps_512);
        return cast<Instruction>(B->CALL(pfnFunc, vf32Src));
    }
    else
    {
        SWR_ASSERT(false, "Unimplemented vector width.");
    }

    return nullptr;
}
// No support for hsub in AVX512
Instruction* VHSUB_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
{
    SWR_ASSERT(arch == AVX512);

    auto B = pThis->B;
    auto src0 = pCallInst->getOperand(0);
    auto src1 = pCallInst->getOperand(1);

    // 256b hsub can just use avx intrinsic
    if (width == W256)
    {
        auto pX86IntrinFunc =
            Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx_hsub_ps_256);
        return cast<Instruction>(B->CALL2(pX86IntrinFunc, src0, src1));
    }
    else if (width == W512)
    {
        // 512b hsub can be accomplished with shuf/sub combo
        auto minuend    = B->VSHUFFLE(src0, src1, B->C({0, 2, 8, 10, 4, 6, 12, 14}));
        auto subtrahend = B->VSHUFFLE(src0, src1, B->C({1, 3, 9, 11, 5, 7, 13, 15}));
        return cast<Instruction>(B->SUB(minuend, subtrahend));
    }
    else
    {
        SWR_ASSERT(false, "Unimplemented vector width.");
    }

    return nullptr;
}
// Double pump input using Intrin template arg. This blindly extracts lower and upper 256 from
// each vector argument and calls the 256 wide intrinsic, then merges the results to 512 wide
Instruction* DOUBLE_EMU(LowerX86*     pThis,
                        TargetArch    arch,
                        TargetWidth   width,
                        CallInst*     pCallInst,
                        Intrinsic::ID intrin)
{
    auto B = pThis->B;
    SWR_ASSERT(width == W512);

    Value*    result[2];
    Function* pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, intrin);
    for (uint32_t i = 0; i < 2; ++i)
    {
        SmallVector<Value*, 8> args;
        for (auto& arg : pCallInst->arg_operands())
        {
            auto argType = arg.get()->getType();
            if (argType->isVectorTy())
            {
#if LLVM_VERSION_MAJOR >= 11
                uint32_t vecWidth = cast<VectorType>(argType)->getNumElements();
                auto     elemTy   = cast<VectorType>(argType)->getElementType();
#else
                uint32_t vecWidth = argType->getVectorNumElements();
                auto     elemTy   = argType->getVectorElementType();
#endif
                Value* lanes     = B->CInc<int>(i * vecWidth / 2, vecWidth / 2);
                Value* argToPush = B->VSHUFFLE(arg.get(), B->VUNDEF(elemTy, vecWidth), lanes);
                args.push_back(argToPush);
            }
            else
            {
                args.push_back(arg.get());
            }
        }
        result[i] = B->CALLA(pX86IntrinFunc, args);
    }

    uint32_t vecWidth;
    if (result[0]->getType()->isVectorTy())
    {
        assert(result[1]->getType()->isVectorTy());
#if LLVM_VERSION_MAJOR >= 11
        vecWidth = cast<VectorType>(result[0]->getType())->getNumElements() +
                   cast<VectorType>(result[1]->getType())->getNumElements();
#else
        vecWidth = result[0]->getType()->getVectorNumElements() +
                   result[1]->getType()->getVectorNumElements();
#endif
    }

    Value* lanes = B->CInc<int>(0, vecWidth);
    return cast<Instruction>(B->VSHUFFLE(result[0], result[1], lanes));
}
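// Example of the double-pump path: a 512-wide meta.intrinsic.VROUND on an AVX/AVX2 target
// (table entry {x86_avx_round_ps_256, DOUBLE}) reaches DOUBLE_EMU with the 256-wide round
// intrinsic, which is called once on the low and once on the high 256-bit half of the source,
// and the two results are merged back into a single 512-wide vector with a shuffle.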
} // namespace SwrJit

using namespace SwrJit;

INITIALIZE_PASS_BEGIN(LowerX86, "LowerX86", "LowerX86", false, false)
INITIALIZE_PASS_END(LowerX86, "LowerX86", "LowerX86", false, false)