//===-- AMDILPeepholeOptimizer.cpp - AMDIL Peephole optimizations ---------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//==-----------------------------------------------------------------------===//
10 #include "AMDILDevices.h"
11 #include "AMDGPUInstrInfo.h"
12 #include "llvm/ADT/Statistic.h"
13 #include "llvm/ADT/StringExtras.h"
14 #include "llvm/ADT/StringRef.h"
15 #include "llvm/ADT/Twine.h"
16 #include "llvm/Constants.h"
17 #include "llvm/CodeGen/MachineFunction.h"
18 #include "llvm/CodeGen/MachineFunctionAnalysis.h"
19 #include "llvm/Function.h"
20 #include "llvm/Instructions.h"
21 #include "llvm/Module.h"
22 #include "llvm/Support/Debug.h"
23 #include "llvm/Support/MathExtras.h"
STATISTIC(PointerAssignments, "Number of dynamic pointer "
          "assignments discovered");
STATISTIC(PointerSubtract, "Number of pointer subtractions discovered");
// The Peephole optimization pass is used to do simple last minute
// optimizations that are required for correct code or to remove redundant
// functions.
class LLVM_LIBRARY_VISIBILITY AMDGPUPeepholeOpt : public FunctionPass {
  AMDGPUPeepholeOpt(TargetMachine &tm);
  const char *getPassName() const;
  bool runOnFunction(Function &F);
  bool doInitialization(Module &M);
  bool doFinalization(Module &M);
  void getAnalysisUsage(AnalysisUsage &AU) const;
  // Function to initiate all of the instruction level optimizations.
  bool instLevelOptimizations(BasicBlock::iterator *inst);
  // Quick check to see if we need to dump all of the pointers into the
  // arena. If this is correct, then we set all pointers to exist in the arena.
  // This is a workaround for aliasing of pointers in a struct/union.
  bool dumpAllIntoArena(Function &F);
  // Because we don't want to invalidate any pointers while in the
  // safeNestedForEach function, we push atomic conversions to a vector and
  // handle them later. This function does the conversions if required.
  void doAtomicConversionIfNeeded(Function &F);
  // Because __amdil_is_constant cannot be properly evaluated if
  // optimizations are disabled, the calls are placed in a vector
  // and evaluated after the __amdil_image* functions are evaluated,
  // which should allow the __amdil_is_constant function to be
  // evaluated correctly.
  void doIsConstCallConversionIfNeeded();
  CodeGenOpt::Level optLevel;
  // Run a series of tests to see if we can optimize a CALL instruction.
  bool optimizeCallInst(BasicBlock::iterator *bbb);
  // A peephole optimization to optimize bit extract sequences.
  bool optimizeBitExtract(Instruction *inst);
  // A peephole optimization to optimize bit insert sequences.
  bool optimizeBitInsert(Instruction *inst);
  bool setupBitInsert(Instruction *base,
                      Instruction *&src,
                      Constant *&mask,
                      Constant *&shift);
  // Expand the bit field insert instruction on versions of OpenCL that
  // don't support it.
  bool expandBFI(CallInst *CI);
  // Expand the bit field mask instruction on versions of OpenCL that
  // don't support it.
  bool expandBFM(CallInst *CI);
  // On 7XX and 8XX hardware we do not have 24 bit signed operations, so in
  // this case we need to expand them. These functions check for 24 bit
  // functions and then expand them.
  bool isSigned24BitOps(CallInst *CI);
  void expandSigned24BitOps(CallInst *CI);
  // One optimization that can occur is that if the required workgroup size is
  // specified then the result of get_local_size is known at compile time and
  // can be returned accordingly.
  bool isRWGLocalOpt(CallInst *CI);
  // On northern island cards the division is slightly less accurate than on
  // previous generations, so we need to utilize a more accurate division. We
  // can translate the accurate divide to a normal divide on all other cards.
  bool convertAccurateDivide(CallInst *CI);
  void expandAccurateDivide(CallInst *CI);
  // If the alignment is set incorrectly, it can produce really inefficient
  // code. This checks for this scenario and fixes it if possible.
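  // Illustrative note (our gloss, not from the original source): a mem-op
  // whose type size is less than or equal to its stated alignment is left
  // alone, while an under-aligned struct access (size > alignment) has its
  // alignment reset to 0 (the ABI default) in correctMisalignedMemOp below.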
  bool correctMisalignedMemOp(Instruction *inst);
  // If we are in no-opt mode, then we need to make sure that
  // local samplers are properly propagated as constant propagation
  // doesn't occur and we need to know the value of kernel defined
  // samplers at compile time.
  bool propagateSamplerInst(CallInst *CI);
  // Group of functions that recursively calculate the size of a structure
  // based on its sub-types.
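  // For example (illustration only, ignoring alignment padding): a struct
  // containing an i32, a [2 x i16] array, and a float totals
  // 4 + (2 * 2) + 4 = 12 bytes under this recursive walk.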
  size_t getTypeSize(Type * const T, bool dereferencePtr = false);
  size_t getTypeSize(StructType * const ST, bool dereferencePtr = false);
  size_t getTypeSize(IntegerType * const IT, bool dereferencePtr = false);
  size_t getTypeSize(FunctionType * const FT, bool dereferencePtr = false);
  size_t getTypeSize(ArrayType * const AT, bool dereferencePtr = false);
  size_t getTypeSize(VectorType * const VT, bool dereferencePtr = false);
  size_t getTypeSize(PointerType * const PT, bool dereferencePtr = false);
  size_t getTypeSize(OpaqueType * const OT, bool dereferencePtr = false);
  const AMDGPUSubtarget *mSTM;
  SmallVector<std::pair<CallInst *, Function *>, 16> atomicFuncs;
  SmallVector<CallInst *, 16> isConstVec;
}; // class AMDGPUPeepholeOpt
char AMDGPUPeepholeOpt::ID = 0;
// A template function that has two levels of looping before calling the
// function with a pointer to the current iterator.
template<class InputIterator, class SecondIterator, class Function>
Function safeNestedForEach(InputIterator First, InputIterator Last,
                           SecondIterator S, Function F)
  for ( ; First != Last; ++First) {
    SecondIterator sf, sl;
    for (sf = First->begin(), sl = First->end();
} // anonymous namespace

createAMDGPUPeepholeOpt(TargetMachine &tm)
  return new AMDGPUPeepholeOpt(tm);

AMDGPUPeepholeOpt::AMDGPUPeepholeOpt(TargetMachine &tm)
  : FunctionPass(ID), TM(tm)
  optLevel = TM.getOptLevel();

AMDGPUPeepholeOpt::~AMDGPUPeepholeOpt()

AMDGPUPeepholeOpt::getPassName() const
  return "AMDGPU PeepHole Optimization Pass";
containsPointerType(Type *Ty)
  switch(Ty->getTypeID()) {
  case Type::StructTyID: {
    const StructType *ST = dyn_cast<StructType>(Ty);
    for (StructType::element_iterator stb = ST->element_begin(),
         ste = ST->element_end(); stb != ste; ++stb) {
      if (!containsPointerType(*stb)) {
  case Type::VectorTyID:
  case Type::ArrayTyID:
    return containsPointerType(dyn_cast<SequentialType>(Ty)->getElementType());
  case Type::PointerTyID:
AMDGPUPeepholeOpt::dumpAllIntoArena(Function &F)
  bool dumpAll = false;
  for (Function::const_arg_iterator cab = F.arg_begin(),
       cae = F.arg_end(); cab != cae; ++cab) {
    const Argument *arg = cab;
    const PointerType *PT = dyn_cast<PointerType>(arg->getType());
    Type *DereferencedType = PT->getElementType();
    if (!dyn_cast<StructType>(DereferencedType)
    if (!containsPointerType(DereferencedType)) {
    // FIXME: Because a pointer inside of a struct/union may be aliased to
    // another pointer, we need to take the conservative approach and place
    // all pointers into the arena until more advanced detection is
    // implemented.
AMDGPUPeepholeOpt::doIsConstCallConversionIfNeeded()
  if (isConstVec.empty()) {
  for (unsigned x = 0, y = isConstVec.size(); x < y; ++x) {
    CallInst *CI = isConstVec[x];
    Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
    Type *aType = Type::getInt32Ty(*mCTX);
    Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
                              : ConstantInt::get(aType, 0);
    CI->replaceAllUsesWith(Val);
    CI->eraseFromParent();
AMDGPUPeepholeOpt::doAtomicConversionIfNeeded(Function &F)
  // Don't do anything if we don't have any atomic operations.
  if (atomicFuncs.empty()) {
  // Change the function name for the atomic if it is required.
  uint32_t size = atomicFuncs.size();
  for (uint32_t x = 0; x < size; ++x) {
    atomicFuncs[x].first->setOperand(
        atomicFuncs[x].first->getNumOperands()-1,
        atomicFuncs[x].second);
  if (mConvertAtomics) {
AMDGPUPeepholeOpt::runOnFunction(Function &MF)
  mSTM = &TM.getSubtarget<AMDGPUSubtarget>();
  mCTX = &MF.getType()->getContext();
  mConvertAtomics = true;
  safeNestedForEach(MF.begin(), MF.end(), MF.begin()->begin(),
      std::bind1st(std::mem_fun(&AMDGPUPeepholeOpt::instLevelOptimizations),
  doAtomicConversionIfNeeded(MF);
  doIsConstCallConversionIfNeeded();
AMDGPUPeepholeOpt::optimizeCallInst(BasicBlock::iterator *bbb)
  Instruction *inst = (*bbb);
  CallInst *CI = dyn_cast<CallInst>(inst);
  if (isSigned24BitOps(CI)) {
    expandSigned24BitOps(CI);
    CI->eraseFromParent();
  if (propagateSamplerInst(CI)) {
  if (expandBFI(CI) || expandBFM(CI)) {
    CI->eraseFromParent();
  if (convertAccurateDivide(CI)) {
    expandAccurateDivide(CI);
    CI->eraseFromParent();

  StringRef calleeName = CI->getOperand(CI->getNumOperands()-1)->getName();
  if (calleeName.startswith("__amdil_is_constant")) {
    // If we do not have optimizations, then this
    // cannot be properly evaluated, so we add the
    // call instruction to a vector and process
    // it at the end of processing, after the
    // samplers have been correctly handled.
    if (optLevel == CodeGenOpt::None) {
      isConstVec.push_back(CI);
    Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
    Type *aType = Type::getInt32Ty(*mCTX);
    Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
                              : ConstantInt::get(aType, 0);
    CI->replaceAllUsesWith(Val);
    CI->eraseFromParent();

  if (calleeName.equals("__amdil_is_asic_id_i32")) {
    ConstantInt *CV = dyn_cast<ConstantInt>(CI->getOperand(0));
    Type *aType = Type::getInt32Ty(*mCTX);
    if (CV) {
      Val = ConstantInt::get(aType,
          mSTM->device()->getDeviceFlag() & CV->getZExtValue());
    } else {
      Val = ConstantInt::get(aType, 0);
    }
    CI->replaceAllUsesWith(Val);
    CI->eraseFromParent();
  Function *F = dyn_cast<Function>(CI->getOperand(CI->getNumOperands()-1));
  if (F->getName().startswith("__atom") && !CI->getNumUses()
      && F->getName().find("_xchg") == StringRef::npos) {
    std::string buffer(F->getName().str() + "_noret");
    F = dyn_cast<Function>(
        F->getParent()->getOrInsertFunction(buffer, F->getFunctionType()));
    atomicFuncs.push_back(std::make_pair<CallInst *, Function *>(CI, F));
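    // Illustrative sketch (the builtin spelling here is an assumption, not
    // verified from the original headers): an atomic call such as
    // __atom_global_add whose result is never used is redirected to
    // __atom_global_add_noret, letting codegen drop the return-value
    // plumbing.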
  if (!mSTM->device()->isSupported(AMDGPUDeviceInfo::ArenaSegment)
      && !mSTM->device()->isSupported(AMDGPUDeviceInfo::MultiUAV)) {
  if (!mConvertAtomics) {
  StringRef name = F->getName();
  if (name.startswith("__atom") && name.find("_g") != StringRef::npos) {
    mConvertAtomics = false;
AMDGPUPeepholeOpt::setupBitInsert(Instruction *base,
                                  Instruction *&src,
                                  Constant *&mask,
                                  Constant *&shift)
    dbgs() << "Null pointer passed into function.\n";
  if (base->getOpcode() == Instruction::Shl) {
    shift = dyn_cast<Constant>(base->getOperand(1));
  } else if (base->getOpcode() == Instruction::And) {
    mask = dyn_cast<Constant>(base->getOperand(1));
    dbgs() << "Failed setup with no Shl or And instruction on base opcode!\n";
  // If the base is neither a Shl nor an And, we don't fit any of the
  // patterns above.
  src = dyn_cast<Instruction>(base->getOperand(0));
    dbgs() << "Failed setup since the base operand is not an instruction!\n";
  // If we find an 'and' operation, then we don't need to
  // find the next operation as we already know the
  // bits that are valid at this point.
  if (src->getOpcode() == Instruction::Shl && !shift) {
    shift = dyn_cast<Constant>(src->getOperand(1));
    src = dyn_cast<Instruction>(src->getOperand(0));
  } else if (src->getOpcode() == Instruction::And && !mask) {
    mask = dyn_cast<Constant>(src->getOperand(1));
  if (!mask && !shift) {
    dbgs() << "Failed setup since both mask and shift are NULL!\n";
    // Did not find a constant mask or a shift.

AMDGPUPeepholeOpt::optimizeBitInsert(Instruction *inst)
  if (!inst->isBinaryOp()) {
  if (inst->getOpcode() != Instruction::Or) {
  if (optLevel == CodeGenOpt::None) {
  // We want to do an optimization on a sequence of ops that in the end equals
  // a single ISA instruction.
  // The base pattern for this optimization is - ((A & B) << C) | ((D & E) << F)
  // Some simplified versions of this pattern are as follows:
  // (A & B) | (D & E) when B & E == 0 && C == 0 && F == 0
  // ((A & B) << C) | (D & E) when B ^ E == 0 && (1 << C) >= E
  // (A & B) | ((D & E) << F) when B ^ E == 0 && (1 << F) >= B
  // (A & B) | (D << F) when (1 << F) >= B
  // (A << C) | (D & E) when (1 << C) >= E
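  // Hedged worked example (illustration only, values not from the source):
  // with B = 0xFF, C = 8, E = 0xFF, F = 0, A = 0xCD and D = 0x34:
  //   ((0xCD & 0xFF) << 8) | (0x34 & 0xFF) = 0xCD00 | 0x0034 = 0xCD34,
  // i.e. A's low 8 bits inserted at bit offset 8 above the masked D,
  // which is exactly what a single ubit_insert can compute.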
  if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) {
    // The HD4XXX hardware doesn't support the ubit_insert instruction.
  Type *aType = inst->getType();
  bool isVector = aType->isVectorTy();

  // This optimization only works on 32bit integers.
  if (aType->getScalarType()
      != Type::getInt32Ty(inst->getContext())) {
  const VectorType *VT = dyn_cast<VectorType>(aType);
  numEle = VT->getNumElements();
  // We currently cannot support more than 4 elements in an intrinsic and we
  // cannot support Vec3 types.
  if (numEle > 4 || numEle == 3) {
  // TODO: Handle vectors.
  dbgs() << "!!! Vectors are not supported yet!\n";

  Instruction *LHSSrc = NULL, *RHSSrc = NULL;
  Constant *LHSMask = NULL, *RHSMask = NULL;
  Constant *LHSShift = NULL, *RHSShift = NULL;
  Instruction *LHS = dyn_cast<Instruction>(inst->getOperand(0));
  Instruction *RHS = dyn_cast<Instruction>(inst->getOperand(1));
  if (!setupBitInsert(LHS, LHSSrc, LHSMask, LHSShift)) {
    dbgs() << "Found an OR Operation that failed setup!\n";
    if (LHS) { LHS->dump(); }
    if (LHSSrc) { LHSSrc->dump(); }
    if (LHSMask) { LHSMask->dump(); }
    if (LHSShift) { LHSShift->dump(); }
    // There was an issue with the setup for BitInsert.
  if (!setupBitInsert(RHS, RHSSrc, RHSMask, RHSShift)) {
    dbgs() << "Found an OR Operation that failed setup!\n";
    if (RHS) { RHS->dump(); }
    if (RHSSrc) { RHSSrc->dump(); }
    if (RHSMask) { RHSMask->dump(); }
    if (RHSShift) { RHSShift->dump(); }
    // There was an issue with the setup for BitInsert.
  dbgs() << "Found an OR operation that can possibly be optimized to "
         << "ubit insert!\n";
  dbgs() << "Op: "; inst->dump();
  dbgs() << "LHS: "; if (LHS) { LHS->dump(); } else { dbgs() << "(None)\n"; }
  dbgs() << "LHS Src: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(None)\n"; }
  dbgs() << "LHS Mask: "; if (LHSMask) { LHSMask->dump(); } else { dbgs() << "(None)\n"; }
  dbgs() << "LHS Shift: "; if (LHSShift) { LHSShift->dump(); } else { dbgs() << "(None)\n"; }
  dbgs() << "RHS: "; if (RHS) { RHS->dump(); } else { dbgs() << "(None)\n"; }
  dbgs() << "RHS Src: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(None)\n"; }
  dbgs() << "RHS Mask: "; if (RHSMask) { RHSMask->dump(); } else { dbgs() << "(None)\n"; }
  dbgs() << "RHS Shift: "; if (RHSShift) { RHSShift->dump(); } else { dbgs() << "(None)\n"; }
  Constant *offset = NULL;
  Constant *width = NULL;
  int32_t lhsMaskVal = 0, rhsMaskVal = 0;
  int32_t lhsShiftVal = 0, rhsShiftVal = 0;
  int32_t lhsMaskWidth = 0, rhsMaskWidth = 0;
  int32_t lhsMaskOffset = 0, rhsMaskOffset = 0;
  lhsMaskVal = (int32_t)(LHSMask
      ? dyn_cast<ConstantInt>(LHSMask)->getZExtValue() : 0);
  rhsMaskVal = (int32_t)(RHSMask
      ? dyn_cast<ConstantInt>(RHSMask)->getZExtValue() : 0);
  lhsShiftVal = (int32_t)(LHSShift
      ? dyn_cast<ConstantInt>(LHSShift)->getZExtValue() : 0);
  rhsShiftVal = (int32_t)(RHSShift
      ? dyn_cast<ConstantInt>(RHSShift)->getZExtValue() : 0);
  lhsMaskWidth = lhsMaskVal ? CountPopulation_32(lhsMaskVal) : 32 - lhsShiftVal;
  rhsMaskWidth = rhsMaskVal ? CountPopulation_32(rhsMaskVal) : 32 - rhsShiftVal;
  lhsMaskOffset = lhsMaskVal ? CountTrailingZeros_32(lhsMaskVal) : lhsShiftVal;
  rhsMaskOffset = rhsMaskVal ? CountTrailingZeros_32(rhsMaskVal) : rhsShiftVal;
  // TODO: Handle the case of A & B | D & ~B (i.e. inverted masks).
  dbgs() << "Found pattern: \'((A" << (LHSMask ? " & B)" : ")");
  dbgs() << (LHSShift ? " << C)" : ")") << " | ((D";
  dbgs() << (RHSMask ? " & E)" : ")");
  dbgs() << (RHSShift ? " << F)\'\n" : ")\'\n");
  dbgs() << "A = LHSSrc\t\tD = RHSSrc \n";
  dbgs() << "B = " << lhsMaskVal << "\t\tE = " << rhsMaskVal << "\n";
  dbgs() << "C = " << lhsShiftVal << "\t\tF = " << rhsShiftVal << "\n";
  dbgs() << "width(B) = " << lhsMaskWidth;
  dbgs() << "\twidth(E) = " << rhsMaskWidth << "\n";
  dbgs() << "offset(B) = " << lhsMaskOffset;
  dbgs() << "\toffset(E) = " << rhsMaskOffset << "\n";
  dbgs() << "Constraints: \n";
  dbgs() << "\t(1) B ^ E == 0\n";
  dbgs() << "\t(2-LHS) B is a mask\n";
  dbgs() << "\t(2-RHS) E is a mask\n";
  dbgs() << "\t(3-LHS) (offset(B)) >= (width(E) + offset(E))\n";
  dbgs() << "\t(3-RHS) (offset(E)) >= (width(B) + offset(B))\n";
  if ((lhsMaskVal || rhsMaskVal) && !(lhsMaskVal ^ rhsMaskVal)) {
    dbgs() << lhsMaskVal << " ^ " << rhsMaskVal;
    dbgs() << " = " << (lhsMaskVal ^ rhsMaskVal) << "\n";
    dbgs() << "Failed constraint 1!\n";
  dbgs() << "LHS = " << lhsMaskOffset << "";
  dbgs() << " >= (" << rhsMaskWidth << " + " << rhsMaskOffset << ") = ";
  dbgs() << (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset));
  dbgs() << "\nRHS = " << rhsMaskOffset << "";
  dbgs() << " >= (" << lhsMaskWidth << " + " << lhsMaskOffset << ") = ";
  dbgs() << (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset));
  if (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset)) {
    offset = ConstantInt::get(aType, lhsMaskOffset, false);
    width = ConstantInt::get(aType, lhsMaskWidth, false);
    if (!isMask_32(lhsMaskVal) && !isShiftedMask_32(lhsMaskVal)) {
      dbgs() << "Value is not a Mask: " << lhsMaskVal << "\n";
      dbgs() << "Failed constraint 2!\n";
      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
    } else if (lhsShiftVal != lhsMaskOffset) {
      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
    dbgs() << "Optimizing LHS!\n";
  } else if (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset)) {
    offset = ConstantInt::get(aType, rhsMaskOffset, false);
    width = ConstantInt::get(aType, rhsMaskWidth, false);
    if (!isMask_32(rhsMaskVal) && !isShiftedMask_32(rhsMaskVal)) {
      dbgs() << "Non-Mask: " << rhsMaskVal << "\n";
      dbgs() << "Failed constraint 2!\n";
      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
    } else if (rhsShiftVal != rhsMaskOffset) {
      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
    dbgs() << "Optimizing RHS!\n";
    dbgs() << "Failed constraint 3!\n";
  dbgs() << "Width: "; if (width) { width->dump(); } else { dbgs() << "(0)\n"; }
  dbgs() << "Offset: "; if (offset) { offset->dump(); } else { dbgs() << "(0)\n"; }
  dbgs() << "LHSSrc: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(0)\n"; }
  dbgs() << "RHSSrc: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(0)\n"; }
  if (!offset || !width) {
    dbgs() << "Either width or offset is NULL, failed detection!\n";
  // Let's create the function signature.
  std::vector<Type *> callTypes;
  callTypes.push_back(aType);
  callTypes.push_back(aType);
  callTypes.push_back(aType);
  callTypes.push_back(aType);
  FunctionType *funcType = FunctionType::get(aType, callTypes, false);
  std::string name = "__amdil_ubit_insert";
  if (isVector) { name += "_v" + itostr(numEle) + "u32"; } else { name += "_u32"; }
  Function *Func =
      dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
          getOrInsertFunction(llvm::StringRef(name), funcType));
  Value *Operands[4] = {
  CallInst *CI = CallInst::Create(Func, Operands, "BitInsertOpt");
  dbgs() << "Old Inst: ";
  dbgs() << "New Inst: ";
  CI->insertBefore(inst);
  inst->replaceAllUsesWith(CI);
AMDGPUPeepholeOpt::optimizeBitExtract(Instruction *inst)
  if (!inst->isBinaryOp()) {
  if (inst->getOpcode() != Instruction::And) {
  if (optLevel == CodeGenOpt::None) {
  // We want to do some simple optimizations on Shift right/And patterns. The
  // basic optimization is to turn (A >> B) & C where A is a 32bit type, B is
  // a value smaller than 32 and C is a mask. If C is a constant value, then
  // the following transformation can occur. For signed integers, it turns
  // into the function call dst = __amdil_ibit_extract(log2(C), B, A). For
  // unsigned integers, it turns into the function call
  // dst = __amdil_ubit_extract(log2(C), B, A). The function
  // __amdil_[u|i]bit_extract can be found in Section 7.9 of the ATI IL spec
  // of the stream SDK for Evergreen hardware.
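  // Hedged worked example (illustration only): (A >> 8) & 0xFF selects the
  // 8-bit field starting at bit 8, so it maps to a ubit_extract with
  // width = CountTrailingOnes_32(0xFF) = 8 and offset = 8; for A = 0xCD34
  // both forms yield 0xCD.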
  if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) {
    // This does not work on HD4XXX hardware.
  Type *aType = inst->getType();
  bool isVector = aType->isVectorTy();

  // XXX Support vector types
  // This only works on 32bit integers.
  if (aType->getScalarType()
      != Type::getInt32Ty(inst->getContext())) {
  const VectorType *VT = dyn_cast<VectorType>(aType);
  numEle = VT->getNumElements();
  // We currently cannot support more than 4 elements in an intrinsic and we
  // cannot support Vec3 types.
  if (numEle > 4 || numEle == 3) {
  BinaryOperator *ShiftInst = dyn_cast<BinaryOperator>(inst->getOperand(0));
  // If the first operand is not a shift instruction, then we can return as it
  // doesn't match this pattern.
  if (!ShiftInst || !ShiftInst->isShift()) {
  // If we are a shift left, then we don't match this pattern.
  if (ShiftInst->getOpcode() == Instruction::Shl) {
= ShiftInst
->isArithmeticShift();
743 Constant
*AndMask
= dyn_cast
<Constant
>(inst
->getOperand(1));
744 Constant
*ShrVal
= dyn_cast
<Constant
>(ShiftInst
->getOperand(1));
745 // Lets make sure that the shift value and the and mask are constant integers.
746 if (!AndMask
|| !ShrVal
) {
749 Constant
*newMaskConst
;
750 Constant
*shiftValConst
;
752 // Handle the vector case
753 std::vector
<Constant
*> maskVals
;
754 std::vector
<Constant
*> shiftVals
;
755 ConstantVector
*AndMaskVec
= dyn_cast
<ConstantVector
>(AndMask
);
756 ConstantVector
*ShrValVec
= dyn_cast
<ConstantVector
>(ShrVal
);
757 Type
*scalarType
= AndMaskVec
->getType()->getScalarType();
758 assert(AndMaskVec
->getNumOperands() ==
759 ShrValVec
->getNumOperands() && "cannot have a "
760 "combination where the number of elements to a "
761 "shift and an and are different!");
762 for (size_t x
= 0, y
= AndMaskVec
->getNumOperands(); x
< y
; ++x
) {
763 ConstantInt
*AndCI
= dyn_cast
<ConstantInt
>(AndMaskVec
->getOperand(x
));
764 ConstantInt
*ShiftIC
= dyn_cast
<ConstantInt
>(ShrValVec
->getOperand(x
));
765 if (!AndCI
|| !ShiftIC
) {
768 uint32_t maskVal
= (uint32_t)AndCI
->getZExtValue();
769 if (!isMask_32(maskVal
)) {
772 maskVal
= (uint32_t)CountTrailingOnes_32(maskVal
);
773 uint32_t shiftVal
= (uint32_t)ShiftIC
->getZExtValue();
774 // If the mask or shiftval is greater than the bitcount, then break out.
775 if (maskVal
>= 32 || shiftVal
>= 32) {
778 // If the mask val is greater than the the number of original bits left
779 // then this optimization is invalid.
780 if (maskVal
> (32 - shiftVal
)) {
783 maskVals
.push_back(ConstantInt::get(scalarType
, maskVal
, isSigned
));
784 shiftVals
.push_back(ConstantInt::get(scalarType
, shiftVal
, isSigned
));
786 newMaskConst
= ConstantVector::get(maskVals
);
787 shiftValConst
= ConstantVector::get(shiftVals
);
  // Handle the scalar case.
  uint32_t maskVal = (uint32_t)dyn_cast<ConstantInt>(AndMask)->getZExtValue();
  // This must be a mask value where all lower bits are set to 1 and then any
  // bit higher is set to 0.
  if (!isMask_32(maskVal)) {
  maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
  // Count the number of bits set in the mask; this is the width of the
  // resulting bit set that is extracted from the source value.
  uint32_t shiftVal = (uint32_t)dyn_cast<ConstantInt>(ShrVal)->getZExtValue();
  // If the mask or shift val is greater than the bitcount, then break out.
  if (maskVal >= 32 || shiftVal >= 32) {
  // If the mask val is greater than the number of original bits left then
  // this optimization is invalid.
  if (maskVal > (32 - shiftVal)) {
  newMaskConst = ConstantInt::get(aType, maskVal, isSigned);
  shiftValConst = ConstantInt::get(aType, shiftVal, isSigned);
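  // Hedged worked example (illustration only): for (A >> 5) & 0x7F,
  // maskVal becomes CountTrailingOnes_32(0x7F) = 7 and shiftVal is 5;
  // both are < 32 and 7 <= (32 - 5), so a width-7, offset-5 extract is built.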
  // Let's create the function signature.
  std::vector<Type *> callTypes;
  callTypes.push_back(aType);
  callTypes.push_back(aType);
  callTypes.push_back(aType);
  FunctionType *funcType = FunctionType::get(aType, callTypes, false);
  std::string name = "llvm.AMDIL.bit.extract.u32";
  name += ".v" + itostr(numEle) + "i32";
  // Let's create the function.
  Function *Func =
      dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
          getOrInsertFunction(llvm::StringRef(name), funcType));
  Value *Operands[3] = {
    ShiftInst->getOperand(0),
  // Let's create the Call with the operands.
  CallInst *CI = CallInst::Create(Func, Operands, "ByteExtractOpt");
  CI->setDoesNotAccessMemory();
  CI->insertBefore(inst);
  inst->replaceAllUsesWith(CI);
AMDGPUPeepholeOpt::expandBFI(CallInst *CI)
  Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
  if (!LHS->getName().startswith("__amdil_bfi")) {
  Type *type = CI->getOperand(0)->getType();
  Constant *negOneConst = NULL;
  if (type->isVectorTy()) {
    std::vector<Constant *> negOneVals;
    negOneConst = ConstantInt::get(CI->getContext(),
        APInt(32, StringRef("-1"), 10));
    for (size_t x = 0,
         y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
      negOneVals.push_back(negOneConst);
    negOneConst = ConstantVector::get(negOneVals);
    negOneConst = ConstantInt::get(CI->getContext(),
        APInt(32, StringRef("-1"), 10));
  // __amdil_bfi => (A & B) | (~A & C)
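  // Hedged worked example (illustration only): with A = 0xF0F0, B = 0x1234,
  // C = 0x5678: (0xF0F0 & 0x1234) | (~0xF0F0 & 0x5678)
  //           = 0x1030 | 0x0608 = 0x1638,
  // selecting B's bits where A is 1 and C's bits where A is 0.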
  BinaryOperator *lhs =
      BinaryOperator::Create(Instruction::And, CI->getOperand(0),
          CI->getOperand(1), "bfi_and", CI);
  BinaryOperator *rhs =
      BinaryOperator::Create(Instruction::Xor, CI->getOperand(0), negOneConst,
  rhs = BinaryOperator::Create(Instruction::And, rhs, CI->getOperand(2),
  lhs = BinaryOperator::Create(Instruction::Or, lhs, rhs, "bfi_or", CI);
  CI->replaceAllUsesWith(lhs);
AMDGPUPeepholeOpt::expandBFM(CallInst *CI)
  Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
  if (!LHS->getName().startswith("__amdil_bfm")) {
  // __amdil_bfm => ((1 << (src0 & 0x1F)) - 1) << (src1 & 0x1f)
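  // Hedged worked example (illustration only): src0 = 4, src1 = 8 gives
  // ((1 << 4) - 1) << 8 = 0xF << 8 = 0x0F00, i.e. a 4-bit-wide mask
  // placed at bit offset 8.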
  Constant *newMaskConst = NULL;
  Constant *newShiftConst = NULL;
  Type *type = CI->getOperand(0)->getType();
  if (type->isVectorTy()) {
    std::vector<Constant *> newMaskVals, newShiftVals;
    newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
    newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
    for (size_t x = 0,
         y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
      newMaskVals.push_back(newMaskConst);
      newShiftVals.push_back(newShiftConst);
    newMaskConst = ConstantVector::get(newMaskVals);
    newShiftConst = ConstantVector::get(newShiftVals);
    newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
    newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
  BinaryOperator *lhs =
      BinaryOperator::Create(Instruction::And, CI->getOperand(0),
          newMaskConst, "bfm_mask", CI);
  lhs = BinaryOperator::Create(Instruction::Shl, newShiftConst,
  lhs = BinaryOperator::Create(Instruction::Sub, lhs,
      newShiftConst, "bfm_sub", CI);
  BinaryOperator *rhs =
      BinaryOperator::Create(Instruction::And, CI->getOperand(1),
          newMaskConst, "bfm_mask", CI);
  lhs = BinaryOperator::Create(Instruction::Shl, lhs, rhs, "bfm_shl", CI);
  CI->replaceAllUsesWith(lhs);
AMDGPUPeepholeOpt::instLevelOptimizations(BasicBlock::iterator *bbb)
  Instruction *inst = (*bbb);
  if (optimizeCallInst(bbb)) {
  if (optimizeBitExtract(inst)) {
  if (optimizeBitInsert(inst)) {
  if (correctMisalignedMemOp(inst)) {
AMDGPUPeepholeOpt::correctMisalignedMemOp(Instruction *inst)
  LoadInst *linst = dyn_cast<LoadInst>(inst);
  StoreInst *sinst = dyn_cast<StoreInst>(inst);
  Type *Ty = inst->getType();
    alignment = linst->getAlignment();
    Ty = inst->getType();
    alignment = sinst->getAlignment();
    Ty = sinst->getValueOperand()->getType();
  unsigned size = getTypeSize(Ty);
  if (size == alignment || size < alignment) {
  if (!Ty->isStructTy()) {
    linst->setAlignment(0);
    sinst->setAlignment(0);
AMDGPUPeepholeOpt::isSigned24BitOps(CallInst *CI)
  Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
  std::string namePrefix = LHS->getName().substr(0, 14);
  if (namePrefix != "__amdil_imad24" && namePrefix != "__amdil_imul24"
      && namePrefix != "__amdil_imul24_high") {
  if (mSTM->device()->usesHardware(AMDGPUDeviceInfo::Signed24BitOps)) {
AMDGPUPeepholeOpt::expandSigned24BitOps(CallInst *CI)
  assert(isSigned24BitOps(CI) && "Must be a "
         "signed 24 bit operation to call this function!");
  Value *LHS = CI->getOperand(CI->getNumOperands()-1);
  // On 7XX and 8XX we do not have signed 24bit, so we need to
  // expand it to the following:
  // imul24 turns into 32bit imul
  // imad24 turns into 32bit imad
  // imul24_high turns into 32bit imulhigh
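  // Illustrative sketch (the builtin spellings follow the checks below): a
  // call __amdil_imad24(a, b, c) is rebuilt as the 32-bit __amdil_imad,
  // while __amdil_imul24(a, b) simply becomes a plain 'mul i32' instruction.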
  if (LHS->getName().substr(0, 14) == "__amdil_imad24") {
    Type *aType = CI->getOperand(0)->getType();
    bool isVector = aType->isVectorTy();
    int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
    std::vector<Type *> callTypes;
    callTypes.push_back(CI->getOperand(0)->getType());
    callTypes.push_back(CI->getOperand(1)->getType());
    callTypes.push_back(CI->getOperand(2)->getType());
    FunctionType *funcType =
        FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
    std::string name = "__amdil_imad";
    name += "_v" + itostr(numEle) + "i32";
    Function *Func = dyn_cast<Function>(
        CI->getParent()->getParent()->getParent()->
        getOrInsertFunction(llvm::StringRef(name), funcType));
    Value *Operands[3] = {
    CallInst *nCI = CallInst::Create(Func, Operands, "imad24");
    nCI->insertBefore(CI);
    CI->replaceAllUsesWith(nCI);
  } else if (LHS->getName().substr(0, 14) == "__amdil_imul24") {
    BinaryOperator *mulOp =
        BinaryOperator::Create(Instruction::Mul, CI->getOperand(0),
            CI->getOperand(1), "imul24", CI);
    CI->replaceAllUsesWith(mulOp);
  } else if (LHS->getName().substr(0, 19) == "__amdil_imul24_high") {
    Type *aType = CI->getOperand(0)->getType();
    bool isVector = aType->isVectorTy();
    int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
    std::vector<Type *> callTypes;
    callTypes.push_back(CI->getOperand(0)->getType());
    callTypes.push_back(CI->getOperand(1)->getType());
    FunctionType *funcType =
        FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
    std::string name = "__amdil_imul_high";
    name += "_v" + itostr(numEle) + "i32";
    Function *Func = dyn_cast<Function>(
        CI->getParent()->getParent()->getParent()->
        getOrInsertFunction(llvm::StringRef(name), funcType));
    Value *Operands[2] = {
    CallInst *nCI = CallInst::Create(Func, Operands, "imul24_high");
    nCI->insertBefore(CI);
    CI->replaceAllUsesWith(nCI);
AMDGPUPeepholeOpt::isRWGLocalOpt(CallInst *CI)
      && CI->getOperand(CI->getNumOperands() - 1)->getName()
         == "__amdil_get_local_size_int");
AMDGPUPeepholeOpt::convertAccurateDivide(CallInst *CI)
  if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD6XXX
      && (mSTM->getDeviceName() == "cayman")) {
  return CI->getOperand(CI->getNumOperands() - 1)->getName().substr(0, 20)
         == "__amdil_improved_div";
AMDGPUPeepholeOpt::expandAccurateDivide(CallInst *CI)
  assert(convertAccurateDivide(CI)
         && "expanding accurate divide can only happen if it is expandable!");
  BinaryOperator *divOp =
      BinaryOperator::Create(Instruction::FDiv, CI->getOperand(0),
          CI->getOperand(1), "fdiv32", CI);
  CI->replaceAllUsesWith(divOp);
AMDGPUPeepholeOpt::propagateSamplerInst(CallInst *CI)
  if (optLevel != CodeGenOpt::None) {
  unsigned funcNameIdx = 0;
  funcNameIdx = CI->getNumOperands() - 1;
  StringRef calleeName = CI->getOperand(funcNameIdx)->getName();
  if (calleeName != "__amdil_image2d_read_norm"
      && calleeName != "__amdil_image2d_read_unnorm"
      && calleeName != "__amdil_image3d_read_norm"
      && calleeName != "__amdil_image3d_read_unnorm") {
  unsigned samplerIdx = 2;
  Value *sampler = CI->getOperand(samplerIdx);
  LoadInst *lInst = dyn_cast<LoadInst>(sampler);
  if (lInst->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
  GlobalVariable *gv = dyn_cast<GlobalVariable>(lInst->getPointerOperand());
  // If we are loading from what is not a global value, then we
  // If we don't have an initializer, or the initializer is not a
  // 32bit integer, we fail.
  if (!gv->hasInitializer()
      || !gv->getInitializer()->getType()->isIntegerTy(32)) {
  // Now that we have the global variable initializer, let's replace
  // all uses of the load instruction with the samplerVal and
  // reparse the __amdil_is_constant() function.
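  // Illustrative sketch (assumed OpenCL usage, not from this file): a kernel
  // that declares 'constant sampler_t s = CLK_ADDRESS_CLAMP | ...' lowers to
  // a global with an i32 initializer; the load of s feeding the image read
  // is replaced by that constant so the sampler is known at compile time.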
  Constant *samplerVal = gv->getInitializer();
  lInst->replaceAllUsesWith(samplerVal);
AMDGPUPeepholeOpt::doInitialization(Module &M)

AMDGPUPeepholeOpt::doFinalization(Module &M)

AMDGPUPeepholeOpt::getAnalysisUsage(AnalysisUsage &AU) const
  AU.addRequired<MachineFunctionAnalysis>();
  FunctionPass::getAnalysisUsage(AU);
  AU.setPreservesAll();
size_t AMDGPUPeepholeOpt::getTypeSize(Type * const T, bool dereferencePtr) {
  switch (T->getTypeID()) {
  case Type::X86_FP80TyID:
  case Type::FP128TyID:
  case Type::PPC_FP128TyID:
  case Type::LabelTyID:
    assert(0 && "These types are not supported by this backend");
  case Type::FloatTyID:
  case Type::DoubleTyID:
    size = T->getPrimitiveSizeInBits() >> 3;
  case Type::PointerTyID:
    size = getTypeSize(dyn_cast<PointerType>(T), dereferencePtr);
  case Type::IntegerTyID:
    size = getTypeSize(dyn_cast<IntegerType>(T), dereferencePtr);
  case Type::StructTyID:
    size = getTypeSize(dyn_cast<StructType>(T), dereferencePtr);
  case Type::ArrayTyID:
    size = getTypeSize(dyn_cast<ArrayType>(T), dereferencePtr);
  case Type::FunctionTyID:
    size = getTypeSize(dyn_cast<FunctionType>(T), dereferencePtr);
  case Type::VectorTyID:
    size = getTypeSize(dyn_cast<VectorType>(T), dereferencePtr);
size_t AMDGPUPeepholeOpt::getTypeSize(StructType * const ST,
                                      bool dereferencePtr) {
  StructType::element_iterator eib;
  StructType::element_iterator eie;
  for (eib = ST->element_begin(), eie = ST->element_end(); eib != eie; ++eib) {
    size += getTypeSize(curType, dereferencePtr);

size_t AMDGPUPeepholeOpt::getTypeSize(IntegerType * const IT,
                                      bool dereferencePtr) {
  return IT ? (IT->getBitWidth() >> 3) : 0;

size_t AMDGPUPeepholeOpt::getTypeSize(FunctionType * const FT,
                                      bool dereferencePtr) {
  assert(0 && "Should not be able to calculate the size of a function type");

size_t AMDGPUPeepholeOpt::getTypeSize(ArrayType * const AT,
                                      bool dereferencePtr) {
  return (size_t)(AT ? (getTypeSize(AT->getElementType(),
                                    dereferencePtr) * AT->getNumElements())

size_t AMDGPUPeepholeOpt::getTypeSize(VectorType * const VT,
                                      bool dereferencePtr) {
  return VT ? (VT->getBitWidth() >> 3) : 0;
size_t AMDGPUPeepholeOpt::getTypeSize(PointerType * const PT,
                                      bool dereferencePtr) {
  Type *CT = PT->getElementType();
  if (CT->getTypeID() == Type::StructTyID &&
      PT->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
    return getTypeSize(dyn_cast<StructType>(CT));
  } else if (dereferencePtr) {
    for (size_t x = 0, y = PT->getNumContainedTypes(); x < y; ++x) {
      size += getTypeSize(PT->getContainedType(x), dereferencePtr);

size_t AMDGPUPeepholeOpt::getTypeSize(OpaqueType * const OT,
                                      bool dereferencePtr) {
  //assert(0 && "Should not be able to calculate the size of an opaque type");