1 //===-- AMDILPeepholeOptimizer.cpp - AMDIL Peephole optimizations ---------===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //==-----------------------------------------------------------------------===//
10 #include "AMDILAlgorithms.tpp"
11 #include "AMDILDevices.h"
12 #include "AMDGPUInstrInfo.h"
13 #include "llvm/ADT/Statistic.h"
14 #include "llvm/ADT/StringExtras.h"
15 #include "llvm/ADT/StringRef.h"
16 #include "llvm/ADT/Twine.h"
17 #include "llvm/Constants.h"
18 #include "llvm/CodeGen/MachineFunction.h"
19 #include "llvm/CodeGen/MachineFunctionAnalysis.h"
20 #include "llvm/Function.h"
21 #include "llvm/Instructions.h"
22 #include "llvm/Module.h"
23 #include "llvm/Support/Debug.h"
24 #include "llvm/Support/MathExtras.h"
29 STATISTIC(PointerAssignments
, "Number of dynamic pointer "
30 "assigments discovered");
31 STATISTIC(PointerSubtract
, "Number of pointer subtractions discovered");
35 // The Peephole optimization pass is used to do simple last minute optimizations
36 // that are required for correct code or to remove redundant functions
41 class LLVM_LIBRARY_VISIBILITY AMDILPeepholeOpt
: public FunctionPass
{
45 AMDILPeepholeOpt(TargetMachine
&tm AMDIL_OPT_LEVEL_DECL
);
47 const char *getPassName() const;
48 bool runOnFunction(Function
&F
);
49 bool doInitialization(Module
&M
);
50 bool doFinalization(Module
&M
);
51 void getAnalysisUsage(AnalysisUsage
&AU
) const;
54 // Function to initiate all of the instruction level optimizations.
55 bool instLevelOptimizations(BasicBlock::iterator
*inst
);
56 // Quick check to see if we need to dump all of the pointers into the
57 // arena. If this is correct, then we set all pointers to exist in arena. This
58 // is a workaround for aliasing of pointers in a struct/union.
59 bool dumpAllIntoArena(Function
&F
);
60 // To avoid invalidating any pointers while inside the safeNestedForEach
61 // traversal, atomic conversions are pushed onto a vector and handled
62 // later. This function does the conversions if required.
63 void doAtomicConversionIfNeeded(Function
&F
);
64 // Because __amdil_is_constant cannot be properly evaluated if
65 // optimizations are disabled, the calls are placed in a vector
66 // and evaluated after the __amdil_image* functions are evaluated
67 // which should allow the __amdil_is_constant function to be
68 // evaluated correctly.
69 void doIsConstCallConversionIfNeeded();
73 CodeGenOpt::Level optLevel
;
74 // Run a series of tests to see if we can optimize a CALL instruction.
75 bool optimizeCallInst(BasicBlock::iterator
*bbb
);
76 // A peephole optimization to optimize bit extract sequences.
77 bool optimizeBitExtract(Instruction
*inst
);
78 // A peephole optimization to optimize bit insert sequences.
79 bool optimizeBitInsert(Instruction
*inst
);
80 bool setupBitInsert(Instruction
*base
,
84 // Expand the bit field insert instruction on versions of OpenCL that
86 bool expandBFI(CallInst
*CI
);
87 // Expand the bit field mask instruction on versions of OpenCL that
89 bool expandBFM(CallInst
*CI
);
90 // On 7XX and 8XX operations, we do not have 24 bit signed operations. So in
91 // this case we need to expand them. These functions check for 24bit functions
93 bool isSigned24BitOps(CallInst
*CI
);
94 void expandSigned24BitOps(CallInst
*CI
);
95 // One optimization that can occur is that if the required workgroup size is
96 // specified then the result of get_local_size is known at compile time and
97 // can be returned accordingly.
98 bool isRWGLocalOpt(CallInst
*CI
);
99 // On northern island cards, the division is slightly less accurate than on
100 // previous generations, so we need to utilize a more accurate division. On
101 // all other cards we can translate the accurate divide to a normal divide.
102 bool convertAccurateDivide(CallInst
*CI
);
103 void expandAccurateDivide(CallInst
*CI
);
104 // If the alignment is set incorrectly, it can produce really inefficient
105 // code. This checks for this scenario and fixes it if possible.
106 bool correctMisalignedMemOp(Instruction
*inst
);
108 // If we are in no opt mode, then we need to make sure that
109 // local samplers are properly propagated as constant propagation
110 // doesn't occur and we need to know the value of kernel defined
111 // samplers at compile time.
112 bool propagateSamplerInst(CallInst
*CI
);
116 // Group of functions that recursively calculate the size of a structure based
117 // on its sub-types.
118 size_t getTypeSize(Type
* const T
, bool dereferencePtr
= false);
119 size_t getTypeSize(StructType
* const ST
, bool dereferencePtr
= false);
120 size_t getTypeSize(IntegerType
* const IT
, bool dereferencePtr
= false);
121 size_t getTypeSize(FunctionType
* const FT
,bool dereferencePtr
= false);
122 size_t getTypeSize(ArrayType
* const AT
, bool dereferencePtr
= false);
123 size_t getTypeSize(VectorType
* const VT
, bool dereferencePtr
= false);
124 size_t getTypeSize(PointerType
* const PT
, bool dereferencePtr
= false);
125 size_t getTypeSize(OpaqueType
* const OT
, bool dereferencePtr
= false);
129 const AMDILSubtarget
*mSTM
;
130 SmallVector
< std::pair
<CallInst
*, Function
*>, 16> atomicFuncs
;
131 SmallVector
<CallInst
*, 16> isConstVec
;
132 }; // class AMDILPeepholeOpt
133 char AMDILPeepholeOpt::ID
= 0;
134 } // anonymous namespace
138 createAMDILPeepholeOpt(TargetMachine
&tm AMDIL_OPT_LEVEL_DECL
)
140 return new AMDILPeepholeOpt(tm AMDIL_OPT_LEVEL_VAR
);
144 AMDILPeepholeOpt::AMDILPeepholeOpt(TargetMachine
&tm AMDIL_OPT_LEVEL_DECL
)
145 : FunctionPass(ID
), TM(tm
)
148 optLevel
= TM
.getOptLevel();
152 AMDILPeepholeOpt::~AMDILPeepholeOpt()
157 AMDILPeepholeOpt::getPassName() const
159 return "AMDIL PeepHole Optimization Pass";
163 containsPointerType(Type
*Ty
)
168 switch(Ty
->getTypeID()) {
171 case Type::StructTyID
: {
172 const StructType
*ST
= dyn_cast
<StructType
>(Ty
);
173 for (StructType::element_iterator stb
= ST
->element_begin(),
174 ste
= ST
->element_end(); stb
!= ste
; ++stb
) {
175 if (!containsPointerType(*stb
)) {
182 case Type::VectorTyID
:
183 case Type::ArrayTyID
:
184 return containsPointerType(dyn_cast
<SequentialType
>(Ty
)->getElementType());
185 case Type::PointerTyID
:
192 AMDILPeepholeOpt::dumpAllIntoArena(Function
&F
)
194 bool dumpAll
= false;
195 for (Function::const_arg_iterator cab
= F
.arg_begin(),
196 cae
= F
.arg_end(); cab
!= cae
; ++cab
) {
197 const Argument
*arg
= cab
;
198 const PointerType
*PT
= dyn_cast
<PointerType
>(arg
->getType());
202 Type
*DereferencedType
= PT
->getElementType();
203 if (!dyn_cast
<StructType
>(DereferencedType
)
207 if (!containsPointerType(DereferencedType
)) {
210 // FIXME: Because a pointer inside of a struct/union may be aliased to
211 // another pointer we need to take the conservative approach and place all
212 // pointers into the arena until more advanced detection is implemented.
218 AMDILPeepholeOpt::doIsConstCallConversionIfNeeded()
220 if (isConstVec
.empty()) {
223 for (unsigned x
= 0, y
= isConstVec
.size(); x
< y
; ++x
) {
224 CallInst
*CI
= isConstVec
[x
];
225 Constant
*CV
= dyn_cast
<Constant
>(CI
->getOperand(0));
226 Type
*aType
= Type::getInt32Ty(*mCTX
);
227 Value
*Val
= (CV
!= NULL
) ? ConstantInt::get(aType
, 1)
228 : ConstantInt::get(aType
, 0);
229 CI
->replaceAllUsesWith(Val
);
230 CI
->eraseFromParent();
235 AMDILPeepholeOpt::doAtomicConversionIfNeeded(Function
&F
)
237 // Don't do anything if we don't have any atomic operations.
238 if (atomicFuncs
.empty()) {
241 // Change the function name for the atomic if it is required
242 uint32_t size
= atomicFuncs
.size();
243 for (uint32_t x
= 0; x
< size
; ++x
) {
244 atomicFuncs
[x
].first
->setOperand(
245 atomicFuncs
[x
].first
->getNumOperands()-1,
246 atomicFuncs
[x
].second
);
250 if (mConvertAtomics
) {
256 AMDILPeepholeOpt::runOnFunction(Function
&MF
)
260 mSTM
= &TM
.getSubtarget
<AMDILSubtarget
>();
264 mCTX
= &MF
.getType()->getContext();
265 mConvertAtomics
= true;
266 safeNestedForEach(MF
.begin(), MF
.end(), MF
.begin()->begin(),
267 std::bind1st(std::mem_fun(&AMDILPeepholeOpt::instLevelOptimizations
),
270 doAtomicConversionIfNeeded(MF
);
271 doIsConstCallConversionIfNeeded();
280 AMDILPeepholeOpt::optimizeCallInst(BasicBlock::iterator
*bbb
)
282 Instruction
*inst
= (*bbb
);
283 CallInst
*CI
= dyn_cast
<CallInst
>(inst
);
287 if (isSigned24BitOps(CI
)) {
288 expandSigned24BitOps(CI
);
290 CI
->eraseFromParent();
293 if (propagateSamplerInst(CI
)) {
296 if (expandBFI(CI
) || expandBFM(CI
)) {
298 CI
->eraseFromParent();
301 if (convertAccurateDivide(CI
)) {
302 expandAccurateDivide(CI
);
304 CI
->eraseFromParent();
308 StringRef calleeName
= CI
->getOperand(CI
->getNumOperands()-1)->getName();
309 if (calleeName
.startswith("__amdil_is_constant")) {
310 // If we do not have optimizations, then this
311 // cannot be properly evaluated, so we add the
312 // call instruction to a vector and process
313 // them at the end of processing after the
314 // samplers have been correctly handled.
315 if (optLevel
== CodeGenOpt::None
) {
316 isConstVec
.push_back(CI
);
319 Constant
*CV
= dyn_cast
<Constant
>(CI
->getOperand(0));
320 Type
*aType
= Type::getInt32Ty(*mCTX
);
321 Value
*Val
= (CV
!= NULL
) ? ConstantInt::get(aType
, 1)
322 : ConstantInt::get(aType
, 0);
323 CI
->replaceAllUsesWith(Val
);
325 CI
->eraseFromParent();
330 if (calleeName
.equals("__amdil_is_asic_id_i32")) {
331 ConstantInt
*CV
= dyn_cast
<ConstantInt
>(CI
->getOperand(0));
332 Type
*aType
= Type::getInt32Ty(*mCTX
);
335 Val
= ConstantInt::get(aType
,
336 mSTM
->device()->getDeviceFlag() & CV
->getZExtValue());
338 Val
= ConstantInt::get(aType
, 0);
340 CI
->replaceAllUsesWith(Val
);
342 CI
->eraseFromParent();
345 Function
*F
= dyn_cast
<Function
>(CI
->getOperand(CI
->getNumOperands()-1));
349 if (F
->getName().startswith("__atom") && !CI
->getNumUses()
350 && F
->getName().find("_xchg") == StringRef::npos
) {
351 std::string
buffer(F
->getName().str() + "_noret");
352 F
= dyn_cast
<Function
>(
353 F
->getParent()->getOrInsertFunction(buffer
, F
->getFunctionType()));
354 atomicFuncs
.push_back(std::make_pair
<CallInst
*, Function
*>(CI
, F
));
357 if (!mSTM
->device()->isSupported(AMDILDeviceInfo::ArenaSegment
)
358 && !mSTM
->device()->isSupported(AMDILDeviceInfo::MultiUAV
)) {
361 if (!mConvertAtomics
) {
364 StringRef name
= F
->getName();
365 if (name
.startswith("__atom") && name
.find("_g") != StringRef::npos
) {
366 mConvertAtomics
= false;
372 AMDILPeepholeOpt::setupBitInsert(Instruction
*base
,
379 dbgs() << "Null pointer passed into function.\n";
384 if (base
->getOpcode() == Instruction::Shl
) {
385 shift
= dyn_cast
<Constant
>(base
->getOperand(1));
386 } else if (base
->getOpcode() == Instruction::And
) {
387 mask
= dyn_cast
<Constant
>(base
->getOperand(1));
391 dbgs() << "Failed setup with no Shl or And instruction on base opcode!\n";
393 // If the base is neither a Shl nor an And, we don't fit any of the patterns above.
396 src
= dyn_cast
<Instruction
>(base
->getOperand(0));
399 dbgs() << "Failed setup since the base operand is not an instruction!\n";
403 // If we find an 'and' operation, then we don't need to
404 // find the next operation as we already know the
405 // bits that are valid at this point.
409 if (src
->getOpcode() == Instruction::Shl
&& !shift
) {
410 shift
= dyn_cast
<Constant
>(src
->getOperand(1));
411 src
= dyn_cast
<Instruction
>(src
->getOperand(0));
412 } else if (src
->getOpcode() == Instruction::And
&& !mask
) {
413 mask
= dyn_cast
<Constant
>(src
->getOperand(1));
415 if (!mask
&& !shift
) {
417 dbgs() << "Failed setup since both mask and shift are NULL!\n";
419 // Did not find a constant mask or a shift.
425 AMDILPeepholeOpt::optimizeBitInsert(Instruction
*inst
)
430 if (!inst
->isBinaryOp()) {
433 if (inst
->getOpcode() != Instruction::Or
) {
436 if (optLevel
== CodeGenOpt::None
) {
439 // We want to do an optimization on a sequence of ops that in the end equals a
440 // single ISA instruction.
441 // The base pattern for this optimization is - ((A & B) << C) | ((D & E) << F)
442 // Some simplified versions of this pattern are as follows:
443 // (A & B) | (D & E) when B & E == 0 && C == 0 && F == 0
444 // ((A & B) << C) | (D & E) when B ^ E == 0 && (1 << C) >= E
445 // (A & B) | ((D & E) << F) when B ^ E == 0 && (1 << F) >= B
446 // (A & B) | (D << F) when (1 << F) >= B
447 // (A << C) | (D & E) when (1 << C) >= E
448 if (mSTM
->device()->getGeneration() == AMDILDeviceInfo::HD4XXX
) {
449 // The HD4XXX hardware doesn't support the ubit_insert instruction.
452 Type
*aType
= inst
->getType();
453 bool isVector
= aType
->isVectorTy();
455 // This optimization only works on 32bit integers.
456 if (aType
->getScalarType()
457 != Type::getInt32Ty(inst
->getContext())) {
461 const VectorType
*VT
= dyn_cast
<VectorType
>(aType
);
462 numEle
= VT
->getNumElements();
463 // We currently cannot support more than 4 elements in an intrinsic and we
464 // cannot support Vec3 types.
465 if (numEle
> 4 || numEle
== 3) {
469 // TODO: Handle vectors.
472 dbgs() << "!!! Vectors are not supported yet!\n";
476 Instruction
*LHSSrc
= NULL
, *RHSSrc
= NULL
;
477 Constant
*LHSMask
= NULL
, *RHSMask
= NULL
;
478 Constant
*LHSShift
= NULL
, *RHSShift
= NULL
;
479 Instruction
*LHS
= dyn_cast
<Instruction
>(inst
->getOperand(0));
480 Instruction
*RHS
= dyn_cast
<Instruction
>(inst
->getOperand(1));
481 if (!setupBitInsert(LHS
, LHSSrc
, LHSMask
, LHSShift
)) {
483 dbgs() << "Found an OR Operation that failed setup!\n";
485 if (LHS
) { LHS
->dump(); }
486 if (LHSSrc
) { LHSSrc
->dump(); }
487 if (LHSMask
) { LHSMask
->dump(); }
488 if (LHSShift
) { LHSShift
->dump(); }
490 // There was an issue with the setup for BitInsert.
493 if (!setupBitInsert(RHS
, RHSSrc
, RHSMask
, RHSShift
)) {
495 dbgs() << "Found an OR Operation that failed setup!\n";
497 if (RHS
) { RHS
->dump(); }
498 if (RHSSrc
) { RHSSrc
->dump(); }
499 if (RHSMask
) { RHSMask
->dump(); }
500 if (RHSShift
) { RHSShift
->dump(); }
502 // There was an issue with the setup for BitInsert.
506 dbgs() << "Found an OR operation that can possible be optimized to ubit insert!\n";
507 dbgs() << "Op: "; inst
->dump();
508 dbgs() << "LHS: "; if (LHS
) { LHS
->dump(); } else { dbgs() << "(None)\n"; }
509 dbgs() << "LHS Src: "; if (LHSSrc
) { LHSSrc
->dump(); } else { dbgs() << "(None)\n"; }
510 dbgs() << "LHS Mask: "; if (LHSMask
) { LHSMask
->dump(); } else { dbgs() << "(None)\n"; }
511 dbgs() << "LHS Shift: "; if (LHSShift
) { LHSShift
->dump(); } else { dbgs() << "(None)\n"; }
512 dbgs() << "RHS: "; if (RHS
) { RHS
->dump(); } else { dbgs() << "(None)\n"; }
513 dbgs() << "RHS Src: "; if (RHSSrc
) { RHSSrc
->dump(); } else { dbgs() << "(None)\n"; }
514 dbgs() << "RHS Mask: "; if (RHSMask
) { RHSMask
->dump(); } else { dbgs() << "(None)\n"; }
515 dbgs() << "RHS Shift: "; if (RHSShift
) { RHSShift
->dump(); } else { dbgs() << "(None)\n"; }
517 Constant
*offset
= NULL
;
518 Constant
*width
= NULL
;
519 int32_t lhsMaskVal
= 0, rhsMaskVal
= 0;
520 int32_t lhsShiftVal
= 0, rhsShiftVal
= 0;
521 int32_t lhsMaskWidth
= 0, rhsMaskWidth
= 0;
522 int32_t lhsMaskOffset
= 0, rhsMaskOffset
= 0;
523 lhsMaskVal
= (int32_t)(LHSMask
524 ? dyn_cast
<ConstantInt
>(LHSMask
)->getZExtValue() : 0);
525 rhsMaskVal
= (int32_t)(RHSMask
526 ? dyn_cast
<ConstantInt
>(RHSMask
)->getZExtValue() : 0);
527 lhsShiftVal
= (int32_t)(LHSShift
528 ? dyn_cast
<ConstantInt
>(LHSShift
)->getZExtValue() : 0);
529 rhsShiftVal
= (int32_t)(RHSShift
530 ? dyn_cast
<ConstantInt
>(RHSShift
)->getZExtValue() : 0);
531 lhsMaskWidth
= lhsMaskVal
? CountPopulation_32(lhsMaskVal
) : 32 - lhsShiftVal
;
532 rhsMaskWidth
= rhsMaskVal
? CountPopulation_32(rhsMaskVal
) : 32 - rhsShiftVal
;
533 lhsMaskOffset
= lhsMaskVal
? CountTrailingZeros_32(lhsMaskVal
) : lhsShiftVal
;
534 rhsMaskOffset
= rhsMaskVal
? CountTrailingZeros_32(rhsMaskVal
) : rhsShiftVal
;
535 // TODO: Handle the case of (A & B) | (D & ~B) (i.e. inverted masks).
537 dbgs() << "Found pattern: \'((A" << (LHSMask
? " & B)" : ")");
538 dbgs() << (LHSShift
? " << C)" : ")") << " | ((D" ;
539 dbgs() << (RHSMask
? " & E)" : ")");
540 dbgs() << (RHSShift
? " << F)\'\n" : ")\'\n");
541 dbgs() << "A = LHSSrc\t\tD = RHSSrc \n";
542 dbgs() << "B = " << lhsMaskVal
<< "\t\tE = " << rhsMaskVal
<< "\n";
543 dbgs() << "C = " << lhsShiftVal
<< "\t\tF = " << rhsShiftVal
<< "\n";
544 dbgs() << "width(B) = " << lhsMaskWidth
;
545 dbgs() << "\twidth(E) = " << rhsMaskWidth
<< "\n";
546 dbgs() << "offset(B) = " << lhsMaskOffset
;
547 dbgs() << "\toffset(E) = " << rhsMaskOffset
<< "\n";
548 dbgs() << "Constraints: \n";
549 dbgs() << "\t(1) B ^ E == 0\n";
550 dbgs() << "\t(2-LHS) B is a mask\n";
551 dbgs() << "\t(2-LHS) E is a mask\n";
552 dbgs() << "\t(3-LHS) (offset(B)) >= (width(E) + offset(E))\n";
553 dbgs() << "\t(3-RHS) (offset(E)) >= (width(B) + offset(B))\n";
555 if ((lhsMaskVal
|| rhsMaskVal
) && !(lhsMaskVal
^ rhsMaskVal
)) {
557 dbgs() << lhsMaskVal
<< " ^ " << rhsMaskVal
;
558 dbgs() << " = " << (lhsMaskVal
^ rhsMaskVal
) << "\n";
559 dbgs() << "Failed constraint 1!\n";
564 dbgs() << "LHS = " << lhsMaskOffset
<< "";
565 dbgs() << " >= (" << rhsMaskWidth
<< " + " << rhsMaskOffset
<< ") = ";
566 dbgs() << (lhsMaskOffset
>= (rhsMaskWidth
+ rhsMaskOffset
));
567 dbgs() << "\nRHS = " << rhsMaskOffset
<< "";
568 dbgs() << " >= (" << lhsMaskWidth
<< " + " << lhsMaskOffset
<< ") = ";
569 dbgs() << (rhsMaskOffset
>= (lhsMaskWidth
+ lhsMaskOffset
));
572 if (lhsMaskOffset
>= (rhsMaskWidth
+ rhsMaskOffset
)) {
573 offset
= ConstantInt::get(aType
, lhsMaskOffset
, false);
574 width
= ConstantInt::get(aType
, lhsMaskWidth
, false);
576 if (!isMask_32(lhsMaskVal
) && !isShiftedMask_32(lhsMaskVal
)) {
578 dbgs() << "Value is not a Mask: " << lhsMaskVal
<< "\n";
579 dbgs() << "Failed constraint 2!\n";
584 LHSSrc
= BinaryOperator::Create(Instruction::LShr
, LHSSrc
, offset
,
586 } else if (lhsShiftVal
!= lhsMaskOffset
) {
587 LHSSrc
= BinaryOperator::Create(Instruction::LShr
, LHSSrc
, offset
,
591 dbgs() << "Optimizing LHS!\n";
593 } else if (rhsMaskOffset
>= (lhsMaskWidth
+ lhsMaskOffset
)) {
594 offset
= ConstantInt::get(aType
, rhsMaskOffset
, false);
595 width
= ConstantInt::get(aType
, rhsMaskWidth
, false);
598 if (!isMask_32(rhsMaskVal
) && !isShiftedMask_32(rhsMaskVal
)) {
600 dbgs() << "Non-Mask: " << rhsMaskVal
<< "\n";
601 dbgs() << "Failed constraint 2!\n";
606 LHSSrc
= BinaryOperator::Create(Instruction::LShr
, LHSSrc
, offset
,
608 } else if (rhsShiftVal
!= rhsMaskOffset
) {
609 LHSSrc
= BinaryOperator::Create(Instruction::LShr
, LHSSrc
, offset
,
613 dbgs() << "Optimizing RHS!\n";
617 dbgs() << "Failed constraint 3!\n";
622 dbgs() << "Width: "; if (width
) { width
->dump(); } else { dbgs() << "(0)\n"; }
623 dbgs() << "Offset: "; if (offset
) { offset
->dump(); } else { dbgs() << "(0)\n"; }
624 dbgs() << "LHSSrc: "; if (LHSSrc
) { LHSSrc
->dump(); } else { dbgs() << "(0)\n"; }
625 dbgs() << "RHSSrc: "; if (RHSSrc
) { RHSSrc
->dump(); } else { dbgs() << "(0)\n"; }
627 if (!offset
|| !width
) {
629 dbgs() << "Either width or offset are NULL, failed detection!\n";
633 // Let's create the function signature.
634 std::vector
<Type
*> callTypes
;
635 callTypes
.push_back(aType
);
636 callTypes
.push_back(aType
);
637 callTypes
.push_back(aType
);
638 callTypes
.push_back(aType
);
639 FunctionType
*funcType
= FunctionType::get(aType
, callTypes
, false);
640 std::string name
= "__amdil_ubit_insert";
641 if (isVector
) { name
+= "_v" + itostr(numEle
) + "u32"; } else { name
+= "_u32"; }
643 dyn_cast
<Function
>(inst
->getParent()->getParent()->getParent()->
644 getOrInsertFunction(llvm::StringRef(name
), funcType
));
645 Value
*Operands
[4] = {
651 CallInst
*CI
= CallInst::Create(Func
, Operands
, "BitInsertOpt");
653 dbgs() << "Old Inst: ";
655 dbgs() << "New Inst: ";
659 CI
->insertBefore(inst
);
660 inst
->replaceAllUsesWith(CI
);
665 AMDILPeepholeOpt::optimizeBitExtract(Instruction
*inst
)
670 if (!inst
->isBinaryOp()) {
673 if (inst
->getOpcode() != Instruction::And
) {
676 if (optLevel
== CodeGenOpt::None
) {
679 // We want to do some simple optimizations on Shift right/And patterns. The
680 // basic optimization is to turn (A >> B) & C where A is a 32bit type, B is a
681 // value smaller than 32 and C is a mask. If C is a constant value, then the
682 // following transformation can occur. For signed integers, it turns into the
683 // function call dst = __amdil_ibit_extract(log2(C), B, A) For unsigned
684 // integers, it turns into the function call dst =
685 // __amdil_ubit_extract(log2(C), B, A) The function __amdil_[u|i]bit_extract
686 // can be found in Section 7.9 of the ATI IL spec of the stream SDK for
687 // Evergreen hardware.
688 if (mSTM
->device()->getGeneration() == AMDILDeviceInfo::HD4XXX
) {
689 // This does not work on HD4XXX hardware.
692 Type
*aType
= inst
->getType();
693 bool isVector
= aType
->isVectorTy();
695 // XXX Support vector types
700 // This only works on 32bit integers
701 if (aType
->getScalarType()
702 != Type::getInt32Ty(inst
->getContext())) {
706 const VectorType
*VT
= dyn_cast
<VectorType
>(aType
);
707 numEle
= VT
->getNumElements();
708 // We currently cannot support more than 4 elements in an intrinsic and we
709 // cannot support Vec3 types.
710 if (numEle
> 4 || numEle
== 3) {
714 BinaryOperator
*ShiftInst
= dyn_cast
<BinaryOperator
>(inst
->getOperand(0));
715 // If the first operand is not a shift instruction, then we can return as it
716 // doesn't match this pattern.
717 if (!ShiftInst
|| !ShiftInst
->isShift()) {
720 // If this is a shift left, then it doesn't match this pattern.
721 if (ShiftInst
->getOpcode() == Instruction::Shl
) {
724 bool isSigned
= ShiftInst
->isArithmeticShift();
725 Constant
*AndMask
= dyn_cast
<Constant
>(inst
->getOperand(1));
726 Constant
*ShrVal
= dyn_cast
<Constant
>(ShiftInst
->getOperand(1));
727 // Let's make sure that the shift value and the AND mask are constants.
728 if (!AndMask
|| !ShrVal
) {
731 Constant
*newMaskConst
;
732 Constant
*shiftValConst
;
734 // Handle the vector case
735 std::vector
<Constant
*> maskVals
;
736 std::vector
<Constant
*> shiftVals
;
737 ConstantVector
*AndMaskVec
= dyn_cast
<ConstantVector
>(AndMask
);
738 ConstantVector
*ShrValVec
= dyn_cast
<ConstantVector
>(ShrVal
);
739 Type
*scalarType
= AndMaskVec
->getType()->getScalarType();
740 assert(AndMaskVec
->getNumOperands() ==
741 ShrValVec
->getNumOperands() && "cannot have a "
742 "combination where the number of elements to a "
743 "shift and an and are different!");
744 for (size_t x
= 0, y
= AndMaskVec
->getNumOperands(); x
< y
; ++x
) {
745 ConstantInt
*AndCI
= dyn_cast
<ConstantInt
>(AndMaskVec
->getOperand(x
));
746 ConstantInt
*ShiftIC
= dyn_cast
<ConstantInt
>(ShrValVec
->getOperand(x
));
747 if (!AndCI
|| !ShiftIC
) {
750 uint32_t maskVal
= (uint32_t)AndCI
->getZExtValue();
751 if (!isMask_32(maskVal
)) {
754 maskVal
= (uint32_t)CountTrailingOnes_32(maskVal
);
755 uint32_t shiftVal
= (uint32_t)ShiftIC
->getZExtValue();
756 // If the mask width or shift value is 32 or more, then break out.
757 if (maskVal
>= 32 || shiftVal
>= 32) {
760 // If the mask val is greater than the number of original bits left
761 // then this optimization is invalid.
762 if (maskVal
> (32 - shiftVal
)) {
765 maskVals
.push_back(ConstantInt::get(scalarType
, maskVal
, isSigned
));
766 shiftVals
.push_back(ConstantInt::get(scalarType
, shiftVal
, isSigned
));
768 newMaskConst
= ConstantVector::get(maskVals
);
769 shiftValConst
= ConstantVector::get(shiftVals
);
771 // Handle the scalar case
772 uint32_t maskVal
= (uint32_t)dyn_cast
<ConstantInt
>(AndMask
)->getZExtValue();
773 // This must be a mask value where all lower bits are set to 1 and then any
774 // bit higher is set to 0.
775 if (!isMask_32(maskVal
)) {
778 maskVal
= (uint32_t)CountTrailingOnes_32(maskVal
);
779 // Count the number of bits set in the mask, this is the width of the
780 // resulting bit set that is extracted from the source value.
781 uint32_t shiftVal
= (uint32_t)dyn_cast
<ConstantInt
>(ShrVal
)->getZExtValue();
782 // If the mask width or shift value is 32 or more, then break out.
783 if (maskVal
>= 32 || shiftVal
>= 32) {
786 // If the mask val is greater than the number of original bits left then
787 // this optimization is invalid.
788 if (maskVal
> (32 - shiftVal
)) {
791 newMaskConst
= ConstantInt::get(aType
, maskVal
, isSigned
);
792 shiftValConst
= ConstantInt::get(aType
, shiftVal
, isSigned
);
794 // Let's create the function signature.
795 std::vector
<Type
*> callTypes
;
796 callTypes
.push_back(aType
);
797 callTypes
.push_back(aType
);
798 callTypes
.push_back(aType
);
799 FunctionType
*funcType
= FunctionType::get(aType
, callTypes
, false);
800 std::string name
= "llvm.AMDIL.bit.extract.u32";
802 name
+= ".v" + itostr(numEle
) + "i32";
806 // Let's create the function.
808 dyn_cast
<Function
>(inst
->getParent()->getParent()->getParent()->
809 getOrInsertFunction(llvm::StringRef(name
), funcType
));
810 Value
*Operands
[3] = {
811 ShiftInst
->getOperand(0),
815 // Let's create the call with the operands.
816 CallInst
*CI
= CallInst::Create(Func
, Operands
, "ByteExtractOpt");
817 CI
->setDoesNotAccessMemory();
818 CI
->insertBefore(inst
);
819 inst
->replaceAllUsesWith(CI
);
824 AMDILPeepholeOpt::expandBFI(CallInst
*CI
)
826 if (!CI
|| mSTM
->calVersion() <= CAL_VERSION_SC_150
) {
829 Value
*LHS
= CI
->getOperand(CI
->getNumOperands() - 1);
830 if (!LHS
->getName().startswith("__amdil_bfi")) {
833 Type
* type
= CI
->getOperand(0)->getType();
834 Constant
*negOneConst
= NULL
;
835 if (type
->isVectorTy()) {
836 std::vector
<Constant
*> negOneVals
;
837 negOneConst
= ConstantInt::get(CI
->getContext(),
838 APInt(32, StringRef("-1"), 10));
840 y
= dyn_cast
<VectorType
>(type
)->getNumElements(); x
< y
; ++x
) {
841 negOneVals
.push_back(negOneConst
);
843 negOneConst
= ConstantVector::get(negOneVals
);
845 negOneConst
= ConstantInt::get(CI
->getContext(),
846 APInt(32, StringRef("-1"), 10));
848 // __amdil_bfi => (A & B) | (~A & C)
849 BinaryOperator
*lhs
=
850 BinaryOperator::Create(Instruction::And
, CI
->getOperand(0),
851 CI
->getOperand(1), "bfi_and", CI
);
852 BinaryOperator
*rhs
=
853 BinaryOperator::Create(Instruction::Xor
, CI
->getOperand(0), negOneConst
,
855 rhs
= BinaryOperator::Create(Instruction::And
, rhs
, CI
->getOperand(2),
857 lhs
= BinaryOperator::Create(Instruction::Or
, lhs
, rhs
, "bfi_or", CI
);
858 CI
->replaceAllUsesWith(lhs
);
863 AMDILPeepholeOpt::expandBFM(CallInst
*CI
)
865 if (!CI
|| mSTM
->calVersion() <= CAL_VERSION_SC_150
) {
868 Value
*LHS
= CI
->getOperand(CI
->getNumOperands() - 1);
869 if (!LHS
->getName().startswith("__amdil_bfm")) {
872 // __amdil_bfm => ((1 << (src0 & 0x1F)) - 1) << (src1 & 0x1f)
873 Constant
*newMaskConst
= NULL
;
874 Constant
*newShiftConst
= NULL
;
875 Type
* type
= CI
->getOperand(0)->getType();
876 if (type
->isVectorTy()) {
877 std::vector
<Constant
*> newMaskVals
, newShiftVals
;
878 newMaskConst
= ConstantInt::get(Type::getInt32Ty(*mCTX
), 0x1F);
879 newShiftConst
= ConstantInt::get(Type::getInt32Ty(*mCTX
), 1);
881 y
= dyn_cast
<VectorType
>(type
)->getNumElements(); x
< y
; ++x
) {
882 newMaskVals
.push_back(newMaskConst
);
883 newShiftVals
.push_back(newShiftConst
);
885 newMaskConst
= ConstantVector::get(newMaskVals
);
886 newShiftConst
= ConstantVector::get(newShiftVals
);
888 newMaskConst
= ConstantInt::get(Type::getInt32Ty(*mCTX
), 0x1F);
889 newShiftConst
= ConstantInt::get(Type::getInt32Ty(*mCTX
), 1);
891 BinaryOperator
*lhs
=
892 BinaryOperator::Create(Instruction::And
, CI
->getOperand(0),
893 newMaskConst
, "bfm_mask", CI
);
894 lhs
= BinaryOperator::Create(Instruction::Shl
, newShiftConst
,
896 lhs
= BinaryOperator::Create(Instruction::Sub
, lhs
,
897 newShiftConst
, "bfm_sub", CI
);
898 BinaryOperator
*rhs
=
899 BinaryOperator::Create(Instruction::And
, CI
->getOperand(1),
900 newMaskConst
, "bfm_mask", CI
);
901 lhs
= BinaryOperator::Create(Instruction::Shl
, lhs
, rhs
, "bfm_shl", CI
);
902 CI
->replaceAllUsesWith(lhs
);
907 AMDILPeepholeOpt::instLevelOptimizations(BasicBlock::iterator
*bbb
)
909 Instruction
*inst
= (*bbb
);
910 if (optimizeCallInst(bbb
)) {
913 if (optimizeBitExtract(inst
)) {
916 if (optimizeBitInsert(inst
)) {
919 if (correctMisalignedMemOp(inst
)) {
925 AMDILPeepholeOpt::correctMisalignedMemOp(Instruction
*inst
)
927 LoadInst
*linst
= dyn_cast
<LoadInst
>(inst
);
928 StoreInst
*sinst
= dyn_cast
<StoreInst
>(inst
);
930 Type
* Ty
= inst
->getType();
932 alignment
= linst
->getAlignment();
933 Ty
= inst
->getType();
935 alignment
= sinst
->getAlignment();
936 Ty
= sinst
->getValueOperand()->getType();
940 unsigned size
= getTypeSize(Ty
);
941 if (size
== alignment
|| size
< alignment
) {
944 if (!Ty
->isStructTy()) {
949 linst
->setAlignment(0);
952 sinst
->setAlignment(0);
959 AMDILPeepholeOpt::isSigned24BitOps(CallInst
*CI
)
964 Value
*LHS
= CI
->getOperand(CI
->getNumOperands() - 1);
965 std::string namePrefix
= LHS
->getName().substr(0, 14);
966 if (namePrefix
!= "__amdil_imad24" && namePrefix
!= "__amdil_imul24"
967 && namePrefix
!= "__amdil__imul24_high") {
970 if (mSTM
->device()->usesHardware(AMDILDeviceInfo::Signed24BitOps
)) {
977 AMDILPeepholeOpt::expandSigned24BitOps(CallInst
*CI
)
979 assert(isSigned24BitOps(CI
) && "Must be a "
980 "signed 24 bit operation to call this function!");
981 Value
*LHS
= CI
->getOperand(CI
->getNumOperands()-1);
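982 // On 7XX and 8XX we do not have signed 24bit, so we need to
983 // expand it to the following:
984 // imul24 turns into 32bit imul
985 // imad24 turns into 32bit imad
986 // imul24_high turns into 32bit imulhigh
// NOTE(review): the 14-character prefix test for "__amdil_imul24" below also
// matches "__amdil_imul24_high", so the imul24_high branch looks unreachable
// as ordered; the longer prefix should probably be tested first.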
987 if (LHS
->getName().substr(0, 14) == "__amdil_imad24") {
988 Type
*aType
= CI
->getOperand(0)->getType();
989 bool isVector
= aType
->isVectorTy();
990 int numEle
= isVector
? dyn_cast
<VectorType
>(aType
)->getNumElements() : 1;
991 std::vector
<Type
*> callTypes
;
992 callTypes
.push_back(CI
->getOperand(0)->getType());
993 callTypes
.push_back(CI
->getOperand(1)->getType());
994 callTypes
.push_back(CI
->getOperand(2)->getType());
995 FunctionType
*funcType
=
996 FunctionType::get(CI
->getOperand(0)->getType(), callTypes
, false);
997 std::string name
= "__amdil_imad";
999 name
+= "_v" + itostr(numEle
) + "i32";
1003 Function
*Func
= dyn_cast
<Function
>(
1004 CI
->getParent()->getParent()->getParent()->
1005 getOrInsertFunction(llvm::StringRef(name
), funcType
));
1006 Value
*Operands
[3] = {
1011 CallInst
*nCI
= CallInst::Create(Func
, Operands
, "imad24");
1012 nCI
->insertBefore(CI
);
1013 CI
->replaceAllUsesWith(nCI
);
1014 } else if (LHS
->getName().substr(0, 14) == "__amdil_imul24") {
1015 BinaryOperator
*mulOp
=
1016 BinaryOperator::Create(Instruction::Mul
, CI
->getOperand(0),
1017 CI
->getOperand(1), "imul24", CI
);
1018 CI
->replaceAllUsesWith(mulOp
);
1019 } else if (LHS
->getName().substr(0, 19) == "__amdil_imul24_high") {
1020 Type
*aType
= CI
->getOperand(0)->getType();
1022 bool isVector
= aType
->isVectorTy();
1023 int numEle
= isVector
? dyn_cast
<VectorType
>(aType
)->getNumElements() : 1;
1024 std::vector
<Type
*> callTypes
;
1025 callTypes
.push_back(CI
->getOperand(0)->getType());
1026 callTypes
.push_back(CI
->getOperand(1)->getType());
1027 FunctionType
*funcType
=
1028 FunctionType::get(CI
->getOperand(0)->getType(), callTypes
, false);
1029 std::string name
= "__amdil_imul_high";
1031 name
+= "_v" + itostr(numEle
) + "i32";
1035 Function
*Func
= dyn_cast
<Function
>(
1036 CI
->getParent()->getParent()->getParent()->
1037 getOrInsertFunction(llvm::StringRef(name
), funcType
));
1038 Value
*Operands
[2] = {
1042 CallInst
*nCI
= CallInst::Create(Func
, Operands
, "imul24_high");
1043 nCI
->insertBefore(CI
);
1044 CI
->replaceAllUsesWith(nCI
);
1049 AMDILPeepholeOpt::isRWGLocalOpt(CallInst
*CI
)
1052 && CI
->getOperand(CI
->getNumOperands() - 1)->getName()
1053 == "__amdil_get_local_size_int");
// Decides whether a call is an "improved divide" call that may be expanded;
// expandAccurateDivide asserts this predicate before rewriting the call.
// The final result is a prefix match: the first 20 characters of the last
// operand's name must equal "__amdil_improved_div", so suffixed variants of
// that name match as well.
// NOTE(review): CI is dereferenced unconditionally in the return expression;
// confirm a null guard precedes it.
1057 AMDILPeepholeOpt::convertAccurateDivide(CallInst
*CI
)
// Special case for generation HD6XXX when the device name is "cayman".
// NOTE(review): presumably this branch rejects the conversion -- confirm.
1062 if (mSTM
->device()->getGeneration() == AMDILDeviceInfo::HD6XXX
1063 && (mSTM
->getDeviceName() == "cayman")) {
// Otherwise the decision rests solely on the callee-name prefix.
1066 return CI
->getOperand(CI
->getNumOperands() - 1)->getName().substr(0, 20)
1067 == "__amdil_improved_div";
// Replaces an accurate-divide call with a plain floating point divide.
// Precondition (asserted): convertAccurateDivide(CI) holds.
// Builds an FDiv BinaryOperator from call operands 0 and 1, named "fdiv32",
// and redirects every use of CI to it. The statements shown here do not erase
// CI itself.
1071 AMDILPeepholeOpt::expandAccurateDivide(CallInst
*CI
)
1073 assert(convertAccurateDivide(CI
)
1074 && "expanding accurate divide can only happen if it is expandable!");
// CI is passed as the last Create() argument, which in LLVM's
// BinaryOperator::Create API is the instruction to insert the new one before.
1075 BinaryOperator
*divOp
=
1076 BinaryOperator::Create(Instruction::FDiv
, CI
->getOperand(0),
1077 CI
->getOperand(1), "fdiv32", CI
);
1078 CI
->replaceAllUsesWith(divOp
);
// Tries to replace a sampler value that is loaded from a global variable with
// that variable's constant initializer.
// Checks visible below, in order:
//   - the optimization level is compared against CodeGenOpt::None;
//   - the callee name (last operand) must be one of the four
//     __amdil_image{2d,3d}_read_{norm,unnorm} functions;
//   - the sampler operand must be a LoadInst from PRIVATE_ADDRESS space;
//   - the load's pointer operand must be a GlobalVariable whose initializer
//     exists and has 32-bit integer type.
// When all of these hold, every use of the load is replaced by the
// initializer constant.
// NOTE(review): presumably each failed check returns early without changing
// the IR -- confirm against the guard bodies.
1082 AMDILPeepholeOpt::propagateSamplerInst(CallInst
*CI
)
1084 if (optLevel
!= CodeGenOpt::None
) {
// The callee is stored as the last operand of the call.
1092 unsigned funcNameIdx
= 0;
1093 funcNameIdx
= CI
->getNumOperands() - 1;
1094 StringRef calleeName
= CI
->getOperand(funcNameIdx
)->getName();
// Exact name comparison: only these four image read functions are handled.
1095 if (calleeName
!= "__amdil_image2d_read_norm"
1096 && calleeName
!= "__amdil_image2d_read_unnorm"
1097 && calleeName
!= "__amdil_image3d_read_norm"
1098 && calleeName
!= "__amdil_image3d_read_unnorm") {
1102 unsigned samplerIdx
= 2;
1104 Value
*sampler
= CI
->getOperand(samplerIdx
);
// NOTE(review): dyn_cast yields null when the sampler is not a load; confirm
// lInst is null-checked before the dereference that follows.
1105 LoadInst
*lInst
= dyn_cast
<LoadInst
>(sampler
);
1110 if (lInst
->getPointerAddressSpace() != AMDILAS::PRIVATE_ADDRESS
) {
1114 GlobalVariable
*gv
= dyn_cast
<GlobalVariable
>(lInst
->getPointerOperand());
1115 // If we are loading from what is not a global value, then we
1121 // If we don't have an initializer or we have an initializer and
1122 // the initializer is not a 32bit integer, we fail.
1123 if (!gv
->hasInitializer()
1124 || !gv
->getInitializer()->getType()->isIntegerTy(32)) {
1128 // Now that we have the global variable initializer, lets replace
1129 // all uses of the load instruction with the samplerVal and
1130 // reparse the __amdil_is_constant() function.
1131 Constant
*samplerVal
= gv
->getInitializer();
1132 lInst
->replaceAllUsesWith(samplerVal
);
// Pass hook that receives the module M; the class definition declares it as
// returning bool.
1137 AMDILPeepholeOpt::doInitialization(Module
&M
)
// Pass hook that receives the module M; the class definition declares it as
// returning bool.
1143 AMDILPeepholeOpt::doFinalization(Module
&M
)
// Declares this pass's analysis dependencies: it requires
// MachineFunctionAnalysis, then forwards AU to the FunctionPass base
// implementation, and finally marks every analysis as preserved.
1149 AMDILPeepholeOpt::getAnalysisUsage(AnalysisUsage
&AU
) const
1151 AU
.addRequired
<MachineFunctionAnalysis
>();
1152 FunctionPass::getAnalysisUsage(AU
);
1153 AU
.setPreservesAll();
// Generic entry point for computing the size of an LLVM type. Switches on the
// type ID of T and forwards to the overload for the matching concrete type
// class, passing dereferencePtr through unchanged.
//   T              - type to measure.
//   dereferencePtr - forwarded to the typed overloads; the PointerType
//                    overload uses it to decide whether to add up the sizes
//                    of the pointer's contained types.
// NOTE(review): confirm that every case ends with a break and that `size` is
// initialized before the switch and returned after it.
1156 size_t AMDILPeepholeOpt::getTypeSize(Type
* const T
, bool dereferencePtr
) {
1161 switch (T
->getTypeID()) {
// These type IDs are rejected by an assert.
1162 case Type::X86_FP80TyID
:
1163 case Type::FP128TyID
:
1164 case Type::PPC_FP128TyID
:
1165 case Type::LabelTyID
:
1166 assert(0 && "These types are not supported by this backend");
// float/double: primitive size in bits shifted right by 3, i.e. bytes.
1168 case Type::FloatTyID
:
1169 case Type::DoubleTyID
:
1170 size
= T
->getPrimitiveSizeInBits() >> 3;
// Each remaining case casts T to the class selected by the type ID that was
// just matched and defers to the corresponding overload.
1172 case Type::PointerTyID
:
1173 size
= getTypeSize(dyn_cast
<PointerType
>(T
), dereferencePtr
);
1175 case Type::IntegerTyID
:
1176 size
= getTypeSize(dyn_cast
<IntegerType
>(T
), dereferencePtr
);
1178 case Type::StructTyID
:
1179 size
= getTypeSize(dyn_cast
<StructType
>(T
), dereferencePtr
);
1181 case Type::ArrayTyID
:
1182 size
= getTypeSize(dyn_cast
<ArrayType
>(T
), dereferencePtr
);
1184 case Type::FunctionTyID
:
1185 size
= getTypeSize(dyn_cast
<FunctionType
>(T
), dereferencePtr
);
1187 case Type::VectorTyID
:
1188 size
= getTypeSize(dyn_cast
<VectorType
>(T
), dereferencePtr
);
// Size of a struct type: walks the element types from element_begin() to
// element_end() and accumulates getTypeSize() of each one into `size`,
// forwarding dereferencePtr.
// NOTE(review): curType is presumably the element type at the current
// iterator position -- confirm. The accumulation shown adds element sizes
// only; no padding term is visible.
1194 size_t AMDILPeepholeOpt::getTypeSize(StructType
* const ST
,
1195 bool dereferencePtr
) {
1201 StructType::element_iterator eib
;
1202 StructType::element_iterator eie
;
1203 for (eib
= ST
->element_begin(), eie
= ST
->element_end(); eib
!= eie
; ++eib
) {
1205 size
+= getTypeSize(curType
, dereferencePtr
);
// Size of an integer type: its bit width shifted right by 3 (divided by
// eight, remainder discarded, so widths below 8 bits yield 0). Returns 0 when
// IT is null. dereferencePtr is not used by this overload.
1210 size_t AMDILPeepholeOpt::getTypeSize(IntegerType
* const IT
,
1211 bool dereferencePtr
) {
1212 return IT
? (IT
->getBitWidth() >> 3) : 0;
// Function types are not expected to be sized: this overload asserts
// unconditionally.
// NOTE(review): confirm what value is returned when assertions are compiled
// out.
1215 size_t AMDILPeepholeOpt::getTypeSize(FunctionType
* const FT
,
1216 bool dereferencePtr
) {
1217 assert(0 && "Should not be able to calculate the size of an function type");
// Size of an array type: the size of its element type (computed with
// dereferencePtr forwarded) multiplied by the number of elements, when AT is
// non-null.
// NOTE(review): the value used for a null AT is presumably 0, matching the
// IntegerType and VectorType overloads -- confirm.
1221 size_t AMDILPeepholeOpt::getTypeSize(ArrayType
* const AT
,
1222 bool dereferencePtr
) {
1223 return (size_t)(AT
? (getTypeSize(AT
->getElementType(),
1224 dereferencePtr
) * AT
->getNumElements())
// Size of a vector type: its total bit width shifted right by 3 (divided by
// eight, remainder discarded). Returns 0 when VT is null. dereferencePtr is
// not used by this overload.
1228 size_t AMDILPeepholeOpt::getTypeSize(VectorType
* const VT
,
1229 bool dereferencePtr
) {
1230 return VT
? (VT
->getBitWidth() >> 3) : 0;
// Size associated with a pointer type. Two cases are visible:
//   - the pointee is a struct and the pointer lives in the PRIVATE address
//     space: the size of the pointed-to struct is returned;
//   - otherwise, if dereferencePtr is set: the sizes of the pointer's
//     contained types are added up.
// NOTE(review): confirm what is returned when neither case applies, and that
// PT is checked for null before it is dereferenced.
1233 size_t AMDILPeepholeOpt::getTypeSize(PointerType
* const PT
,
1234 bool dereferencePtr
) {
1238 Type
*CT
= PT
->getElementType();
1239 if (CT
->getTypeID() == Type::StructTyID
&&
1240 PT
->getAddressSpace() == AMDILAS::PRIVATE_ADDRESS
) {
// Called with a single argument, so dereferencePtr is not forwarded here
// (the overload presumably declares a default for it -- confirm).
1241 return getTypeSize(dyn_cast
<StructType
>(CT
));
1242 } else if (dereferencePtr
) {
1244 for (size_t x
= 0, y
= PT
->getNumContainedTypes(); x
< y
; ++x
) {
1245 size
+= getTypeSize(PT
->getContainedType(x
), dereferencePtr
);
1253 size_t AMDILPeepholeOpt::getTypeSize(OpaqueType
* const OT
,
1254 bool dereferencePtr
) {
1255 //assert(0 && "Should not be able to calculate the size of an opaque type");