//===-- AMDILPeepholeOptimizer.cpp - AMDIL peephole optimizations ------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "PeepholeOpt"
#define DEBUGME (DebugFlag && isCurrentDebugType(DEBUG_TYPE))
#include "AMDILAlgorithms.tpp"
#include "AMDILDevices.h"
#include "AMDILUtilityFunctions.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionAnalysis.h"
#include "llvm/Function.h"
#include "llvm/Instructions.h"
#include "llvm/Module.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
using namespace llvm;

STATISTIC(PointerAssignments, "Number of dynamic pointer "
          "assignments discovered");
STATISTIC(PointerSubtract, "Number of pointer subtractions discovered");
// The Peephole optimization pass is used to do simple last-minute optimizations
// that are required for correct code or to remove redundant functions.
namespace {
class LLVM_LIBRARY_VISIBILITY AMDILPeepholeOpt : public FunctionPass {
public:
  TargetMachine &TM;
  static char ID;
  AMDILPeepholeOpt(TargetMachine &tm AMDIL_OPT_LEVEL_DECL);
  ~AMDILPeepholeOpt();
  const char *getPassName() const;
  bool runOnFunction(Function &F);
  bool doInitialization(Module &M);
  bool doFinalization(Module &M);
  void getAnalysisUsage(AnalysisUsage &AU) const;
private:
  // Function to initiate all of the instruction level optimizations.
  bool instLevelOptimizations(BasicBlock::iterator *inst);
  // Quick check to see if we need to dump all of the pointers into the
  // arena. If so, we set all pointers to exist in the arena. This is a
  // workaround for aliasing of pointers in a struct/union.
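  // Illustrative example (not from the original source): a kernel argument
  // such as
  //   struct S { int *p; int *q; } *arg
  // may alias *p and *q, so every pointer is conservatively placed in the
  // arena (see the FIXME in dumpAllIntoArena below).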
  bool dumpAllIntoArena(Function &F);
  // Because I don't want to invalidate any pointers while in the
  // safeNestedForEachFunction, I push atomic conversions to a vector
  // and handle them later. This function does the conversions if required.
  void doAtomicConversionIfNeeded(Function &F);
  // Because __amdil_is_constant cannot be properly evaluated if
  // optimizations are disabled, the calls are placed in a vector
  // and evaluated after the __amdil_image* functions are evaluated,
  // which should allow the __amdil_is_constant function to be
  // evaluated correctly.
  void doIsConstCallConversionIfNeeded();
  bool mChanged;
  bool mDebug;
  bool mConvertAtomics;
  CodeGenOpt::Level optLevel;
  // Run a series of tests to see if we can optimize a CALL instruction.
  bool optimizeCallInst(BasicBlock::iterator *bbb);
  // A peephole optimization to optimize bit extract sequences.
  bool optimizeBitExtract(Instruction *inst);
  // A peephole optimization to optimize bit insert sequences.
  bool optimizeBitInsert(Instruction *inst);
  bool setupBitInsert(Instruction *base,
                      Instruction *&src,
                      Constant *&mask,
                      Constant *&shift);
  // Expand the bit field insert instruction on versions of OpenCL that
  // don't support the hardware instruction.
  bool expandBFI(CallInst *CI);
  // Expand the bit field mask instruction on versions of OpenCL that
  // don't support the hardware instruction.
  bool expandBFM(CallInst *CI);
  // On 7XX and 8XX hardware, we do not have 24-bit signed operations, so in
  // that case we need to expand them. These functions check for 24-bit
  // builtins and expand them to 32-bit equivalents when needed.
  bool isSigned24BitOps(CallInst *CI);
  void expandSigned24BitOps(CallInst *CI);
  // One optimization that can occur is that if the required workgroup size is
  // specified, then the result of get_local_size is known at compile time and
  // can be returned as a constant.
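  // Illustrative example: with __attribute__((reqd_work_group_size(64, 1, 1))),
  // get_local_size(0) can be folded to the constant 64.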
  bool isRWGLocalOpt(CallInst *CI);
  // On Northern Islands cards, the division is slightly less accurate than on
  // previous generations, so we need to utilize a more accurate division. So we
  // can translate the accurate divide to a normal divide on all other cards.
  bool convertAccurateDivide(CallInst *CI);
  void expandAccurateDivide(CallInst *CI);
  // If the alignment is set incorrectly, it can produce really inefficient
  // code. This checks for this scenario and fixes it if possible.
  bool correctMisalignedMemOp(Instruction *inst);
  // If we are in no-opt mode, then we need to make sure that
  // local samplers are properly propagated as constant propagation
  // doesn't occur and we need to know the value of kernel-defined
  // samplers at compile time.
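  // Illustrative example: a kernel-defined sampler such as
  //   const sampler_t s = CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
  // is stored as a 32-bit integer global whose initializer can be
  // substituted for the load of the sampler.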
  bool propagateSamplerInst(CallInst *CI);
  LLVMContext *mCTX;
  const AMDILSubtarget *mSTM;
  SmallVector<std::pair<CallInst *, Function *>, 16> atomicFuncs;
  SmallVector<CallInst *, 16> isConstVec;
}; // class AMDILPeepholeOpt
char AMDILPeepholeOpt::ID = 0;
} // anonymous namespace

namespace llvm {
FunctionPass *
createAMDILPeepholeOpt(TargetMachine &tm AMDIL_OPT_LEVEL_DECL) {
  return new AMDILPeepholeOpt(tm AMDIL_OPT_LEVEL_VAR);
}
} // namespace llvm
AMDILPeepholeOpt::AMDILPeepholeOpt(TargetMachine &tm AMDIL_OPT_LEVEL_DECL)
  : FunctionPass(ID), TM(tm) {
  mDebug = DEBUGME;
  optLevel = TM.getOptLevel();
}

AMDILPeepholeOpt::~AMDILPeepholeOpt() {
}
const char *
AMDILPeepholeOpt::getPassName() const {
  return "AMDIL PeepHole Optimization Pass";
}
static bool
containsPointerType(Type *Ty) {
  if (!Ty) {
    return false;
  }
  switch (Ty->getTypeID()) {
  default:
    return false;
  case Type::StructTyID: {
    const StructType *ST = dyn_cast<StructType>(Ty);
    for (StructType::element_iterator stb = ST->element_begin(),
         ste = ST->element_end(); stb != ste; ++stb) {
      if (!containsPointerType(*stb)) {
        continue;
      }
      return true;
    }
    break;
  }
  case Type::VectorTyID:
  case Type::ArrayTyID:
    return containsPointerType(dyn_cast<SequentialType>(Ty)->getElementType());
  case Type::PointerTyID:
    return true;
  };
  return false;
}
bool
AMDILPeepholeOpt::dumpAllIntoArena(Function &F) {
  bool dumpAll = false;
  for (Function::const_arg_iterator cab = F.arg_begin(),
       cae = F.arg_end(); cab != cae; ++cab) {
    const Argument *arg = cab;
    const PointerType *PT = dyn_cast<PointerType>(arg->getType());
    if (!PT) {
      continue;
    }
    Type *DereferencedType = PT->getElementType();
    if (!dyn_cast<StructType>(DereferencedType)) {
      continue;
    }
    if (!containsPointerType(DereferencedType)) {
      continue;
    }
    // FIXME: Because a pointer inside of a struct/union may be aliased to
    // another pointer we need to take the conservative approach and place all
    // pointers into the arena until more advanced detection is implemented.
    dumpAll = true;
  }
  return dumpAll;
}
void
AMDILPeepholeOpt::doIsConstCallConversionIfNeeded() {
  if (isConstVec.empty()) {
    return;
  }
  for (unsigned x = 0, y = isConstVec.size(); x < y; ++x) {
    CallInst *CI = isConstVec[x];
    Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
    Type *aType = Type::getInt32Ty(*mCTX);
    Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
                              : ConstantInt::get(aType, 0);
    CI->replaceAllUsesWith(Val);
    CI->eraseFromParent();
  }
  isConstVec.clear();
}
void
AMDILPeepholeOpt::doAtomicConversionIfNeeded(Function &F) {
  // Don't do anything if we don't have any atomic operations.
  if (atomicFuncs.empty()) {
    return;
  }
  // Change the function name for the atomic if it is required.
  uint32_t size = atomicFuncs.size();
  for (uint32_t x = 0; x < size; ++x) {
    atomicFuncs[x].first->setOperand(
        atomicFuncs[x].first->getNumOperands()-1,
        atomicFuncs[x].second);
  }
  mChanged = true;
  if (mConvertAtomics) {
    return;
  }
}
bool
AMDILPeepholeOpt::runOnFunction(Function &MF) {
  mChanged = false;
  mSTM = &TM.getSubtarget<AMDILSubtarget>();
  mCTX = &MF.getType()->getContext();
  mConvertAtomics = true;
  safeNestedForEach(MF.begin(), MF.end(), MF.begin()->begin(),
      std::bind1st(std::mem_fun(&AMDILPeepholeOpt::instLevelOptimizations),
                   this));

  doAtomicConversionIfNeeded(MF);
  doIsConstCallConversionIfNeeded();

  return mChanged;
}
bool
AMDILPeepholeOpt::optimizeCallInst(BasicBlock::iterator *bbb) {
  Instruction *inst = (*bbb);
  CallInst *CI = dyn_cast<CallInst>(inst);
  if (!CI) {
    return false;
  }
  if (isSigned24BitOps(CI)) {
    expandSigned24BitOps(CI);
    ++(*bbb);
    CI->eraseFromParent();
    return true;
  }
  if (propagateSamplerInst(CI)) {
    return false;
  }
  if (expandBFI(CI) || expandBFM(CI)) {
    ++(*bbb);
    CI->eraseFromParent();
    return true;
  }
  if (convertAccurateDivide(CI)) {
    expandAccurateDivide(CI);
    ++(*bbb);
    CI->eraseFromParent();
    return true;
  }
  StringRef calleeName = CI->getOperand(CI->getNumOperands()-1)->getName();
  if (calleeName.startswith("__amdil_is_constant")) {
    // If we do not have optimizations, then this
    // cannot be properly evaluated, so we add the
    // call instruction to a vector and process
    // them at the end of processing after the
    // samplers have been correctly handled.
    if (optLevel == CodeGenOpt::None) {
      isConstVec.push_back(CI);
      return false;
    }
    Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
    Type *aType = Type::getInt32Ty(*mCTX);
    Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
                              : ConstantInt::get(aType, 0);
    CI->replaceAllUsesWith(Val);
    ++(*bbb);
    CI->eraseFromParent();
    return true;
  }
  if (calleeName.equals("__amdil_is_asic_id_i32")) {
    ConstantInt *CV = dyn_cast<ConstantInt>(CI->getOperand(0));
    Type *aType = Type::getInt32Ty(*mCTX);
    Value *Val = CV;
    if (Val) {
      Val = ConstantInt::get(aType,
          mSTM->device()->getDeviceFlag() & CV->getZExtValue());
    } else {
      Val = ConstantInt::get(aType, 0);
    }
    CI->replaceAllUsesWith(Val);
    ++(*bbb);
    CI->eraseFromParent();
    return true;
  }
  Function *F = dyn_cast<Function>(CI->getOperand(CI->getNumOperands()-1));
  if (!F) {
    return false;
  }
  if (F->getName().startswith("__atom") && !CI->getNumUses()
      && F->getName().find("_xchg") == StringRef::npos) {
    std::string buffer(F->getName().str() + "_noret");
    F = dyn_cast<Function>(
        F->getParent()->getOrInsertFunction(buffer, F->getFunctionType()));
    atomicFuncs.push_back(std::make_pair<CallInst *, Function *>(CI, F));
  }
  if (!mSTM->device()->isSupported(AMDILDeviceInfo::ArenaSegment)
      && !mSTM->device()->isSupported(AMDILDeviceInfo::MultiUAV)) {
    return false;
  }
  if (!mConvertAtomics) {
    return false;
  }
  StringRef name = F->getName();
  if (name.startswith("__atom") && name.find("_g") != StringRef::npos) {
    mConvertAtomics = false;
  }
  return false;
}
bool
AMDILPeepholeOpt::setupBitInsert(Instruction *base,
                                 Instruction *&src,
                                 Constant *&mask,
                                 Constant *&shift) {
  if (!base) {
    if (mDebug) {
      dbgs() << "Null pointer passed into function.\n";
    }
    return false;
  }
  bool andOp = false;
  if (base->getOpcode() == Instruction::Shl) {
    shift = dyn_cast<Constant>(base->getOperand(1));
  } else if (base->getOpcode() == Instruction::And) {
    mask = dyn_cast<Constant>(base->getOperand(1));
    andOp = true;
  } else {
    if (mDebug) {
      dbgs() << "Failed setup with no Shl or And instruction on base opcode!\n";
    }
    // If the base is neither a Shl nor an And, we don't fit any of the patterns above.
    return false;
  }
  src = dyn_cast<Instruction>(base->getOperand(0));
  if (!src) {
    if (mDebug) {
      dbgs() << "Failed setup since the base operand is not an instruction!\n";
    }
    return false;
  }
  // If we find an 'and' operation, then we don't need to
  // find the next operation as we already know the
  // bits that are valid at this point.
  if (andOp) {
    return true;
  }
  if (src->getOpcode() == Instruction::Shl && !shift) {
    shift = dyn_cast<Constant>(src->getOperand(1));
    src = dyn_cast<Instruction>(src->getOperand(0));
  } else if (src->getOpcode() == Instruction::And && !mask) {
    mask = dyn_cast<Constant>(src->getOperand(1));
  }
  if (!mask && !shift) {
    if (mDebug) {
      dbgs() << "Failed setup since both mask and shift are NULL!\n";
    }
    // Did not find a constant mask or a shift.
    return false;
  }
  return true;
}
bool
AMDILPeepholeOpt::optimizeBitInsert(Instruction *inst) {
  if (!inst) {
    return false;
  }
  if (!inst->isBinaryOp()) {
    return false;
  }
  if (inst->getOpcode() != Instruction::Or) {
    return false;
  }
  if (optLevel == CodeGenOpt::None) {
    return false;
  }
  // We want to do an optimization on a sequence of ops that in the end equals a
  // single ISA instruction.
  // The base pattern for this optimization is - ((A & B) << C) | ((D & E) << F)
  // Some simplified versions of this pattern are as follows:
  // (A & B) | (D & E) when B & E == 0 && C == 0 && F == 0
  // ((A & B) << C) | (D & E) when B ^ E == 0 && (1 << C) >= E
  // (A & B) | ((D & E) << F) when B ^ E == 0 && (1 << F) >= B
  // (A & B) | (D << F) when (1 << F) >= B
  // (A << C) | (D & E) when (1 << C) >= E
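  // A minimal illustrative sketch (not from the original source): with
  //   (a & 0x0000FFFF) | (b & 0xFFFF0000)
  // the low 16 bits come from a and the high 16 bits from b, so the OR
  // collapses to a single __amdil_ubit_insert with width = 16, offset = 16.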
  if (mSTM->device()->getGeneration() == AMDILDeviceInfo::HD4XXX) {
    // The HD4XXX hardware doesn't support the ubit_insert instruction.
    return false;
  }
  Type *aType = inst->getType();
  bool isVector = aType->isVectorTy();
  int numEle = 1;
  // This optimization only works on 32bit integers.
  if (aType->getScalarType()
      != Type::getInt32Ty(inst->getContext())) {
    return false;
  }
  if (isVector) {
    const VectorType *VT = dyn_cast<VectorType>(aType);
    numEle = VT->getNumElements();
    // We currently cannot support more than 4 elements in an intrinsic and we
    // cannot support Vec3 types.
    if (numEle > 4 || numEle == 3) {
      return false;
    }
    // TODO: Handle vectors.
    if (mDebug) {
      dbgs() << "!!! Vectors are not supported yet!\n";
    }
    return false;
  }
  Instruction *LHSSrc = NULL, *RHSSrc = NULL;
  Constant *LHSMask = NULL, *RHSMask = NULL;
  Constant *LHSShift = NULL, *RHSShift = NULL;
  Instruction *LHS = dyn_cast<Instruction>(inst->getOperand(0));
  Instruction *RHS = dyn_cast<Instruction>(inst->getOperand(1));
  if (!setupBitInsert(LHS, LHSSrc, LHSMask, LHSShift)) {
    if (mDebug) {
      dbgs() << "Found an OR Operation that failed setup!\n";
      inst->dump();
      if (LHS) { LHS->dump(); }
      if (LHSSrc) { LHSSrc->dump(); }
      if (LHSMask) { LHSMask->dump(); }
      if (LHSShift) { LHSShift->dump(); }
    }
    // There was an issue with the setup for BitInsert.
    return false;
  }
  if (!setupBitInsert(RHS, RHSSrc, RHSMask, RHSShift)) {
    if (mDebug) {
      dbgs() << "Found an OR Operation that failed setup!\n";
      inst->dump();
      if (RHS) { RHS->dump(); }
      if (RHSSrc) { RHSSrc->dump(); }
      if (RHSMask) { RHSMask->dump(); }
      if (RHSShift) { RHSShift->dump(); }
    }
    // There was an issue with the setup for BitInsert.
    return false;
  }
496 dbgs() << "Found an OR operation that can possible be optimized to ubit insert!\n";
497 dbgs() << "Op: "; inst
->dump();
498 dbgs() << "LHS: "; if (LHS
) { LHS
->dump(); } else { dbgs() << "(None)\n"; }
499 dbgs() << "LHS Src: "; if (LHSSrc
) { LHSSrc
->dump(); } else { dbgs() << "(None)\n"; }
500 dbgs() << "LHS Mask: "; if (LHSMask
) { LHSMask
->dump(); } else { dbgs() << "(None)\n"; }
501 dbgs() << "LHS Shift: "; if (LHSShift
) { LHSShift
->dump(); } else { dbgs() << "(None)\n"; }
502 dbgs() << "RHS: "; if (RHS
) { RHS
->dump(); } else { dbgs() << "(None)\n"; }
503 dbgs() << "RHS Src: "; if (RHSSrc
) { RHSSrc
->dump(); } else { dbgs() << "(None)\n"; }
504 dbgs() << "RHS Mask: "; if (RHSMask
) { RHSMask
->dump(); } else { dbgs() << "(None)\n"; }
505 dbgs() << "RHS Shift: "; if (RHSShift
) { RHSShift
->dump(); } else { dbgs() << "(None)\n"; }
  Constant *offset = NULL;
  Constant *width = NULL;
  int32_t lhsMaskVal = 0, rhsMaskVal = 0;
  int32_t lhsShiftVal = 0, rhsShiftVal = 0;
  int32_t lhsMaskWidth = 0, rhsMaskWidth = 0;
  int32_t lhsMaskOffset = 0, rhsMaskOffset = 0;
  lhsMaskVal = (int32_t)(LHSMask
      ? dyn_cast<ConstantInt>(LHSMask)->getZExtValue() : 0);
  rhsMaskVal = (int32_t)(RHSMask
      ? dyn_cast<ConstantInt>(RHSMask)->getZExtValue() : 0);
  lhsShiftVal = (int32_t)(LHSShift
      ? dyn_cast<ConstantInt>(LHSShift)->getZExtValue() : 0);
  rhsShiftVal = (int32_t)(RHSShift
      ? dyn_cast<ConstantInt>(RHSShift)->getZExtValue() : 0);
  lhsMaskWidth = lhsMaskVal ? CountPopulation_32(lhsMaskVal) : 32 - lhsShiftVal;
  rhsMaskWidth = rhsMaskVal ? CountPopulation_32(rhsMaskVal) : 32 - rhsShiftVal;
  lhsMaskOffset = lhsMaskVal ? CountTrailingZeros_32(lhsMaskVal) : lhsShiftVal;
  rhsMaskOffset = rhsMaskVal ? CountTrailingZeros_32(rhsMaskVal) : rhsShiftVal;
  // TODO: Handle the case of A & B | D & ~B (i.e. inverted masks).
  if (mDebug) {
    dbgs() << "Found pattern: \'((A" << (LHSMask ? " & B)" : ")");
    dbgs() << (LHSShift ? " << C)" : ")") << " | ((D";
    dbgs() << (RHSMask ? " & E)" : ")");
    dbgs() << (RHSShift ? " << F)\'\n" : ")\'\n");
    dbgs() << "A = LHSSrc\t\tD = RHSSrc \n";
    dbgs() << "B = " << lhsMaskVal << "\t\tE = " << rhsMaskVal << "\n";
    dbgs() << "C = " << lhsShiftVal << "\t\tF = " << rhsShiftVal << "\n";
    dbgs() << "width(B) = " << lhsMaskWidth;
    dbgs() << "\twidth(E) = " << rhsMaskWidth << "\n";
    dbgs() << "offset(B) = " << lhsMaskOffset;
    dbgs() << "\toffset(E) = " << rhsMaskOffset << "\n";
    dbgs() << "Constraints: \n";
    dbgs() << "\t(1) B ^ E == 0\n";
    dbgs() << "\t(2-LHS) B is a mask\n";
    dbgs() << "\t(2-RHS) E is a mask\n";
    dbgs() << "\t(3-LHS) (offset(B)) >= (width(E) + offset(E))\n";
    dbgs() << "\t(3-RHS) (offset(E)) >= (width(B) + offset(B))\n";
  }
  if ((lhsMaskVal || rhsMaskVal) && !(lhsMaskVal ^ rhsMaskVal)) {
    if (mDebug) {
      dbgs() << lhsMaskVal << " ^ " << rhsMaskVal;
      dbgs() << " = " << (lhsMaskVal ^ rhsMaskVal) << "\n";
      dbgs() << "Failed constraint 1!\n";
    }
    return false;
  }
554 dbgs() << "LHS = " << lhsMaskOffset
<< "";
555 dbgs() << " >= (" << rhsMaskWidth
<< " + " << rhsMaskOffset
<< ") = ";
556 dbgs() << (lhsMaskOffset
>= (rhsMaskWidth
+ rhsMaskOffset
));
557 dbgs() << "\nRHS = " << rhsMaskOffset
<< "";
558 dbgs() << " >= (" << lhsMaskWidth
<< " + " << lhsMaskOffset
<< ") = ";
559 dbgs() << (rhsMaskOffset
>= (lhsMaskWidth
+ lhsMaskOffset
));
  if (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset)) {
    offset = ConstantInt::get(aType, lhsMaskOffset, false);
    width = ConstantInt::get(aType, lhsMaskWidth, false);
    RHSSrc = RHS;
    if (!isMask_32(lhsMaskVal) && !isShiftedMask_32(lhsMaskVal)) {
      if (mDebug) {
        dbgs() << "Value is not a Mask: " << lhsMaskVal << "\n";
        dbgs() << "Failed constraint 2!\n";
      }
      return false;
    }
    if (!LHSShift) {
      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
          "MaskShr", LHS);
    } else if (lhsShiftVal != lhsMaskOffset) {
      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
          "MaskShr", LHS);
    }
    if (mDebug) {
      dbgs() << "Optimizing LHS!\n";
    }
  } else if (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset)) {
    offset = ConstantInt::get(aType, rhsMaskOffset, false);
    width = ConstantInt::get(aType, rhsMaskWidth, false);
    LHSSrc = RHSSrc;
    RHSSrc = LHS;
    if (!isMask_32(rhsMaskVal) && !isShiftedMask_32(rhsMaskVal)) {
      if (mDebug) {
        dbgs() << "Non-Mask: " << rhsMaskVal << "\n";
        dbgs() << "Failed constraint 2!\n";
      }
      return false;
    }
    if (!RHSShift) {
      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
          "MaskShr", RHS);
    } else if (rhsShiftVal != rhsMaskOffset) {
      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
          "MaskShr", RHS);
    }
    if (mDebug) {
      dbgs() << "Optimizing RHS!\n";
    }
  } else {
    if (mDebug) {
      dbgs() << "Failed constraint 3!\n";
    }
    return false;
  }
612 dbgs() << "Width: "; if (width
) { width
->dump(); } else { dbgs() << "(0)\n"; }
613 dbgs() << "Offset: "; if (offset
) { offset
->dump(); } else { dbgs() << "(0)\n"; }
614 dbgs() << "LHSSrc: "; if (LHSSrc
) { LHSSrc
->dump(); } else { dbgs() << "(0)\n"; }
615 dbgs() << "RHSSrc: "; if (RHSSrc
) { RHSSrc
->dump(); } else { dbgs() << "(0)\n"; }
617 if (!offset
|| !width
) {
619 dbgs() << "Either width or offset are NULL, failed detection!\n";
  // Let's create the function signature.
  std::vector<Type *> callTypes;
  callTypes.push_back(aType);
  callTypes.push_back(aType);
  callTypes.push_back(aType);
  callTypes.push_back(aType);
  FunctionType *funcType = FunctionType::get(aType, callTypes, false);
  std::string name = "__amdil_ubit_insert";
  if (isVector) { name += "_v" + itostr(numEle) + "u32"; } else { name += "_u32"; }
  Function *Func =
    dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
                       getOrInsertFunction(llvm::StringRef(name), funcType));
  Value *Operands[4] = {
    width,
    offset,
    LHSSrc,
    RHSSrc
  };
  CallInst *CI = CallInst::Create(Func, Operands, "BitInsertOpt");
  if (mDebug) {
    dbgs() << "Old Inst: ";
    inst->dump();
    dbgs() << "New Inst: ";
    CI->dump();
    dbgs() << "\n\n";
  }
  CI->insertBefore(inst);
  inst->replaceAllUsesWith(CI);
  return true;
}
bool
AMDILPeepholeOpt::optimizeBitExtract(Instruction *inst) {
  if (!inst) {
    return false;
  }
  if (!inst->isBinaryOp()) {
    return false;
  }
  if (inst->getOpcode() != Instruction::And) {
    return false;
  }
  if (optLevel == CodeGenOpt::None) {
    return false;
  }
  // We want to do some simple optimizations on Shift right/And patterns. The
  // basic optimization is to turn (A >> B) & C where A is a 32bit type, B is a
  // value smaller than 32 and C is a mask. If C is a constant value, then the
  // following transformation can occur. For signed integers, it turns into the
  // function call dst = __amdil_ibit_extract(log2(C), B, A). For unsigned
  // integers, it turns into the function call
  // dst = __amdil_ubit_extract(log2(C), B, A). The function
  // __amdil_[u|i]bit_extract can be found in Section 7.9 of the ATI IL spec
  // of the stream SDK for Evergreen hardware.
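  // Illustrative example (not from the original source): (x >> 3) & 0xFF has
  // a mask of width 8 and a shift of 3, so the pair of instructions becomes
  // __amdil_ubit_extract(8, 3, x).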
  if (mSTM->device()->getGeneration() == AMDILDeviceInfo::HD4XXX) {
    // This does not work on HD4XXX hardware.
    return false;
  }
  Type *aType = inst->getType();
  bool isVector = aType->isVectorTy();
  int numEle = 1;
  // This only works on 32bit integers.
  if (aType->getScalarType()
      != Type::getInt32Ty(inst->getContext())) {
    return false;
  }
  if (isVector) {
    const VectorType *VT = dyn_cast<VectorType>(aType);
    numEle = VT->getNumElements();
    // We currently cannot support more than 4 elements in an intrinsic and we
    // cannot support Vec3 types.
    if (numEle > 4 || numEle == 3) {
      return false;
    }
  }
  BinaryOperator *ShiftInst = dyn_cast<BinaryOperator>(inst->getOperand(0));
  // If the first operand is not a shift instruction, then we can return as it
  // doesn't match this pattern.
  if (!ShiftInst || !ShiftInst->isShift()) {
    return false;
  }
  // If we are a shift left, then we don't match this pattern.
  if (ShiftInst->getOpcode() == Instruction::Shl) {
    return false;
  }
  bool isSigned = ShiftInst->isArithmeticShift();
  Constant *AndMask = dyn_cast<Constant>(inst->getOperand(1));
  Constant *ShrVal = dyn_cast<Constant>(ShiftInst->getOperand(1));
  // Let's make sure that the shift value and the and mask are constant integers.
  if (!AndMask || !ShrVal) {
    return false;
  }
  Constant *newMaskConst;
  Constant *shiftValConst;
  if (isVector) {
    // Handle the vector case.
    std::vector<Constant *> maskVals;
    std::vector<Constant *> shiftVals;
    ConstantVector *AndMaskVec = dyn_cast<ConstantVector>(AndMask);
    ConstantVector *ShrValVec = dyn_cast<ConstantVector>(ShrVal);
    Type *scalarType = AndMaskVec->getType()->getScalarType();
    assert(AndMaskVec->getNumOperands() ==
           ShrValVec->getNumOperands() && "cannot have a "
           "combination where the number of elements to a "
           "shift and an and are different!");
    for (size_t x = 0, y = AndMaskVec->getNumOperands(); x < y; ++x) {
      ConstantInt *AndCI = dyn_cast<ConstantInt>(AndMaskVec->getOperand(x));
      ConstantInt *ShiftIC = dyn_cast<ConstantInt>(ShrValVec->getOperand(x));
      if (!AndCI || !ShiftIC) {
        return false;
      }
      uint32_t maskVal = (uint32_t)AndCI->getZExtValue();
      if (!isMask_32(maskVal)) {
        return false;
      }
      maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
      uint32_t shiftVal = (uint32_t)ShiftIC->getZExtValue();
      // If the mask or shiftval is greater than the bitcount, then break out.
      if (maskVal >= 32 || shiftVal >= 32) {
        return false;
      }
      // If the mask val is greater than the number of original bits left
      // then this optimization is invalid.
      if (maskVal > (32 - shiftVal)) {
        return false;
      }
      maskVals.push_back(ConstantInt::get(scalarType, maskVal, isSigned));
      shiftVals.push_back(ConstantInt::get(scalarType, shiftVal, isSigned));
    }
    newMaskConst = ConstantVector::get(maskVals);
    shiftValConst = ConstantVector::get(shiftVals);
  } else {
    // Handle the scalar case.
    uint32_t maskVal = (uint32_t)dyn_cast<ConstantInt>(AndMask)->getZExtValue();
    // This must be a mask value where all lower bits are set to 1 and then any
    // bit higher is set to 0.
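    // Illustrative: 0x0000FFFF and 0x1F qualify, but 0x00FF00FF does not.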
    if (!isMask_32(maskVal)) {
      return false;
    }
    maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
    // Count the number of bits set in the mask; this is the width of the
    // resulting bit set that is extracted from the source value.
    uint32_t shiftVal = (uint32_t)dyn_cast<ConstantInt>(ShrVal)->getZExtValue();
    // If the mask or shift val is greater than the bitcount, then break out.
    if (maskVal >= 32 || shiftVal >= 32) {
      return false;
    }
    // If the mask val is greater than the number of original bits left then
    // this optimization is invalid.
    if (maskVal > (32 - shiftVal)) {
      return false;
    }
    newMaskConst = ConstantInt::get(aType, maskVal, isSigned);
    shiftValConst = ConstantInt::get(aType, shiftVal, isSigned);
  }
  // Let's create the function signature.
  std::vector<Type *> callTypes;
  callTypes.push_back(aType);
  callTypes.push_back(aType);
  callTypes.push_back(aType);
  FunctionType *funcType = FunctionType::get(aType, callTypes, false);
  std::string name = "__amdil_ubit_extract";
  if (isVector) {
    name += "_v" + itostr(numEle) + "i32";
  } else {
    name += "_i32";
  }
  // Let's create the function.
  Function *Func =
    dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
                       getOrInsertFunction(llvm::StringRef(name), funcType));
  Value *Operands[3] = {
    newMaskConst,
    shiftValConst,
    ShiftInst->getOperand(0)
  };
  // Let's create the Call with the operands.
  CallInst *CI = CallInst::Create(Func, Operands, "ByteExtractOpt");
  CI->insertBefore(inst);
  inst->replaceAllUsesWith(CI);
  return true;
}
bool
AMDILPeepholeOpt::expandBFI(CallInst *CI) {
  if (!CI || mSTM->calVersion() <= CAL_VERSION_SC_150) {
    return false;
  }
  Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
  if (!LHS->getName().startswith("__amdil_bfi")) {
    return false;
  }
  Type *type = CI->getOperand(0)->getType();
  Constant *negOneConst = NULL;
  if (type->isVectorTy()) {
    std::vector<Constant *> negOneVals;
    negOneConst = ConstantInt::get(CI->getContext(),
        APInt(32, StringRef("-1"), 10));
    for (size_t x = 0,
         y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
      negOneVals.push_back(negOneConst);
    }
    negOneConst = ConstantVector::get(negOneVals);
  } else {
    negOneConst = ConstantInt::get(CI->getContext(),
        APInt(32, StringRef("-1"), 10));
  }
  // __amdil_bfi => (A & B) | (~A & C)
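  // Illustrative: with A = 0xFF00FF00, each result bit is taken from B where
  // the corresponding bit of A is 1, and from C where it is 0.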
  BinaryOperator *lhs =
    BinaryOperator::Create(Instruction::And, CI->getOperand(0),
                           CI->getOperand(1), "bfi_and", CI);
  BinaryOperator *rhs =
    BinaryOperator::Create(Instruction::Xor, CI->getOperand(0), negOneConst,
                           "bfi_not", CI);
  rhs = BinaryOperator::Create(Instruction::And, rhs, CI->getOperand(2),
                               "bfi_and", CI);
  lhs = BinaryOperator::Create(Instruction::Or, lhs, rhs, "bfi_or", CI);
  CI->replaceAllUsesWith(lhs);
  return true;
}
bool
AMDILPeepholeOpt::expandBFM(CallInst *CI) {
  if (!CI || mSTM->calVersion() <= CAL_VERSION_SC_150) {
    return false;
  }
  Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
  if (!LHS->getName().startswith("__amdil_bfm")) {
    return false;
  }
  // __amdil_bfm => ((1 << (src0 & 0x1F)) - 1) << (src1 & 0x1f)
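  // Illustrative: src0 = 8 and src1 = 4 give ((1 << 8) - 1) << 4 = 0xFF0,
  // an 8-bit mask placed at bit offset 4.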
  Constant *newMaskConst = NULL;
  Constant *newShiftConst = NULL;
  Type *type = CI->getOperand(0)->getType();
  if (type->isVectorTy()) {
    std::vector<Constant *> newMaskVals, newShiftVals;
    newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
    newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
    for (size_t x = 0,
         y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
      newMaskVals.push_back(newMaskConst);
      newShiftVals.push_back(newShiftConst);
    }
    newMaskConst = ConstantVector::get(newMaskVals);
    newShiftConst = ConstantVector::get(newShiftVals);
  } else {
    newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
    newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
  }
  BinaryOperator *lhs =
    BinaryOperator::Create(Instruction::And, CI->getOperand(0),
                           newMaskConst, "bfm_mask", CI);
  lhs = BinaryOperator::Create(Instruction::Shl, newShiftConst,
                               lhs, "bfm_shl", CI);
  lhs = BinaryOperator::Create(Instruction::Sub, lhs,
                               newShiftConst, "bfm_sub", CI);
  BinaryOperator *rhs =
    BinaryOperator::Create(Instruction::And, CI->getOperand(1),
                           newMaskConst, "bfm_mask", CI);
  lhs = BinaryOperator::Create(Instruction::Shl, lhs, rhs, "bfm_shl", CI);
  CI->replaceAllUsesWith(lhs);
  return true;
}
bool
AMDILPeepholeOpt::instLevelOptimizations(BasicBlock::iterator *bbb) {
  Instruction *inst = (*bbb);
  if (optimizeCallInst(bbb)) {
    return true;
  }
  if (optimizeBitExtract(inst)) {
    return false;
  }
  if (optimizeBitInsert(inst)) {
    return false;
  }
  if (correctMisalignedMemOp(inst)) {
    return false;
  }
  return false;
}
bool
AMDILPeepholeOpt::correctMisalignedMemOp(Instruction *inst) {
  LoadInst *linst = dyn_cast<LoadInst>(inst);
  StoreInst *sinst = dyn_cast<StoreInst>(inst);
  unsigned alignment;
  Type *Ty = inst->getType();
  if (linst) {
    alignment = linst->getAlignment();
    Ty = inst->getType();
  } else if (sinst) {
    alignment = sinst->getAlignment();
    Ty = sinst->getValueOperand()->getType();
  } else {
    return false;
  }
  unsigned size = getTypeSize(Ty);
  if (size == alignment || size < alignment) {
    return false;
  }
  if (!Ty->isStructTy()) {
    return false;
  }
  if (linst) {
    linst->setAlignment(0);
    return true;
  } else if (sinst) {
    sinst->setAlignment(0);
    return true;
  }
  return false;
}
bool
AMDILPeepholeOpt::isSigned24BitOps(CallInst *CI) {
  if (!CI) {
    return false;
  }
  Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
  std::string namePrefix = LHS->getName().substr(0, 14);
  if (namePrefix != "__amdil_imad24" && namePrefix != "__amdil_imul24"
      && namePrefix != "__amdil__imul24_high") {
    return false;
  }
  if (mSTM->device()->usesHardware(AMDILDeviceInfo::Signed24BitOps)) {
    return false;
  }
  return true;
}
void
AMDILPeepholeOpt::expandSigned24BitOps(CallInst *CI) {
  assert(isSigned24BitOps(CI) && "Must be a "
         "signed 24 bit operation to call this function!");
  Value *LHS = CI->getOperand(CI->getNumOperands()-1);
  // On 7XX and 8XX we do not have signed 24bit, so we need to
  // expand it to the following:
  // imul24 turns into 32bit imul
  // imad24 turns into 32bit imad
  // imul24_high turns into 32bit imulhigh
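  // Illustrative example (not from the original source):
  //   %r = call i32 @__amdil_imul24(i32 %a, i32 %b)
  // is rewritten as
  //   %r = mul i32 %a, %b
  // which produces the same result whenever the operands are valid 24-bit
  // values.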
  if (LHS->getName().substr(0, 14) == "__amdil_imad24") {
    Type *aType = CI->getOperand(0)->getType();
    bool isVector = aType->isVectorTy();
    int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
    std::vector<Type *> callTypes;
    callTypes.push_back(CI->getOperand(0)->getType());
    callTypes.push_back(CI->getOperand(1)->getType());
    callTypes.push_back(CI->getOperand(2)->getType());
    FunctionType *funcType =
      FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
    std::string name = "__amdil_imad";
    if (isVector) {
      name += "_v" + itostr(numEle) + "i32";
    } else {
      name += "_i32";
    }
    Function *Func = dyn_cast<Function>(
        CI->getParent()->getParent()->getParent()->
        getOrInsertFunction(llvm::StringRef(name), funcType));
    Value *Operands[3] = {
      CI->getOperand(0),
      CI->getOperand(1),
      CI->getOperand(2)
    };
    CallInst *nCI = CallInst::Create(Func, Operands, "imad24");
    nCI->insertBefore(CI);
    CI->replaceAllUsesWith(nCI);
  } else if (LHS->getName().substr(0, 14) == "__amdil_imul24") {
    BinaryOperator *mulOp =
      BinaryOperator::Create(Instruction::Mul, CI->getOperand(0),
                             CI->getOperand(1), "imul24", CI);
    CI->replaceAllUsesWith(mulOp);
  } else if (LHS->getName().substr(0, 19) == "__amdil_imul24_high") {
    Type *aType = CI->getOperand(0)->getType();
    bool isVector = aType->isVectorTy();
    int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
    std::vector<Type *> callTypes;
    callTypes.push_back(CI->getOperand(0)->getType());
    callTypes.push_back(CI->getOperand(1)->getType());
    FunctionType *funcType =
      FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
    std::string name = "__amdil_imul_high";
    if (isVector) {
      name += "_v" + itostr(numEle) + "i32";
    } else {
      name += "_i32";
    }
    Function *Func = dyn_cast<Function>(
        CI->getParent()->getParent()->getParent()->
        getOrInsertFunction(llvm::StringRef(name), funcType));
    Value *Operands[2] = {
      CI->getOperand(0),
      CI->getOperand(1)
    };
    CallInst *nCI = CallInst::Create(Func, Operands, "imul24_high");
    nCI->insertBefore(CI);
    CI->replaceAllUsesWith(nCI);
  }
}
bool
AMDILPeepholeOpt::isRWGLocalOpt(CallInst *CI) {
  return (CI != NULL
          && CI->getOperand(CI->getNumOperands() - 1)->getName()
             == "__amdil_get_local_size_int");
}
bool
AMDILPeepholeOpt::convertAccurateDivide(CallInst *CI) {
  if (!CI) {
    return false;
  }
  if (mSTM->device()->getGeneration() == AMDILDeviceInfo::HD6XXX
      && (mSTM->getDeviceName() == "cayman")) {
    return false;
  }
  return CI->getOperand(CI->getNumOperands() - 1)->getName().substr(0, 20)
         == "__amdil_improved_div";
}
void
AMDILPeepholeOpt::expandAccurateDivide(CallInst *CI) {
  assert(convertAccurateDivide(CI)
         && "expanding accurate divide can only happen if it is expandable!");
  BinaryOperator *divOp =
    BinaryOperator::Create(Instruction::FDiv, CI->getOperand(0),
                           CI->getOperand(1), "fdiv32", CI);
  CI->replaceAllUsesWith(divOp);
}
bool
AMDILPeepholeOpt::propagateSamplerInst(CallInst *CI) {
  if (optLevel != CodeGenOpt::None) {
    return false;
  }
  if (!CI) {
    return false;
  }
  unsigned funcNameIdx = 0;
  funcNameIdx = CI->getNumOperands() - 1;
  StringRef calleeName = CI->getOperand(funcNameIdx)->getName();
  if (calleeName != "__amdil_image2d_read_norm"
      && calleeName != "__amdil_image2d_read_unnorm"
      && calleeName != "__amdil_image3d_read_norm"
      && calleeName != "__amdil_image3d_read_unnorm") {
    return false;
  }
  unsigned samplerIdx = 2;
  Value *sampler = CI->getOperand(samplerIdx);
  LoadInst *lInst = dyn_cast<LoadInst>(sampler);
  if (!lInst) {
    return false;
  }
  if (lInst->getPointerAddressSpace() != AMDILAS::PRIVATE_ADDRESS) {
    return false;
  }
  GlobalVariable *gv = dyn_cast<GlobalVariable>(lInst->getPointerOperand());
  // If we are loading from what is not a global value, then we
  // fail and return.
  if (!gv) {
    return false;
  }
  // If we don't have an initializer or we have an initializer and
  // the initializer is not a 32bit integer, we fail.
  if (!gv->hasInitializer()
      || !gv->getInitializer()->getType()->isIntegerTy(32)) {
    return false;
  }
  // Now that we have the global variable initializer, let's replace
  // all uses of the load instruction with the samplerVal and
  // reparse the __amdil_is_constant() function.
  Constant *samplerVal = gv->getInitializer();
  lInst->replaceAllUsesWith(samplerVal);
  return true;
}
bool
AMDILPeepholeOpt::doInitialization(Module &M) {
  return false;
}

bool
AMDILPeepholeOpt::doFinalization(Module &M) {
  return false;
}
void
AMDILPeepholeOpt::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<MachineFunctionAnalysis>();
  FunctionPass::getAnalysisUsage(AU);
  AU.setPreservesAll();
}