1 //===-- AMDILPeepholeOptimizer.cpp - AMDIL peephole optimizations ------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //==-----------------------------------------------------------------------===//
9
10 #define DEBUG_TYPE "PeepholeOpt"
11 #ifdef DEBUG
12 #define DEBUGME (DebugFlag && isCurrentDebugType(DEBUG_TYPE))
13 #else
14 #define DEBUGME 0
15 #endif
16
17 #include "AMDILAlgorithms.tpp"
18 #include "AMDILDevices.h"
19 #include "AMDILUtilityFunctions.h"
20 #include "llvm/ADT/Statistic.h"
21 #include "llvm/ADT/StringExtras.h"
22 #include "llvm/ADT/StringRef.h"
23 #include "llvm/ADT/Twine.h"
24 #include "llvm/CodeGen/MachineFunction.h"
25 #include "llvm/CodeGen/MachineFunctionAnalysis.h"
26 #include "llvm/Function.h"
27 #include "llvm/Instructions.h"
28 #include "llvm/Module.h"
29 #include "llvm/Support/Debug.h"
30 #include "llvm/Support/MathExtras.h"
31
32 #include <sstream>
33
34 #if 0
35 STATISTIC(PointerAssignments, "Number of dynamic pointer "
36 "assigments discovered");
37 STATISTIC(PointerSubtract, "Number of pointer subtractions discovered");
38 #endif
39
40 using namespace llvm;
41 // The Peephole optimization pass is used to do simple last-minute optimizations
42 // that are required for correct code or to remove redundant functions.
43 namespace {
44 class LLVM_LIBRARY_VISIBILITY AMDILPeepholeOpt : public FunctionPass {
45 public:
46 TargetMachine &TM;
47 static char ID;
48 AMDILPeepholeOpt(TargetMachine &tm AMDIL_OPT_LEVEL_DECL);
49 ~AMDILPeepholeOpt();
50 const char *getPassName() const;
51 bool runOnFunction(Function &F);
52 bool doInitialization(Module &M);
53 bool doFinalization(Module &M);
54 void getAnalysisUsage(AnalysisUsage &AU) const;
55 protected:
56 private:
57 // Function to initiate all of the instruction level optimizations.
58 bool instLevelOptimizations(BasicBlock::iterator *inst);
59 // Quick check to see if we need to dump all of the pointers into the
60 // arena. If so, then we set all pointers to exist in the arena. This
61 // is a workaround for aliasing of pointers in a struct/union.
62 bool dumpAllIntoArena(Function &F);
63 // Because I don't want to invalidate any pointers while in
64 // safeNestedForEach, I push atomic conversions to a vector and handle
65 // them later. This function does the conversions if required.
66 void doAtomicConversionIfNeeded(Function &F);
67 // Because __amdil_is_constant cannot be properly evaluated if
68 // optimizations are disabled, the calls are placed in a vector
69 // and evaluated after the __amdil_image* functions are evaluated
70 // which should allow the __amdil_is_constant function to be
71 // evaluated correctly.
72 void doIsConstCallConversionIfNeeded();
73 bool mChanged;
74 bool mDebug;
75 bool mConvertAtomics;
76 CodeGenOpt::Level optLevel;
77 // Run a series of tests to see if we can optimize a CALL instruction.
78 bool optimizeCallInst(BasicBlock::iterator *bbb);
79 // A peephole optimization to optimize bit extract sequences.
80 bool optimizeBitExtract(Instruction *inst);
81 // A peephole optimization to optimize bit insert sequences.
82 bool optimizeBitInsert(Instruction *inst);
83 bool setupBitInsert(Instruction *base,
84 Instruction *&src,
85 Constant *&mask,
86 Constant *&shift);
87 // Expand the bit field insert instruction on versions of OpenCL that
88 // don't support it.
89 bool expandBFI(CallInst *CI);
90 // Expand the bit field mask instruction on versions of OpenCL that
91 // don't support it.
92 bool expandBFM(CallInst *CI);
93 // On 7XX and 8XX hardware, we do not have 24 bit signed operations, so in
94 // this case we need to expand them. These functions check for 24 bit functions
95 // and then expand them.
96 bool isSigned24BitOps(CallInst *CI);
97 void expandSigned24BitOps(CallInst *CI);
98 // One optimization that can occur is that if the required workgroup size is
99 // specified then the result of get_local_size is known at compile time and
100 // can be returned accordingly.
101 bool isRWGLocalOpt(CallInst *CI);
102 // On Northern Islands cards, the division is slightly less accurate than on
103 // previous generations, so we need to utilize a more accurate division there.
104 // On all other cards we can translate the accurate divide to a normal divide.
105 bool convertAccurateDivide(CallInst *CI);
106 void expandAccurateDivide(CallInst *CI);
107 // If the alignment is set incorrectly, it can produce really inefficient
108 // code. This checks for this scenario and fixes it if possible.
109 bool correctMisalignedMemOp(Instruction *inst);
110
111 // If we are in no-opt mode, then we need to make sure that
112 // local samplers are properly propagated, as constant propagation
113 // doesn't occur and we need to know the value of kernel-defined
114 // samplers at compile time.
115 bool propagateSamplerInst(CallInst *CI);
116
117 LLVMContext *mCTX;
118 Function *mF;
119 const AMDILSubtarget *mSTM;
120 SmallVector< std::pair<CallInst *, Function *>, 16> atomicFuncs;
121 SmallVector<CallInst *, 16> isConstVec;
122 }; // class AMDILPeepholeOpt
123 char AMDILPeepholeOpt::ID = 0;
124 } // anonymous namespace
125
126 namespace llvm {
127 FunctionPass *
128 createAMDILPeepholeOpt(TargetMachine &tm AMDIL_OPT_LEVEL_DECL)
129 {
130 return new AMDILPeepholeOpt(tm AMDIL_OPT_LEVEL_VAR);
131 }
132 } // llvm namespace
133
134 AMDILPeepholeOpt::AMDILPeepholeOpt(TargetMachine &tm AMDIL_OPT_LEVEL_DECL)
135 : FunctionPass(ID), TM(tm)
136 {
137 mDebug = DEBUGME;
138 optLevel = TM.getOptLevel();
139
140 }
141
142 AMDILPeepholeOpt::~AMDILPeepholeOpt()
143 {
144 }
145
146 const char *
147 AMDILPeepholeOpt::getPassName() const
148 {
149 return "AMDIL PeepHole Optimization Pass";
150 }
151
152 bool
153 containsPointerType(Type *Ty)
154 {
155 if (!Ty) {
156 return false;
157 }
158 switch(Ty->getTypeID()) {
159 default:
160 return false;
161 case Type::StructTyID: {
162 const StructType *ST = dyn_cast<StructType>(Ty);
163 for (StructType::element_iterator stb = ST->element_begin(),
164 ste = ST->element_end(); stb != ste; ++stb) {
165 if (!containsPointerType(*stb)) {
166 continue;
167 }
168 return true;
169 }
170 break;
171 }
172 case Type::VectorTyID:
173 case Type::ArrayTyID:
174 return containsPointerType(dyn_cast<SequentialType>(Ty)->getElementType());
175 case Type::PointerTyID:
176 return true;
177 };
178 return false;
179 }
180
181 bool
182 AMDILPeepholeOpt::dumpAllIntoArena(Function &F)
183 {
184 bool dumpAll = false;
185 for (Function::const_arg_iterator cab = F.arg_begin(),
186 cae = F.arg_end(); cab != cae; ++cab) {
187 const Argument *arg = cab;
188 const PointerType *PT = dyn_cast<PointerType>(arg->getType());
189 if (!PT) {
190 continue;
191 }
192 Type *DereferencedType = PT->getElementType();
193 if (!dyn_cast<StructType>(DereferencedType)
194 ) {
195 continue;
196 }
197 if (!containsPointerType(DereferencedType)) {
198 continue;
199 }
200 // FIXME: Because a pointer inside of a struct/union may be aliased to
201 // another pointer we need to take the conservative approach and place all
202 // pointers into the arena until more advanced detection is implemented.
203 dumpAll = true;
204 }
205 return dumpAll;
206 }
207 void
208 AMDILPeepholeOpt::doIsConstCallConversionIfNeeded()
209 {
210 if (isConstVec.empty()) {
211 return;
212 }
213 for (unsigned x = 0, y = isConstVec.size(); x < y; ++x) {
214 CallInst *CI = isConstVec[x];
215 Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
216 Type *aType = Type::getInt32Ty(*mCTX);
217 Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
218 : ConstantInt::get(aType, 0);
219 CI->replaceAllUsesWith(Val);
220 CI->eraseFromParent();
221 }
222 isConstVec.clear();
223 }
224 void
225 AMDILPeepholeOpt::doAtomicConversionIfNeeded(Function &F)
226 {
227 // Don't do anything if we don't have any atomic operations.
228 if (atomicFuncs.empty()) {
229 return;
230 }
231 // Change the function name for the atomic if it is required
232 uint32_t size = atomicFuncs.size();
233 for (uint32_t x = 0; x < size; ++x) {
234 atomicFuncs[x].first->setOperand(
235 atomicFuncs[x].first->getNumOperands()-1,
236 atomicFuncs[x].second);
237
238 }
239 mChanged = true;
240 if (mConvertAtomics) {
241 return;
242 }
243 }
244
245 bool
246 AMDILPeepholeOpt::runOnFunction(Function &MF)
247 {
248 mChanged = false;
249 mF = &MF;
250 mSTM = &TM.getSubtarget<AMDILSubtarget>();
251 if (mDebug) {
252 MF.dump();
253 }
254 mCTX = &MF.getType()->getContext();
255 mConvertAtomics = true;
256 safeNestedForEach(MF.begin(), MF.end(), MF.begin()->begin(),
257 std::bind1st(std::mem_fun(&AMDILPeepholeOpt::instLevelOptimizations),
258 this));
259
260 doAtomicConversionIfNeeded(MF);
261 doIsConstCallConversionIfNeeded();
262
263 if (mDebug) {
264 MF.dump();
265 }
266 return mChanged;
267 }
268
269 bool
270 AMDILPeepholeOpt::optimizeCallInst(BasicBlock::iterator *bbb)
271 {
272 Instruction *inst = (*bbb);
273 CallInst *CI = dyn_cast<CallInst>(inst);
274 if (!CI) {
275 return false;
276 }
277 if (isSigned24BitOps(CI)) {
278 expandSigned24BitOps(CI);
279 ++(*bbb);
280 CI->eraseFromParent();
281 return true;
282 }
283 if (propagateSamplerInst(CI)) {
284 return false;
285 }
286 if (expandBFI(CI) || expandBFM(CI)) {
287 ++(*bbb);
288 CI->eraseFromParent();
289 return true;
290 }
291 if (convertAccurateDivide(CI)) {
292 expandAccurateDivide(CI);
293 ++(*bbb);
294 CI->eraseFromParent();
295 return true;
296 }
297
298 StringRef calleeName = CI->getOperand(CI->getNumOperands()-1)->getName();
299 if (calleeName.startswith("__amdil_is_constant")) {
300 // If we do not have optimizations, then this
301 // cannot be properly evaluated, so we add the
302 // call instruction to a vector and process
303 // it at the end of processing, after the
304 // samplers have been correctly handled.
305 if (optLevel == CodeGenOpt::None) {
306 isConstVec.push_back(CI);
307 return false;
308 } else {
309 Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
310 Type *aType = Type::getInt32Ty(*mCTX);
311 Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
312 : ConstantInt::get(aType, 0);
313 CI->replaceAllUsesWith(Val);
314 ++(*bbb);
315 CI->eraseFromParent();
316 return true;
317 }
318 }
319
320 if (calleeName.equals("__amdil_is_asic_id_i32")) {
321 ConstantInt *CV = dyn_cast<ConstantInt>(CI->getOperand(0));
322 Type *aType = Type::getInt32Ty(*mCTX);
323 Value *Val = CV;
324 if (Val) {
325 Val = ConstantInt::get(aType,
326 mSTM->device()->getDeviceFlag() & CV->getZExtValue());
327 } else {
328 Val = ConstantInt::get(aType, 0);
329 }
330 CI->replaceAllUsesWith(Val);
331 ++(*bbb);
332 CI->eraseFromParent();
333 return true;
334 }
335 Function *F = dyn_cast<Function>(CI->getOperand(CI->getNumOperands()-1));
336 if (!F) {
337 return false;
338 }
339 if (F->getName().startswith("__atom") && !CI->getNumUses()
340 && F->getName().find("_xchg") == StringRef::npos) {
341 std::string buffer(F->getName().str() + "_noret");
342 F = dyn_cast<Function>(
343 F->getParent()->getOrInsertFunction(buffer, F->getFunctionType()));
344 atomicFuncs.push_back(std::make_pair(CI, F));
345 }
346
347 if (!mSTM->device()->isSupported(AMDILDeviceInfo::ArenaSegment)
348 && !mSTM->device()->isSupported(AMDILDeviceInfo::MultiUAV)) {
349 return false;
350 }
351 if (!mConvertAtomics) {
352 return false;
353 }
354 StringRef name = F->getName();
355 if (name.startswith("__atom") && name.find("_g") != StringRef::npos) {
356 mConvertAtomics = false;
357 }
358 return false;
359 }
360
361 bool
362 AMDILPeepholeOpt::setupBitInsert(Instruction *base,
363 Instruction *&src,
364 Constant *&mask,
365 Constant *&shift)
366 {
367 if (!base) {
368 if (mDebug) {
369 dbgs() << "Null pointer passed into function.\n";
370 }
371 return false;
372 }
373 bool andOp = false;
374 if (base->getOpcode() == Instruction::Shl) {
375 shift = dyn_cast<Constant>(base->getOperand(1));
376 } else if (base->getOpcode() == Instruction::And) {
377 mask = dyn_cast<Constant>(base->getOperand(1));
378 andOp = true;
379 } else {
380 if (mDebug) {
381 dbgs() << "Failed setup with no Shl or And instruction on base opcode!\n";
382 }
383 // If the base is neither a Shl nor an And, we don't fit any of the patterns above.
384 return false;
385 }
386 src = dyn_cast<Instruction>(base->getOperand(0));
387 if (!src) {
388 if (mDebug) {
389 dbgs() << "Failed setup since the base operand is not an instruction!\n";
390 }
391 return false;
392 }
393 // If we find an 'and' operation, then we don't need to
394 // find the next operation as we already know the
395 // bits that are valid at this point.
396 if (andOp) {
397 return true;
398 }
399 if (src->getOpcode() == Instruction::Shl && !shift) {
400 shift = dyn_cast<Constant>(src->getOperand(1));
401 src = dyn_cast<Instruction>(src->getOperand(0));
402 } else if (src->getOpcode() == Instruction::And && !mask) {
403 mask = dyn_cast<Constant>(src->getOperand(1));
404 }
405 if (!mask && !shift) {
406 if (mDebug) {
407 dbgs() << "Failed setup since both mask and shift are NULL!\n";
408 }
409 // Did not find a constant mask or a shift.
410 return false;
411 }
412 return true;
413 }
414 bool
415 AMDILPeepholeOpt::optimizeBitInsert(Instruction *inst)
416 {
417 if (!inst) {
418 return false;
419 }
420 if (!inst->isBinaryOp()) {
421 return false;
422 }
423 if (inst->getOpcode() != Instruction::Or) {
424 return false;
425 }
426 if (optLevel == CodeGenOpt::None) {
427 return false;
428 }
429 // We want to do an optimization on a sequence of ops that in the end equals a
430 // single ISA instruction.
431 // The base pattern for this optimization is - ((A & B) << C) | ((D & E) << F)
432 // Some simplified versions of this pattern are as follows:
433 // (A & B) | (D & E) when B & E == 0 && C == 0 && F == 0
434 // ((A & B) << C) | (D & E) when B ^ E == 0 && (1 << C) >= E
435 // (A & B) | ((D & E) << F) when B ^ E == 0 && (1 << F) >= B
436 // (A & B) | (D << F) when (1 << F) >= B
437 // (A << C) | (D & E) when (1 << C) >= E
438 if (mSTM->device()->getGeneration() == AMDILDeviceInfo::HD4XXX) {
439 // The HD4XXX hardware doesn't support the ubit_insert instruction.
440 return false;
441 }
442 Type *aType = inst->getType();
443 bool isVector = aType->isVectorTy();
444 int numEle = 1;
445 // This optimization only works on 32bit integers.
446 if (aType->getScalarType()
447 != Type::getInt32Ty(inst->getContext())) {
448 return false;
449 }
450 if (isVector) {
451 const VectorType *VT = dyn_cast<VectorType>(aType);
452 numEle = VT->getNumElements();
453 // We currently cannot support more than 4 elements in an intrinsic and we
454 // cannot support Vec3 types.
455 if (numEle > 4 || numEle == 3) {
456 return false;
457 }
458 }
459 // TODO: Handle vectors.
460 if (isVector) {
461 if (mDebug) {
462 dbgs() << "!!! Vectors are not supported yet!\n";
463 }
464 return false;
465 }
466 Instruction *LHSSrc = NULL, *RHSSrc = NULL;
467 Constant *LHSMask = NULL, *RHSMask = NULL;
468 Constant *LHSShift = NULL, *RHSShift = NULL;
469 Instruction *LHS = dyn_cast<Instruction>(inst->getOperand(0));
470 Instruction *RHS = dyn_cast<Instruction>(inst->getOperand(1));
471 if (!setupBitInsert(LHS, LHSSrc, LHSMask, LHSShift)) {
472 if (mDebug) {
473 dbgs() << "Found an OR Operation that failed setup!\n";
474 inst->dump();
475 if (LHS) { LHS->dump(); }
476 if (LHSSrc) { LHSSrc->dump(); }
477 if (LHSMask) { LHSMask->dump(); }
478 if (LHSShift) { LHSShift->dump(); }
479 }
480 // There was an issue with the setup for BitInsert.
481 return false;
482 }
483 if (!setupBitInsert(RHS, RHSSrc, RHSMask, RHSShift)) {
484 if (mDebug) {
485 dbgs() << "Found an OR Operation that failed setup!\n";
486 inst->dump();
487 if (RHS) { RHS->dump(); }
488 if (RHSSrc) { RHSSrc->dump(); }
489 if (RHSMask) { RHSMask->dump(); }
490 if (RHSShift) { RHSShift->dump(); }
491 }
492 // There was an issue with the setup for BitInsert.
493 return false;
494 }
495 if (mDebug) {
496 dbgs() << "Found an OR operation that can possible be optimized to ubit insert!\n";
497 dbgs() << "Op: "; inst->dump();
498 dbgs() << "LHS: "; if (LHS) { LHS->dump(); } else { dbgs() << "(None)\n"; }
499 dbgs() << "LHS Src: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(None)\n"; }
500 dbgs() << "LHS Mask: "; if (LHSMask) { LHSMask->dump(); } else { dbgs() << "(None)\n"; }
501 dbgs() << "LHS Shift: "; if (LHSShift) { LHSShift->dump(); } else { dbgs() << "(None)\n"; }
502 dbgs() << "RHS: "; if (RHS) { RHS->dump(); } else { dbgs() << "(None)\n"; }
503 dbgs() << "RHS Src: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(None)\n"; }
504 dbgs() << "RHS Mask: "; if (RHSMask) { RHSMask->dump(); } else { dbgs() << "(None)\n"; }
505 dbgs() << "RHS Shift: "; if (RHSShift) { RHSShift->dump(); } else { dbgs() << "(None)\n"; }
506 }
507 Constant *offset = NULL;
508 Constant *width = NULL;
509 int32_t lhsMaskVal = 0, rhsMaskVal = 0;
510 int32_t lhsShiftVal = 0, rhsShiftVal = 0;
511 int32_t lhsMaskWidth = 0, rhsMaskWidth = 0;
512 int32_t lhsMaskOffset = 0, rhsMaskOffset = 0;
513 lhsMaskVal = (int32_t)(LHSMask
514 ? dyn_cast<ConstantInt>(LHSMask)->getZExtValue() : 0);
515 rhsMaskVal = (int32_t)(RHSMask
516 ? dyn_cast<ConstantInt>(RHSMask)->getZExtValue() : 0);
517 lhsShiftVal = (int32_t)(LHSShift
518 ? dyn_cast<ConstantInt>(LHSShift)->getZExtValue() : 0);
519 rhsShiftVal = (int32_t)(RHSShift
520 ? dyn_cast<ConstantInt>(RHSShift)->getZExtValue() : 0);
521 lhsMaskWidth = lhsMaskVal ? CountPopulation_32(lhsMaskVal) : 32 - lhsShiftVal;
522 rhsMaskWidth = rhsMaskVal ? CountPopulation_32(rhsMaskVal) : 32 - rhsShiftVal;
523 lhsMaskOffset = lhsMaskVal ? CountTrailingZeros_32(lhsMaskVal) : lhsShiftVal;
524 rhsMaskOffset = rhsMaskVal ? CountTrailingZeros_32(rhsMaskVal) : rhsShiftVal;
525 // TODO: Handle the case of A & B | D & ~B (i.e. inverted masks).
526 if (mDebug) {
527 dbgs() << "Found pattern: \'((A" << (LHSMask ? " & B)" : ")");
528 dbgs() << (LHSShift ? " << C)" : ")") << " | ((D" ;
529 dbgs() << (RHSMask ? " & E)" : ")");
530 dbgs() << (RHSShift ? " << F)\'\n" : ")\'\n");
531 dbgs() << "A = LHSSrc\t\tD = RHSSrc \n";
532 dbgs() << "B = " << lhsMaskVal << "\t\tE = " << rhsMaskVal << "\n";
533 dbgs() << "C = " << lhsShiftVal << "\t\tF = " << rhsShiftVal << "\n";
534 dbgs() << "width(B) = " << lhsMaskWidth;
535 dbgs() << "\twidth(E) = " << rhsMaskWidth << "\n";
536 dbgs() << "offset(B) = " << lhsMaskOffset;
537 dbgs() << "\toffset(E) = " << rhsMaskOffset << "\n";
538 dbgs() << "Constraints: \n";
539 dbgs() << "\t(1) B ^ E == 0\n";
540 dbgs() << "\t(2-LHS) B is a mask\n";
541 dbgs() << "\t(2-RHS) E is a mask\n";
542 dbgs() << "\t(3-LHS) (offset(B)) >= (width(E) + offset(E))\n";
543 dbgs() << "\t(3-RHS) (offset(E)) >= (width(B) + offset(B))\n";
544 }
545 if ((lhsMaskVal || rhsMaskVal) && !(lhsMaskVal ^ rhsMaskVal)) {
546 if (mDebug) {
547 dbgs() << lhsMaskVal << " ^ " << rhsMaskVal;
548 dbgs() << " = " << (lhsMaskVal ^ rhsMaskVal) << "\n";
549 dbgs() << "Failed constraint 1!\n";
550 }
551 return false;
552 }
553 if (mDebug) {
554 dbgs() << "LHS = " << lhsMaskOffset << "";
555 dbgs() << " >= (" << rhsMaskWidth << " + " << rhsMaskOffset << ") = ";
556 dbgs() << (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset));
557 dbgs() << "\nRHS = " << rhsMaskOffset << "";
558 dbgs() << " >= (" << lhsMaskWidth << " + " << lhsMaskOffset << ") = ";
559 dbgs() << (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset));
560 dbgs() << "\n";
561 }
562 if (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset)) {
563 offset = ConstantInt::get(aType, lhsMaskOffset, false);
564 width = ConstantInt::get(aType, lhsMaskWidth, false);
565 RHSSrc = RHS;
566 if (!isMask_32(lhsMaskVal) && !isShiftedMask_32(lhsMaskVal)) {
567 if (mDebug) {
568 dbgs() << "Value is not a Mask: " << lhsMaskVal << "\n";
569 dbgs() << "Failed constraint 2!\n";
570 }
571 return false;
572 }
573 if (!LHSShift) {
574 LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
575 "MaskShr", LHS);
576 } else if (lhsShiftVal != lhsMaskOffset) {
577 LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
578 "MaskShr", LHS);
579 }
580 if (mDebug) {
581 dbgs() << "Optimizing LHS!\n";
582 }
583 } else if (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset)) {
584 offset = ConstantInt::get(aType, rhsMaskOffset, false);
585 width = ConstantInt::get(aType, rhsMaskWidth, false);
586 LHSSrc = RHSSrc;
587 RHSSrc = LHS;
588 if (!isMask_32(rhsMaskVal) && !isShiftedMask_32(rhsMaskVal)) {
589 if (mDebug) {
590 dbgs() << "Non-Mask: " << rhsMaskVal << "\n";
591 dbgs() << "Failed constraint 2!\n";
592 }
593 return false;
594 }
595 if (!RHSShift) {
596 LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
597 "MaskShr", RHS);
598 } else if (rhsShiftVal != rhsMaskOffset) {
599 LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
600 "MaskShr", RHS);
601 }
602 if (mDebug) {
603 dbgs() << "Optimizing RHS!\n";
604 }
605 } else {
606 if (mDebug) {
607 dbgs() << "Failed constraint 3!\n";
608 }
609 return false;
610 }
611 if (mDebug) {
612 dbgs() << "Width: "; if (width) { width->dump(); } else { dbgs() << "(0)\n"; }
613 dbgs() << "Offset: "; if (offset) { offset->dump(); } else { dbgs() << "(0)\n"; }
614 dbgs() << "LHSSrc: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(0)\n"; }
615 dbgs() << "RHSSrc: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(0)\n"; }
616 }
617 if (!offset || !width) {
618 if (mDebug) {
619 dbgs() << "Either width or offset are NULL, failed detection!\n";
620 }
621 return false;
622 }
623 // Let's create the function signature.
624 std::vector<Type *> callTypes;
625 callTypes.push_back(aType);
626 callTypes.push_back(aType);
627 callTypes.push_back(aType);
628 callTypes.push_back(aType);
629 FunctionType *funcType = FunctionType::get(aType, callTypes, false);
630 std::string name = "__amdil_ubit_insert";
631 if (isVector) { name += "_v" + itostr(numEle) + "u32"; } else { name += "_u32"; }
632 Function *Func =
633 dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
634 getOrInsertFunction(llvm::StringRef(name), funcType));
635 Value *Operands[4] = {
636 width,
637 offset,
638 LHSSrc,
639 RHSSrc
640 };
641 CallInst *CI = CallInst::Create(Func, Operands, "BitInsertOpt");
642 if (mDebug) {
643 dbgs() << "Old Inst: ";
644 inst->dump();
645 dbgs() << "New Inst: ";
646 CI->dump();
647 dbgs() << "\n\n";
648 }
649 CI->insertBefore(inst);
650 inst->replaceAllUsesWith(CI);
651 return true;
652 }
653
654 bool
655 AMDILPeepholeOpt::optimizeBitExtract(Instruction *inst)
656 {
657 if (!inst) {
658 return false;
659 }
660 if (!inst->isBinaryOp()) {
661 return false;
662 }
663 if (inst->getOpcode() != Instruction::And) {
664 return false;
665 }
666 if (optLevel == CodeGenOpt::None) {
667 return false;
668 }
669 // We want to do some simple optimizations on Shift right/And patterns. The
670 // basic optimization is to turn (A >> B) & C where A is a 32bit type, B is a
671 // value smaller than 32 and C is a mask. If C is a constant value, then the
672 // following transformation can occur. For signed integers, it turns into the
673 // function call dst = __amdil_ibit_extract(log2(C), B, A) For unsigned
674 // integers, it turns into the function call dst =
675 // __amdil_ubit_extract(log2(C), B, A) The function __amdil_[u|i]bit_extract
676 // can be found in Section 7.9 of the ATI IL spec of the stream SDK for
677 // Evergreen hardware.
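// For example, the scalar expression '(A >> 3) & 0xFF' has a mask with 8
// trailing ones and a shift value of 3, so it is rewritten below as
//   __amdil_ubit_extract_i32(8, 3, A)
// i.e. extract an 8 bit wide field starting at bit 3 of A.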
678 if (mSTM->device()->getGeneration() == AMDILDeviceInfo::HD4XXX) {
679 // This does not work on HD4XXX hardware.
680 return false;
681 }
682 Type *aType = inst->getType();
683 bool isVector = aType->isVectorTy();
684 int numEle = 1;
685 // This only works on 32bit integers
686 if (aType->getScalarType()
687 != Type::getInt32Ty(inst->getContext())) {
688 return false;
689 }
690 if (isVector) {
691 const VectorType *VT = dyn_cast<VectorType>(aType);
692 numEle = VT->getNumElements();
693 // We currently cannot support more than 4 elements in an intrinsic and we
694 // cannot support Vec3 types.
695 if (numEle > 4 || numEle == 3) {
696 return false;
697 }
698 }
699 BinaryOperator *ShiftInst = dyn_cast<BinaryOperator>(inst->getOperand(0));
700 // If the first operand is not a shift instruction, then we can return as it
701 // doesn't match this pattern.
702 if (!ShiftInst || !ShiftInst->isShift()) {
703 return false;
704 }
705 // If this is a shift left, then we don't match this pattern.
706 if (ShiftInst->getOpcode() == Instruction::Shl) {
707 return false;
708 }
709 bool isSigned = ShiftInst->isArithmeticShift();
710 Constant *AndMask = dyn_cast<Constant>(inst->getOperand(1));
711 Constant *ShrVal = dyn_cast<Constant>(ShiftInst->getOperand(1));
712 // Let's make sure that the shift value and the and-mask are constant integers.
713 if (!AndMask || !ShrVal) {
714 return false;
715 }
716 Constant *newMaskConst;
717 Constant *shiftValConst;
718 if (isVector) {
719 // Handle the vector case
720 std::vector<Constant *> maskVals;
721 std::vector<Constant *> shiftVals;
722 ConstantVector *AndMaskVec = dyn_cast<ConstantVector>(AndMask);
723 ConstantVector *ShrValVec = dyn_cast<ConstantVector>(ShrVal);
724 Type *scalarType = AndMaskVec->getType()->getScalarType();
725 assert(AndMaskVec->getNumOperands() ==
726 ShrValVec->getNumOperands() && "cannot have a "
727 "combination where the number of elements to a "
728 "shift and an and are different!");
729 for (size_t x = 0, y = AndMaskVec->getNumOperands(); x < y; ++x) {
730 ConstantInt *AndCI = dyn_cast<ConstantInt>(AndMaskVec->getOperand(x));
731 ConstantInt *ShiftIC = dyn_cast<ConstantInt>(ShrValVec->getOperand(x));
732 if (!AndCI || !ShiftIC) {
733 return false;
734 }
735 uint32_t maskVal = (uint32_t)AndCI->getZExtValue();
736 if (!isMask_32(maskVal)) {
737 return false;
738 }
739 maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
740 uint32_t shiftVal = (uint32_t)ShiftIC->getZExtValue();
741 // If the mask or shiftval is greater than the bitcount, then break out.
742 if (maskVal >= 32 || shiftVal >= 32) {
743 return false;
744 }
745 // If the mask val is greater than the number of original bits left
746 // then this optimization is invalid.
747 if (maskVal > (32 - shiftVal)) {
748 return false;
749 }
750 maskVals.push_back(ConstantInt::get(scalarType, maskVal, isSigned));
751 shiftVals.push_back(ConstantInt::get(scalarType, shiftVal, isSigned));
752 }
753 newMaskConst = ConstantVector::get(maskVals);
754 shiftValConst = ConstantVector::get(shiftVals);
755 } else {
756 // Handle the scalar case
757 uint32_t maskVal = (uint32_t)dyn_cast<ConstantInt>(AndMask)->getZExtValue();
758 // This must be a mask value where all lower bits are set to 1 and then any
759 // bit higher is set to 0.
760 if (!isMask_32(maskVal)) {
761 return false;
762 }
763 maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
764 // Count the number of bits set in the mask; this is the width of the
765 // resulting bit set that is extracted from the source value.
766 uint32_t shiftVal = (uint32_t)dyn_cast<ConstantInt>(ShrVal)->getZExtValue();
767 // If the mask or shift val is greater than the bitcount, then break out.
768 if (maskVal >= 32 || shiftVal >= 32) {
769 return false;
770 }
771 // If the mask val is greater than the number of original bits left then
772 // this optimization is invalid.
773 if (maskVal > (32 - shiftVal)) {
774 return false;
775 }
776 newMaskConst = ConstantInt::get(aType, maskVal, isSigned);
777 shiftValConst = ConstantInt::get(aType, shiftVal, isSigned);
778 }
779 // Let's create the function signature.
780 std::vector<Type *> callTypes;
781 callTypes.push_back(aType);
782 callTypes.push_back(aType);
783 callTypes.push_back(aType);
784 FunctionType *funcType = FunctionType::get(aType, callTypes, false);
785 std::string name = "__amdil_ubit_extract";
786 if (isVector) {
787 name += "_v" + itostr(numEle) + "i32";
788 } else {
789 name += "_i32";
790 }
791 // Let's create the function.
792 Function *Func =
793 dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
794 getOrInsertFunction(llvm::StringRef(name), funcType));
795 Value *Operands[3] = {
796 newMaskConst,
797 shiftValConst,
798 ShiftInst->getOperand(0)
799 };
800 // Let's create the Call with the operands
801 CallInst *CI = CallInst::Create(Func, Operands, "ByteExtractOpt");
802 CI->insertBefore(inst);
803 inst->replaceAllUsesWith(CI);
804 return true;
805 }
806
807 bool
808 AMDILPeepholeOpt::expandBFI(CallInst *CI)
809 {
810 if (!CI || mSTM->calVersion() <= CAL_VERSION_SC_150) {
811 return false;
812 }
813 Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
814 if (!LHS->getName().startswith("__amdil_bfi")) {
815 return false;
816 }
817 Type* type = CI->getOperand(0)->getType();
818 Constant *negOneConst = NULL;
819 if (type->isVectorTy()) {
820 std::vector<Constant *> negOneVals;
821 negOneConst = ConstantInt::get(CI->getContext(),
822 APInt(32, StringRef("-1"), 10));
823 for (size_t x = 0,
824 y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
825 negOneVals.push_back(negOneConst);
826 }
827 negOneConst = ConstantVector::get(negOneVals);
828 } else {
829 negOneConst = ConstantInt::get(CI->getContext(),
830 APInt(32, StringRef("-1"), 10));
831 }
832 // __amdil_bfi => (A & B) | (~A & C)
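// For example, bfi(0x0000FF00, x, y) computes
//   (x & 0x0000FF00) | (y & ~0x0000FF00)
// i.e. it takes bits 8..15 from x and all remaining bits from y.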
833 BinaryOperator *lhs =
834 BinaryOperator::Create(Instruction::And, CI->getOperand(0),
835 CI->getOperand(1), "bfi_and", CI);
836 BinaryOperator *rhs =
837 BinaryOperator::Create(Instruction::Xor, CI->getOperand(0), negOneConst,
838 "bfi_not", CI);
839 rhs = BinaryOperator::Create(Instruction::And, rhs, CI->getOperand(2),
840 "bfi_and", CI);
841 lhs = BinaryOperator::Create(Instruction::Or, lhs, rhs, "bfi_or", CI);
842 CI->replaceAllUsesWith(lhs);
843 return true;
844 }
845
846 bool
847 AMDILPeepholeOpt::expandBFM(CallInst *CI)
848 {
849 if (!CI || mSTM->calVersion() <= CAL_VERSION_SC_150) {
850 return false;
851 }
852 Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
853 if (!LHS->getName().startswith("__amdil_bfm")) {
854 return false;
855 }
856 // __amdil_bfm => ((1 << (src0 & 0x1F)) - 1) << (src1 & 0x1f)
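// For example, bfm(8, 4) = ((1 << 8) - 1) << 4 = 0x00000FF0, i.e. an 8 bit
// mask placed at bit offset 4.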
857 Constant *newMaskConst = NULL;
858 Constant *newShiftConst = NULL;
859 Type* type = CI->getOperand(0)->getType();
860 if (type->isVectorTy()) {
861 std::vector<Constant*> newMaskVals, newShiftVals;
862 newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
863 newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
864 for (size_t x = 0,
865 y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
866 newMaskVals.push_back(newMaskConst);
867 newShiftVals.push_back(newShiftConst);
868 }
869 newMaskConst = ConstantVector::get(newMaskVals);
870 newShiftConst = ConstantVector::get(newShiftVals);
871 } else {
872 newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
873 newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
874 }
875 BinaryOperator *lhs =
876 BinaryOperator::Create(Instruction::And, CI->getOperand(0),
877 newMaskConst, "bfm_mask", CI);
878 lhs = BinaryOperator::Create(Instruction::Shl, newShiftConst,
879 lhs, "bfm_shl", CI);
880 lhs = BinaryOperator::Create(Instruction::Sub, lhs,
881 newShiftConst, "bfm_sub", CI);
882 BinaryOperator *rhs =
883 BinaryOperator::Create(Instruction::And, CI->getOperand(1),
884 newMaskConst, "bfm_mask", CI);
885 lhs = BinaryOperator::Create(Instruction::Shl, lhs, rhs, "bfm_shl", CI);
886 CI->replaceAllUsesWith(lhs);
887 return true;
888 }
889
890 bool
891 AMDILPeepholeOpt::instLevelOptimizations(BasicBlock::iterator *bbb)
892 {
893 Instruction *inst = (*bbb);
894 if (optimizeCallInst(bbb)) {
895 return true;
896 }
897 if (optimizeBitExtract(inst)) {
898 return false;
899 }
900 if (optimizeBitInsert(inst)) {
901 return false;
902 }
903 if (correctMisalignedMemOp(inst)) {
904 return false;
905 }
906 return false;
907 }
908 bool
909 AMDILPeepholeOpt::correctMisalignedMemOp(Instruction *inst)
910 {
911 LoadInst *linst = dyn_cast<LoadInst>(inst);
912 StoreInst *sinst = dyn_cast<StoreInst>(inst);
913 unsigned alignment;
914 Type* Ty = inst->getType();
915 if (linst) {
916 alignment = linst->getAlignment();
917 Ty = inst->getType();
918 } else if (sinst) {
919 alignment = sinst->getAlignment();
920 Ty = sinst->getValueOperand()->getType();
921 } else {
922 return false;
923 }
924 unsigned size = getTypeSize(Ty);
925 if (size == alignment || size < alignment) {
926 return false;
927 }
928 if (!Ty->isStructTy()) {
929 return false;
930 }
931 if (alignment < 4) {
932 if (linst) {
933 linst->setAlignment(0);
934 return true;
935 } else if (sinst) {
936 sinst->setAlignment(0);
937 return true;
938 }
939 }
940 return false;
941 }
942 bool
943 AMDILPeepholeOpt::isSigned24BitOps(CallInst *CI)
944 {
945 if (!CI) {
946 return false;
947 }
948 Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
949 std::string namePrefix = LHS->getName().substr(0, 14);
950 if (namePrefix != "__amdil_imad24" && namePrefix != "__amdil_imul24"
951 && namePrefix != "__amdil__imul24_high") {
952 return false;
953 }
954 if (mSTM->device()->usesHardware(AMDILDeviceInfo::Signed24BitOps)) {
955 return false;
956 }
957 return true;
958 }
959
960 void
961 AMDILPeepholeOpt::expandSigned24BitOps(CallInst *CI)
962 {
963 assert(isSigned24BitOps(CI) && "Must be a "
964 "signed 24 bit operation to call this function!");
965 Value *LHS = CI->getOperand(CI->getNumOperands()-1);
966 // On 7XX and 8XX we do not have signed 24bit, so we need to
967 // expand it to the following:
968 // imul24 turns into 32bit imul
969 // imad24 turns into 32bit imad
970 // imul24_high turns into 32bit imulhigh
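// For example, a scalar call whose callee name starts with __amdil_imad24 is
// replaced below with a call to __amdil_imad_i32 on the same operands, and a
// scalar __amdil_imul24 call becomes a plain 32 bit 'mul' instruction.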
971 if (LHS->getName().substr(0, 14) == "__amdil_imad24") {
972 Type *aType = CI->getOperand(0)->getType();
973 bool isVector = aType->isVectorTy();
974 int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
975 std::vector<Type*> callTypes;
976 callTypes.push_back(CI->getOperand(0)->getType());
977 callTypes.push_back(CI->getOperand(1)->getType());
978 callTypes.push_back(CI->getOperand(2)->getType());
979 FunctionType *funcType =
980 FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
981 std::string name = "__amdil_imad";
982 if (isVector) {
983 name += "_v" + itostr(numEle) + "i32";
984 } else {
985 name += "_i32";
986 }
987 Function *Func = dyn_cast<Function>(
988 CI->getParent()->getParent()->getParent()->
989 getOrInsertFunction(llvm::StringRef(name), funcType));
990 Value *Operands[3] = {
991 CI->getOperand(0),
992 CI->getOperand(1),
993 CI->getOperand(2)
994 };
995 CallInst *nCI = CallInst::Create(Func, Operands, "imad24");
996 nCI->insertBefore(CI);
997 CI->replaceAllUsesWith(nCI);
998 } else if (LHS->getName().substr(0, 14) == "__amdil_imul24") {
999 BinaryOperator *mulOp =
1000 BinaryOperator::Create(Instruction::Mul, CI->getOperand(0),
1001 CI->getOperand(1), "imul24", CI);
1002 CI->replaceAllUsesWith(mulOp);
1003 } else if (LHS->getName().substr(0, 19) == "__amdil_imul24_high") {
1004 Type *aType = CI->getOperand(0)->getType();
1005
1006 bool isVector = aType->isVectorTy();
1007 int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
1008 std::vector<Type*> callTypes;
1009 callTypes.push_back(CI->getOperand(0)->getType());
1010 callTypes.push_back(CI->getOperand(1)->getType());
1011 FunctionType *funcType =
1012 FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
1013 std::string name = "__amdil_imul_high";
1014 if (isVector) {
1015 name += "_v" + itostr(numEle) + "i32";
1016 } else {
1017 name += "_i32";
1018 }
1019 Function *Func = dyn_cast<Function>(
1020 CI->getParent()->getParent()->getParent()->
1021 getOrInsertFunction(llvm::StringRef(name), funcType));
1022 Value *Operands[2] = {
1023 CI->getOperand(0),
1024 CI->getOperand(1)
1025 };
1026 CallInst *nCI = CallInst::Create(Func, Operands, "imul24_high");
1027 nCI->insertBefore(CI);
1028 CI->replaceAllUsesWith(nCI);
1029 }
1030 }
1031
1032 bool
1033 AMDILPeepholeOpt::isRWGLocalOpt(CallInst *CI)
1034 {
1035 return (CI != NULL
1036 && CI->getOperand(CI->getNumOperands() - 1)->getName()
1037 == "__amdil_get_local_size_int");
1038 }
1039
1040 bool
1041 AMDILPeepholeOpt::convertAccurateDivide(CallInst *CI)
1042 {
1043 if (!CI) {
1044 return false;
1045 }
1046 if (mSTM->device()->getGeneration() == AMDILDeviceInfo::HD6XXX
1047 && (mSTM->getDeviceName() == "cayman")) {
1048 return false;
1049 }
1050 return CI->getOperand(CI->getNumOperands() - 1)->getName().substr(0, 20)
1051 == "__amdil_improved_div";
1052 }
1053
1054 void
1055 AMDILPeepholeOpt::expandAccurateDivide(CallInst *CI)
1056 {
1057 assert(convertAccurateDivide(CI)
1058 && "expanding accurate divide can only happen if it is expandable!");
1059 BinaryOperator *divOp =
1060 BinaryOperator::Create(Instruction::FDiv, CI->getOperand(0),
1061 CI->getOperand(1), "fdiv32", CI);
1062 CI->replaceAllUsesWith(divOp);
1063 }
1064
1065 bool
1066 AMDILPeepholeOpt::propagateSamplerInst(CallInst *CI)
1067 {
1068 if (optLevel != CodeGenOpt::None) {
1069 return false;
1070 }
1071
1072 if (!CI) {
1073 return false;
1074 }
1075
1076 unsigned funcNameIdx = 0;
1077 funcNameIdx = CI->getNumOperands() - 1;
1078 StringRef calleeName = CI->getOperand(funcNameIdx)->getName();
1079 if (calleeName != "__amdil_image2d_read_norm"
1080 && calleeName != "__amdil_image2d_read_unnorm"
1081 && calleeName != "__amdil_image3d_read_norm"
1082 && calleeName != "__amdil_image3d_read_unnorm") {
1083 return false;
1084 }
1085
1086 unsigned samplerIdx = 2;
1087 samplerIdx = 1;
1088 Value *sampler = CI->getOperand(samplerIdx);
1089 LoadInst *lInst = dyn_cast<LoadInst>(sampler);
1090 if (!lInst) {
1091 return false;
1092 }
1093
1094 if (lInst->getPointerAddressSpace() != AMDILAS::PRIVATE_ADDRESS) {
1095 return false;
1096 }
1097
1098 GlobalVariable *gv = dyn_cast<GlobalVariable>(lInst->getPointerOperand());
1099 // If we are loading from what is not a global value, then we
1100 // fail and return.
1101 if (!gv) {
1102 return false;
1103 }
1104
1105 // If we don't have an initializer or we have an initializer and
1106 // the initializer is not a 32bit integer, we fail.
1107 if (!gv->hasInitializer()
1108 || !gv->getInitializer()->getType()->isIntegerTy(32)) {
1109 return false;
1110 }
1111
1112 // Now that we have the global variable initializer, let's replace
1113 // all uses of the load instruction with the samplerVal and
1114 // reparse the __amdil_is_constant() function.
1115 Constant *samplerVal = gv->getInitializer();
1116 lInst->replaceAllUsesWith(samplerVal);
1117 return true;
1118 }
1119
1120 bool
1121 AMDILPeepholeOpt::doInitialization(Module &M)
1122 {
1123 return false;
1124 }
1125
1126 bool
1127 AMDILPeepholeOpt::doFinalization(Module &M)
1128 {
1129 return false;
1130 }
1131
1132 void
1133 AMDILPeepholeOpt::getAnalysisUsage(AnalysisUsage &AU) const
1134 {
1135 AU.addRequired<MachineFunctionAnalysis>();
1136 FunctionPass::getAnalysisUsage(AU);
1137 AU.setPreservesAll();
1138 }