src/gallium/drivers/radeon/AMDILPeepholeOptimizer.cpp

   1 //===-- AMDILPeepholeOptimizer.cpp - AMDIL Peephole optimizations ---------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //==-----------------------------------------------------------------------===//
   9
  10 #include "AMDILDevices.h"
  11 #include "AMDGPUInstrInfo.h"
  12 #include "llvm/ADT/Statistic.h"
  13 #include "llvm/ADT/StringExtras.h"
  14 #include "llvm/ADT/StringRef.h"
  15 #include "llvm/ADT/Twine.h"
  16 #include "llvm/Constants.h"
  17 #include "llvm/CodeGen/MachineFunction.h"
  18 #include "llvm/CodeGen/MachineFunctionAnalysis.h"
  19 #include "llvm/Function.h"
  20 #include "llvm/Instructions.h"
  21 #include "llvm/Module.h"
  22 #include "llvm/Support/Debug.h"
  23 #include "llvm/Support/MathExtras.h"
  24
  25 #include <sstream>
  26
  27 #if 0
  28 STATISTIC(PointerAssignments, "Number of dynamic pointer "
  29     "assigments discovered");
  30 STATISTIC(PointerSubtract, "Number of pointer subtractions discovered");
  31 #endif
  32
  33 using namespace llvm;
  34 // The Peephole optimization pass is used to do simple last minute optimizations
  35 // that are required for correct code or to remove redundant functions
  36 namespace {
  37
  38 class OpaqueType;
  39
  40 class LLVM_LIBRARY_VISIBILITY AMDGPUPeepholeOpt : public FunctionPass {
  41 public:
  42   TargetMachine &TM;
  43   static char ID;
  44   AMDGPUPeepholeOpt(TargetMachine &tm);
  45   ~AMDGPUPeepholeOpt();
  46   const char *getPassName() const;
  47   bool runOnFunction(Function &F);
  48   bool doInitialization(Module &M);
  49   bool doFinalization(Module &M);
  50   void getAnalysisUsage(AnalysisUsage &AU) const;
  51 protected:
  52 private:
  53   // Function to initiate all of the instruction level optimizations.
  54   bool instLevelOptimizations(BasicBlock::iterator *inst);
  55   // Quick check to see if we need to dump all of the pointers into the
  56   // arena. If this is correct, then we set all pointers to exist in arena. This
  57   // is a workaround for aliasing of pointers in a struct/union.
  58   bool dumpAllIntoArena(Function &F);
  59   // Because I don't want to invalidate any pointers while in the
  60   // safeNestedForEachFunction. I push atomic conversions to a vector and handle
  61   // it later. This function does the conversions if required.
  62   void doAtomicConversionIfNeeded(Function &F);
  63   // Because __amdil_is_constant cannot be properly evaluated if
  64   // optimizations are disabled, the call's are placed in a vector
  65   // and evaluated after the __amdil_image* functions are evaluated
  66   // which should allow the __amdil_is_constant function to be
  67   // evaluated correctly.
  68   void doIsConstCallConversionIfNeeded();
  69   bool mChanged;
  70   bool mDebug;
  71   bool mConvertAtomics;
  72   CodeGenOpt::Level optLevel;
  73   // Run a series of tests to see if we can optimize a CALL instruction.
  74   bool optimizeCallInst(BasicBlock::iterator *bbb);
  75   // A peephole optimization to optimize bit extract sequences.
  76   bool optimizeBitExtract(Instruction *inst);
  77   // A peephole optimization to optimize bit insert sequences.
  78   bool optimizeBitInsert(Instruction *inst);
  79   bool setupBitInsert(Instruction *base,
  80                       Instruction *&src,
  81                       Constant *&mask,
  82                       Constant *&shift);
  83   // Expand the bit field insert instruction on versions of OpenCL that
  84   // don't support it.
  85   bool expandBFI(CallInst *CI);
  86   // Expand the bit field mask instruction on version of OpenCL that
  87   // don't support it.
  88   bool expandBFM(CallInst *CI);
  89   // On 7XX and 8XX operations, we do not have 24 bit signed operations. So in
  90   // this case we need to expand them. These functions check for 24bit functions
  91   // and then expand.
  92   bool isSigned24BitOps(CallInst *CI);
  93   void expandSigned24BitOps(CallInst *CI);
  94   // One optimization that can occur is that if the required workgroup size is
  95   // specified then the result of get_local_size is known at compile time and
  96   // can be returned accordingly.
  97   bool isRWGLocalOpt(CallInst *CI);
  98   // On northern island cards, the division is slightly less accurate than on
  99   // previous generations, so we need to utilize a more accurate division. So we
 100   // can translate the accurate divide to a normal divide on all other cards.
 101   bool convertAccurateDivide(CallInst *CI);
 102   void expandAccurateDivide(CallInst *CI);
 103   // If the alignment is set incorrectly, it can produce really inefficient
 104   // code. This checks for this scenario and fixes it if possible.
 105   bool correctMisalignedMemOp(Instruction *inst);
 106
 107   // If we are in no opt mode, then we need to make sure that
 108   // local samplers are properly propagated as constant propagation
 109   // doesn't occur and we need to know the value of kernel defined
 110   // samplers at compile time.
 111   bool propagateSamplerInst(CallInst *CI);
 112
 113   // Helper functions
 114
 115   // Group of functions that recursively calculate the size of a structure based
 116   // on it's sub-types.
 117   size_t getTypeSize(Type * const T, bool dereferencePtr = false);
 118   size_t getTypeSize(StructType * const ST, bool dereferencePtr = false);
 119   size_t getTypeSize(IntegerType * const IT, bool dereferencePtr = false);
 120   size_t getTypeSize(FunctionType * const FT,bool dereferencePtr = false);
 121   size_t getTypeSize(ArrayType * const AT, bool dereferencePtr = false);
 122   size_t getTypeSize(VectorType * const VT, bool dereferencePtr = false);
 123   size_t getTypeSize(PointerType * const PT, bool dereferencePtr = false);
 124   size_t getTypeSize(OpaqueType * const OT, bool dereferencePtr = false);
 125
 126   LLVMContext *mCTX;
 127   Function *mF;
 128   const AMDGPUSubtarget *mSTM;
 129   SmallVector< std::pair<CallInst *, Function *>, 16> atomicFuncs;
 130   SmallVector<CallInst *, 16> isConstVec;
 131 }; // class AMDGPUPeepholeOpt
 132   char AMDGPUPeepholeOpt::ID = 0;
 133
 134 // A template function that has two levels of looping before calling the
 135 // function with a pointer to the current iterator.
 136 template<class InputIterator, class SecondIterator, class Function>
 137 Function safeNestedForEach(InputIterator First, InputIterator Last,
 138                               SecondIterator S, Function F)
 139 {
 140   for ( ; First != Last; ++First) {
 141     SecondIterator sf, sl;
 142     for (sf = First->begin(), sl = First->end();
 143          sf != sl; )  {
 144       if (!F(&sf)) {
 145         ++sf;
 146       }
 147     }
 148   }
 149   return F;
 150 }
 151
 152 } // anonymous namespace
 153
 154 namespace llvm {
 155   FunctionPass *
 156   createAMDGPUPeepholeOpt(TargetMachine &tm)
 157   {
 158     return new AMDGPUPeepholeOpt(tm);
 159   }
 160 } // llvm namespace
 161
 162 AMDGPUPeepholeOpt::AMDGPUPeepholeOpt(TargetMachine &tm)
 163   : FunctionPass(ID), TM(tm)
 164 {
 165   mDebug = false;
 166   optLevel = TM.getOptLevel();
 167
 168 }
 169
 170 AMDGPUPeepholeOpt::~AMDGPUPeepholeOpt()
 171 {
 172 }
 173
 174 const char *
 175 AMDGPUPeepholeOpt::getPassName() const
 176 {
 177   return "AMDGPU PeepHole Optimization Pass";
 178 }
 179
 180 bool
 181 containsPointerType(Type *Ty)
 182 {
 183   if (!Ty) {
 184     return false;
 185   }
 186   switch(Ty->getTypeID()) {
 187   default:
 188     return false;
 189   case Type::StructTyID: {
 190     const StructType *ST = dyn_cast<StructType>(Ty);
 191     for (StructType::element_iterator stb = ST->element_begin(),
 192            ste = ST->element_end(); stb != ste; ++stb) {
 193       if (!containsPointerType(*stb)) {
 194         continue;
 195       }
 196       return true;
 197     }
 198     break;
 199   }
 200   case Type::VectorTyID:
 201   case Type::ArrayTyID:
 202     return containsPointerType(dyn_cast<SequentialType>(Ty)->getElementType());
 203   case Type::PointerTyID:
 204     return true;
 205   };
 206   return false;
 207 }
 208
 209 bool
 210 AMDGPUPeepholeOpt::dumpAllIntoArena(Function &F)
 211 {
 212   bool dumpAll = false;
 213   for (Function::const_arg_iterator cab = F.arg_begin(),
 214        cae = F.arg_end(); cab != cae; ++cab) {
 215     const Argument *arg = cab;
 216     const PointerType *PT = dyn_cast<PointerType>(arg->getType());
 217     if (!PT) {
 218       continue;
 219     }
 220     Type *DereferencedType = PT->getElementType();
 221     if (!dyn_cast<StructType>(DereferencedType)
 222         ) {
 223       continue;
 224     }
 225     if (!containsPointerType(DereferencedType)) {
 226       continue;
 227     }
 228     // FIXME: Because a pointer inside of a struct/union may be aliased to
 229     // another pointer we need to take the conservative approach and place all
 230     // pointers into the arena until more advanced detection is implemented.
 231     dumpAll = true;
 232   }
 233   return dumpAll;
 234 }
 235 void
 236 AMDGPUPeepholeOpt::doIsConstCallConversionIfNeeded()
 237 {
 238   if (isConstVec.empty()) {
 239     return;
 240   }
 241   for (unsigned x = 0, y = isConstVec.size(); x < y; ++x) {
 242     CallInst *CI = isConstVec[x];
 243     Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
 244     Type *aType = Type::getInt32Ty(*mCTX);
 245     Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
 246       : ConstantInt::get(aType, 0);
 247     CI->replaceAllUsesWith(Val);
 248     CI->eraseFromParent();
 249   }
 250   isConstVec.clear();
 251 }
 252 void
 253 AMDGPUPeepholeOpt::doAtomicConversionIfNeeded(Function &F)
 254 {
 255   // Don't do anything if we don't have any atomic operations.
 256   if (atomicFuncs.empty()) {
 257     return;
 258   }
 259   // Change the function name for the atomic if it is required
 260   uint32_t size = atomicFuncs.size();
 261   for (uint32_t x = 0; x < size; ++x) {
 262     atomicFuncs[x].first->setOperand(
 263         atomicFuncs[x].first->getNumOperands()-1,
 264         atomicFuncs[x].second);
 265
 266   }
 267   mChanged = true;
 268   if (mConvertAtomics) {
 269     return;
 270   }
 271 }
 272
 273 bool
 274 AMDGPUPeepholeOpt::runOnFunction(Function &MF)
 275 {
 276   mChanged = false;
 277   mF = &MF;
 278   mSTM = &TM.getSubtarget<AMDGPUSubtarget>();
 279   if (mDebug) {
 280     MF.dump();
 281   }
 282   mCTX = &MF.getType()->getContext();
 283   mConvertAtomics = true;
 284   safeNestedForEach(MF.begin(), MF.end(), MF.begin()->begin(),
 285      std::bind1st(std::mem_fun(&AMDGPUPeepholeOpt::instLevelOptimizations),
 286                   this));
 287
 288   doAtomicConversionIfNeeded(MF);
 289   doIsConstCallConversionIfNeeded();
 290
 291   if (mDebug) {
 292     MF.dump();
 293   }
 294   return mChanged;
 295 }
 296
 297 bool
 298 AMDGPUPeepholeOpt::optimizeCallInst(BasicBlock::iterator *bbb)
 299 {
 300   Instruction *inst = (*bbb);
 301   CallInst *CI = dyn_cast<CallInst>(inst);
 302   if (!CI) {
 303     return false;
 304   }
 305   if (isSigned24BitOps(CI)) {
 306     expandSigned24BitOps(CI);
 307     ++(*bbb);
 308     CI->eraseFromParent();
 309     return true;
 310   }
 311   if (propagateSamplerInst(CI)) {
 312     return false;
 313   }
 314   if (expandBFI(CI) || expandBFM(CI)) {
 315     ++(*bbb);
 316     CI->eraseFromParent();
 317     return true;
 318   }
 319   if (convertAccurateDivide(CI)) {
 320     expandAccurateDivide(CI);
 321     ++(*bbb);
 322     CI->eraseFromParent();
 323     return true;
 324   }
 325
 326   StringRef calleeName = CI->getOperand(CI->getNumOperands()-1)->getName();
 327   if (calleeName.startswith("__amdil_is_constant")) {
 328     // If we do not have optimizations, then this
 329     // cannot be properly evaluated, so we add the
 330     // call instruction to a vector and process
 331     // them at the end of processing after the
 332     // samplers have been correctly handled.
 333     if (optLevel == CodeGenOpt::None) {
 334       isConstVec.push_back(CI);
 335       return false;
 336     } else {
 337       Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
 338       Type *aType = Type::getInt32Ty(*mCTX);
 339       Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
 340         : ConstantInt::get(aType, 0);
 341       CI->replaceAllUsesWith(Val);
 342       ++(*bbb);
 343       CI->eraseFromParent();
 344       return true;
 345     }
 346   }
 347
 348   if (calleeName.equals("__amdil_is_asic_id_i32")) {
 349     ConstantInt *CV = dyn_cast<ConstantInt>(CI->getOperand(0));
 350     Type *aType = Type::getInt32Ty(*mCTX);
 351     Value *Val = CV;
 352     if (Val) {
 353       Val = ConstantInt::get(aType,
 354           mSTM->device()->getDeviceFlag() & CV->getZExtValue());
 355     } else {
 356       Val = ConstantInt::get(aType, 0);
 357     }
 358     CI->replaceAllUsesWith(Val);
 359     ++(*bbb);
 360     CI->eraseFromParent();
 361     return true;
 362   }
 363   Function *F = dyn_cast<Function>(CI->getOperand(CI->getNumOperands()-1));
 364   if (!F) {
 365     return false;
 366   }
 367   if (F->getName().startswith("__atom") && !CI->getNumUses()
 368       && F->getName().find("_xchg") == StringRef::npos) {
 369     std::string buffer(F->getName().str() + "_noret");
 370     F = dyn_cast<Function>(
 371           F->getParent()->getOrInsertFunction(buffer, F->getFunctionType()));
 372     atomicFuncs.push_back(std::make_pair <CallInst*, Function*>(CI, F));
 373   }
 374
 375   if (!mSTM->device()->isSupported(AMDGPUDeviceInfo::ArenaSegment)
 376       && !mSTM->device()->isSupported(AMDGPUDeviceInfo::MultiUAV)) {
 377     return false;
 378   }
 379   if (!mConvertAtomics) {
 380     return false;
 381   }
 382   StringRef name = F->getName();
 383   if (name.startswith("__atom") && name.find("_g") != StringRef::npos) {
 384     mConvertAtomics = false;
 385   }
 386   return false;
 387 }
 388
 389 bool
 390 AMDGPUPeepholeOpt::setupBitInsert(Instruction *base,
 391     Instruction *&src,
 392     Constant *&mask,
 393     Constant *&shift)
 394 {
 395   if (!base) {
 396     if (mDebug) {
 397       dbgs() << "Null pointer passed into function.\n";
 398     }
 399     return false;
 400   }
 401   bool andOp = false;
 402   if (base->getOpcode() == Instruction::Shl) {
 403     shift = dyn_cast<Constant>(base->getOperand(1));
 404   } else if (base->getOpcode() == Instruction::And) {
 405     mask = dyn_cast<Constant>(base->getOperand(1));
 406     andOp = true;
 407   } else {
 408     if (mDebug) {
 409       dbgs() << "Failed setup with no Shl or And instruction on base opcode!\n";
 410     }
 411     // If the base is neither a Shl or a And, we don't fit any of the patterns above.
 412     return false;
 413   }
 414   src = dyn_cast<Instruction>(base->getOperand(0));
 415   if (!src) {
 416     if (mDebug) {
 417       dbgs() << "Failed setup since the base operand is not an instruction!\n";
 418     }
 419     return false;
 420   }
 421   // If we find an 'and' operation, then we don't need to
 422   // find the next operation as we already know the
 423   // bits that are valid at this point.
 424   if (andOp) {
 425     return true;
 426   }
 427   if (src->getOpcode() == Instruction::Shl && !shift) {
 428     shift = dyn_cast<Constant>(src->getOperand(1));
 429     src = dyn_cast<Instruction>(src->getOperand(0));
 430   } else if (src->getOpcode() == Instruction::And && !mask) {
 431     mask = dyn_cast<Constant>(src->getOperand(1));
 432   }
 433   if (!mask && !shift) {
 434     if (mDebug) {
 435       dbgs() << "Failed setup since both mask and shift are NULL!\n";
 436     }
 437     // Did not find a constant mask or a shift.
 438     return false;
 439   }
 440   return true;
 441 }
 442 bool
 443 AMDGPUPeepholeOpt::optimizeBitInsert(Instruction *inst)
 444 {
 445   if (!inst) {
 446     return false;
 447   }
 448   if (!inst->isBinaryOp()) {
 449     return false;
 450   }
 451   if (inst->getOpcode() != Instruction::Or) {
 452     return false;
 453   }
 454   if (optLevel == CodeGenOpt::None) {
 455     return false;
 456   }
 457   // We want to do an optimization on a sequence of ops that in the end equals a
 458   // single ISA instruction.
 459   // The base pattern for this optimization is - ((A & B) << C) | ((D & E) << F)
 460   // Some simplified versions of this pattern are as follows:
 461   // (A & B) | (D & E) when B & E == 0 && C == 0 && F == 0
 462   // ((A & B) << C) | (D & E) when B ^ E == 0 && (1 << C) >= E
 463   // (A & B) | ((D & E) << F) when B ^ E == 0 && (1 << F) >= B
 464   // (A & B) | (D << F) when (1 << F) >= B
 465   // (A << C) | (D & E) when (1 << C) >= E
 466   if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) {
 467     // The HD4XXX hardware doesn't support the ubit_insert instruction.
 468     return false;
 469   }
 470   Type *aType = inst->getType();
 471   bool isVector = aType->isVectorTy();
 472   int numEle = 1;
 473   // This optimization only works on 32bit integers.
 474   if (aType->getScalarType()
 475       != Type::getInt32Ty(inst->getContext())) {
 476     return false;
 477   }
 478   if (isVector) {
 479     const VectorType *VT = dyn_cast<VectorType>(aType);
 480     numEle = VT->getNumElements();
 481     // We currently cannot support more than 4 elements in a intrinsic and we
 482     // cannot support Vec3 types.
 483     if (numEle > 4 || numEle == 3) {
 484       return false;
 485     }
 486   }
 487   // TODO: Handle vectors.
 488   if (isVector) {
 489     if (mDebug) {
 490       dbgs() << "!!! Vectors are not supported yet!\n";
 491     }
 492     return false;
 493   }
 494   Instruction *LHSSrc = NULL, *RHSSrc = NULL;
 495   Constant *LHSMask = NULL, *RHSMask = NULL;
 496   Constant *LHSShift = NULL, *RHSShift = NULL;
 497   Instruction *LHS = dyn_cast<Instruction>(inst->getOperand(0));
 498   Instruction *RHS = dyn_cast<Instruction>(inst->getOperand(1));
 499   if (!setupBitInsert(LHS, LHSSrc, LHSMask, LHSShift)) {
 500     if (mDebug) {
 501       dbgs() << "Found an OR Operation that failed setup!\n";
 502       inst->dump();
 503       if (LHS) { LHS->dump(); }
 504       if (LHSSrc) { LHSSrc->dump(); }
 505       if (LHSMask) { LHSMask->dump(); }
 506       if (LHSShift) { LHSShift->dump(); }
 507     }
 508     // There was an issue with the setup for BitInsert.
 509     return false;
 510   }
 511   if (!setupBitInsert(RHS, RHSSrc, RHSMask, RHSShift)) {
 512     if (mDebug) {
 513       dbgs() << "Found an OR Operation that failed setup!\n";
 514       inst->dump();
 515       if (RHS) { RHS->dump(); }
 516       if (RHSSrc) { RHSSrc->dump(); }
 517       if (RHSMask) { RHSMask->dump(); }
 518       if (RHSShift) { RHSShift->dump(); }
 519     }
 520     // There was an issue with the setup for BitInsert.
 521     return false;
 522   }
 523   if (mDebug) {
 524     dbgs() << "Found an OR operation that can possible be optimized to ubit insert!\n";
 525     dbgs() << "Op:        "; inst->dump();
 526     dbgs() << "LHS:       "; if (LHS) { LHS->dump(); } else { dbgs() << "(None)\n"; }
 527     dbgs() << "LHS Src:   "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(None)\n"; }
 528     dbgs() << "LHS Mask:  "; if (LHSMask) { LHSMask->dump(); } else { dbgs() << "(None)\n"; }
 529     dbgs() << "LHS Shift: "; if (LHSShift) { LHSShift->dump(); } else { dbgs() << "(None)\n"; }
 530     dbgs() << "RHS:       "; if (RHS) { RHS->dump(); } else { dbgs() << "(None)\n"; }
 531     dbgs() << "RHS Src:   "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(None)\n"; }
 532     dbgs() << "RHS Mask:  "; if (RHSMask) { RHSMask->dump(); } else { dbgs() << "(None)\n"; }
 533     dbgs() << "RHS Shift: "; if (RHSShift) { RHSShift->dump(); } else { dbgs() << "(None)\n"; }
 534   }
 535   Constant *offset = NULL;
 536   Constant *width = NULL;
 537   int32_t lhsMaskVal = 0, rhsMaskVal = 0;
 538   int32_t lhsShiftVal = 0, rhsShiftVal = 0;
 539   int32_t lhsMaskWidth = 0, rhsMaskWidth = 0;
 540   int32_t lhsMaskOffset = 0, rhsMaskOffset = 0;
 541   lhsMaskVal = (int32_t)(LHSMask
 542       ? dyn_cast<ConstantInt>(LHSMask)->getZExtValue() : 0);
 543   rhsMaskVal = (int32_t)(RHSMask
 544       ? dyn_cast<ConstantInt>(RHSMask)->getZExtValue() : 0);
 545   lhsShiftVal = (int32_t)(LHSShift
 546       ? dyn_cast<ConstantInt>(LHSShift)->getZExtValue() : 0);
 547   rhsShiftVal = (int32_t)(RHSShift
 548       ? dyn_cast<ConstantInt>(RHSShift)->getZExtValue() : 0);
 549   lhsMaskWidth = lhsMaskVal ? CountPopulation_32(lhsMaskVal) : 32 - lhsShiftVal;
 550   rhsMaskWidth = rhsMaskVal ? CountPopulation_32(rhsMaskVal) : 32 - rhsShiftVal;
 551   lhsMaskOffset = lhsMaskVal ? CountTrailingZeros_32(lhsMaskVal) : lhsShiftVal;
 552   rhsMaskOffset = rhsMaskVal ? CountTrailingZeros_32(rhsMaskVal) : rhsShiftVal;
 553   // TODO: Handle the case of A & B | D & ~B(i.e. inverted masks).
 554   if (mDebug) {
 555       dbgs() << "Found pattern: \'((A" << (LHSMask ? " & B)" : ")");
 556       dbgs() << (LHSShift ? " << C)" : ")") << " | ((D" ;
 557       dbgs() << (RHSMask ? " & E)" : ")");
 558       dbgs() << (RHSShift ? " << F)\'\n" : ")\'\n");
 559       dbgs() << "A = LHSSrc\t\tD = RHSSrc \n";
 560       dbgs() << "B = " << lhsMaskVal << "\t\tE = " << rhsMaskVal << "\n";
 561       dbgs() << "C = " << lhsShiftVal << "\t\tF = " << rhsShiftVal << "\n";
 562       dbgs() << "width(B) = " << lhsMaskWidth;
 563       dbgs() << "\twidth(E) = " << rhsMaskWidth << "\n";
 564       dbgs() << "offset(B) = " << lhsMaskOffset;
 565       dbgs() << "\toffset(E) = " << rhsMaskOffset << "\n";
 566       dbgs() << "Constraints: \n";
 567       dbgs() << "\t(1) B ^ E == 0\n";
 568       dbgs() << "\t(2-LHS) B is a mask\n";
 569       dbgs() << "\t(2-LHS) E is a mask\n";
 570       dbgs() << "\t(3-LHS) (offset(B)) >= (width(E) + offset(E))\n";
 571       dbgs() << "\t(3-RHS) (offset(E)) >= (width(B) + offset(B))\n";
 572   }
 573   if ((lhsMaskVal || rhsMaskVal) && !(lhsMaskVal ^ rhsMaskVal)) {
 574     if (mDebug) {
 575       dbgs() << lhsMaskVal << " ^ " << rhsMaskVal;
 576       dbgs() << " = " << (lhsMaskVal ^ rhsMaskVal) << "\n";
 577       dbgs() << "Failed constraint 1!\n";
 578     }
 579     return false;
 580   }
 581   if (mDebug) {
 582     dbgs() << "LHS = " << lhsMaskOffset << "";
 583     dbgs() << " >= (" << rhsMaskWidth << " + " << rhsMaskOffset << ") = ";
 584     dbgs() << (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset));
 585     dbgs() << "\nRHS = " << rhsMaskOffset << "";
 586     dbgs() << " >= (" << lhsMaskWidth << " + " << lhsMaskOffset << ") = ";
 587     dbgs() << (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset));
 588     dbgs() << "\n";
 589   }
 590   if (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset)) {
 591     offset = ConstantInt::get(aType, lhsMaskOffset, false);
 592     width = ConstantInt::get(aType, lhsMaskWidth, false);
 593     RHSSrc = RHS;
 594     if (!isMask_32(lhsMaskVal) && !isShiftedMask_32(lhsMaskVal)) {
 595       if (mDebug) {
 596         dbgs() << "Value is not a Mask: " << lhsMaskVal << "\n";
 597         dbgs() << "Failed constraint 2!\n";
 598       }
 599       return false;
 600     }
 601     if (!LHSShift) {
 602       LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
 603           "MaskShr", LHS);
 604     } else if (lhsShiftVal != lhsMaskOffset) {
 605       LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
 606           "MaskShr", LHS);
 607     }
 608     if (mDebug) {
 609       dbgs() << "Optimizing LHS!\n";
 610     }
 611   } else if (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset)) {
 612     offset = ConstantInt::get(aType, rhsMaskOffset, false);
 613     width = ConstantInt::get(aType, rhsMaskWidth, false);
 614     LHSSrc = RHSSrc;
 615     RHSSrc = LHS;
 616     if (!isMask_32(rhsMaskVal) && !isShiftedMask_32(rhsMaskVal)) {
 617       if (mDebug) {
 618         dbgs() << "Non-Mask: " << rhsMaskVal << "\n";
 619         dbgs() << "Failed constraint 2!\n";
 620       }
 621       return false;
 622     }
 623     if (!RHSShift) {
 624       LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
 625           "MaskShr", RHS);
 626     } else if (rhsShiftVal != rhsMaskOffset) {
 627       LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
 628           "MaskShr", RHS);
 629     }
 630     if (mDebug) {
 631       dbgs() << "Optimizing RHS!\n";
 632     }
 633   } else {
 634     if (mDebug) {
 635       dbgs() << "Failed constraint 3!\n";
 636     }
 637     return false;
 638   }
 639   if (mDebug) {
 640     dbgs() << "Width:  "; if (width) { width->dump(); } else { dbgs() << "(0)\n"; }
 641     dbgs() << "Offset: "; if (offset) { offset->dump(); } else { dbgs() << "(0)\n"; }
 642     dbgs() << "LHSSrc: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(0)\n"; }
 643     dbgs() << "RHSSrc: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(0)\n"; }
 644   }
 645   if (!offset || !width) {
 646     if (mDebug) {
 647       dbgs() << "Either width or offset are NULL, failed detection!\n";
 648     }
 649     return false;
 650   }
 651   // Lets create the function signature.
 652   std::vector<Type *> callTypes;
 653   callTypes.push_back(aType);
 654   callTypes.push_back(aType);
 655   callTypes.push_back(aType);
 656   callTypes.push_back(aType);
 657   FunctionType *funcType = FunctionType::get(aType, callTypes, false);
 658   std::string name = "__amdil_ubit_insert";
 659   if (isVector) { name += "_v" + itostr(numEle) + "u32"; } else { name += "_u32"; }
 660   Function *Func =
 661     dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
 662         getOrInsertFunction(llvm::StringRef(name), funcType));
 663   Value *Operands[4] = {
 664     width,
 665     offset,
 666     LHSSrc,
 667     RHSSrc
 668   };
 669   CallInst *CI = CallInst::Create(Func, Operands, "BitInsertOpt");
 670   if (mDebug) {
 671     dbgs() << "Old Inst: ";
 672     inst->dump();
 673     dbgs() << "New Inst: ";
 674     CI->dump();
 675     dbgs() << "\n\n";
 676   }
 677   CI->insertBefore(inst);
 678   inst->replaceAllUsesWith(CI);
 679   return true;
 680 }
 681
 682 bool
 683 AMDGPUPeepholeOpt::optimizeBitExtract(Instruction *inst)
 684 {
 685   if (!inst) {
 686     return false;
 687   }
 688   if (!inst->isBinaryOp()) {
 689     return false;
 690   }
 691   if (inst->getOpcode() != Instruction::And) {
 692     return false;
 693   }
 694   if (optLevel == CodeGenOpt::None) {
 695     return false;
 696   }
 697   // We want to do some simple optimizations on Shift right/And patterns. The
 698   // basic optimization is to turn (A >> B) & C where A is a 32bit type, B is a
 699   // value smaller than 32 and C is a mask. If C is a constant value, then the
 700   // following transformation can occur. For signed integers, it turns into the
 701   // function call dst = __amdil_ibit_extract(log2(C), B, A) For unsigned
 702   // integers, it turns into the function call dst =
 703   // __amdil_ubit_extract(log2(C), B, A) The function __amdil_[u|i]bit_extract
 704   // can be found in Section 7.9 of the ATI IL spec of the stream SDK for
 705   // Evergreen hardware.
 706   if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) {
 707     // This does not work on HD4XXX hardware.
 708     return false;
 709   }
 710   Type *aType = inst->getType();
 711   bool isVector = aType->isVectorTy();
 712
 713   // XXX Support vector types
 714   if (isVector) {
 715     return false;
 716   }
 717   int numEle = 1;
 718   // This only works on 32bit integers
 719   if (aType->getScalarType()
 720       != Type::getInt32Ty(inst->getContext())) {
 721     return false;
 722   }
 723   if (isVector) {
 724     const VectorType *VT = dyn_cast<VectorType>(aType);
 725     numEle = VT->getNumElements();
 726     // We currently cannot support more than 4 elements in a intrinsic and we
 727     // cannot support Vec3 types.
 728     if (numEle > 4 || numEle == 3) {
 729       return false;
 730     }
 731   }
 732   BinaryOperator *ShiftInst = dyn_cast<BinaryOperator>(inst->getOperand(0));
 733   // If the first operand is not a shift instruction, then we can return as it
 734   // doesn't match this pattern.
 735   if (!ShiftInst || !ShiftInst->isShift()) {
 736     return false;
 737   }
 738   // If we are a shift left, then we need don't match this pattern.
 739   if (ShiftInst->getOpcode() == Instruction::Shl) {
 740     return false;
 741   }
 742   bool isSigned = ShiftInst->isArithmeticShift();
 743   Constant *AndMask = dyn_cast<Constant>(inst->getOperand(1));
 744   Constant *ShrVal = dyn_cast<Constant>(ShiftInst->getOperand(1));
 745   // Lets make sure that the shift value and the and mask are constant integers.
 746   if (!AndMask || !ShrVal) {
 747     return false;
 748   }
 749   Constant *newMaskConst;
 750   Constant *shiftValConst;
 751   if (isVector) {
 752     // Handle the vector case
 753     std::vector<Constant *> maskVals;
 754     std::vector<Constant *> shiftVals;
 755     ConstantVector *AndMaskVec = dyn_cast<ConstantVector>(AndMask);
 756     ConstantVector *ShrValVec = dyn_cast<ConstantVector>(ShrVal);
 757     Type *scalarType = AndMaskVec->getType()->getScalarType();
 758     assert(AndMaskVec->getNumOperands() ==
 759            ShrValVec->getNumOperands() && "cannot have a "
 760            "combination where the number of elements to a "
 761            "shift and an and are different!");
 762     for (size_t x = 0, y = AndMaskVec->getNumOperands(); x < y; ++x) {
 763       ConstantInt *AndCI = dyn_cast<ConstantInt>(AndMaskVec->getOperand(x));
 764       ConstantInt *ShiftIC = dyn_cast<ConstantInt>(ShrValVec->getOperand(x));
 765       if (!AndCI || !ShiftIC) {
 766         return false;
 767       }
 768       uint32_t maskVal = (uint32_t)AndCI->getZExtValue();
 769       if (!isMask_32(maskVal)) {
 770         return false;
 771       }
 772       maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
 773       uint32_t shiftVal = (uint32_t)ShiftIC->getZExtValue();
 774       // If the mask or shiftval is greater than the bitcount, then break out.
 775       if (maskVal >= 32 || shiftVal >= 32) {
 776         return false;
 777       }
 778       // If the mask val is greater than the the number of original bits left
 779       // then this optimization is invalid.
 780       if (maskVal > (32 - shiftVal)) {
 781         return false;
 782       }
 783       maskVals.push_back(ConstantInt::get(scalarType, maskVal, isSigned));
 784       shiftVals.push_back(ConstantInt::get(scalarType, shiftVal, isSigned));
 785     }
 786     newMaskConst = ConstantVector::get(maskVals);
 787     shiftValConst = ConstantVector::get(shiftVals);
 788   } else {
 789     // Handle the scalar case
 790     uint32_t maskVal = (uint32_t)dyn_cast<ConstantInt>(AndMask)->getZExtValue();
 791     // This must be a mask value where all lower bits are set to 1 and then any
 792     // bit higher is set to 0.
 793     if (!isMask_32(maskVal)) {
 794       return false;
 795     }
 796     maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
 797     // Count the number of bits set in the mask, this is the width of the
 798     // resulting bit set that is extracted from the source value.
 799     uint32_t shiftVal = (uint32_t)dyn_cast<ConstantInt>(ShrVal)->getZExtValue();
 800     // If the mask or shift val is greater than the bitcount, then break out.
 801     if (maskVal >= 32 || shiftVal >= 32) {
 802       return false;
 803     }
 804     // If the mask val is greater than the the number of original bits left then
 805     // this optimization is invalid.
 806     if (maskVal > (32 - shiftVal)) {
 807       return false;
 808     }
 809     newMaskConst = ConstantInt::get(aType, maskVal, isSigned);
 810     shiftValConst = ConstantInt::get(aType, shiftVal, isSigned);
 811   }
 812   // Lets create the function signature.
 813   std::vector<Type *> callTypes;
 814   callTypes.push_back(aType);
 815   callTypes.push_back(aType);
 816   callTypes.push_back(aType);
 817   FunctionType *funcType = FunctionType::get(aType, callTypes, false);
 818   std::string name = "llvm.AMDIL.bit.extract.u32";
 819   if (isVector) {
 820     name += ".v" + itostr(numEle) + "i32";
 821   } else {
 822     name += ".";
 823   }
 824   // Lets create the function.
 825   Function *Func =
 826     dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
 827                        getOrInsertFunction(llvm::StringRef(name), funcType));
 828   Value *Operands[3] = {
 829     ShiftInst->getOperand(0),
 830     shiftValConst,
 831     newMaskConst
 832   };
 833   // Lets create the Call with the operands
 834   CallInst *CI = CallInst::Create(Func, Operands, "ByteExtractOpt");
 835   CI->setDoesNotAccessMemory();
 836   CI->insertBefore(inst);
 837   inst->replaceAllUsesWith(CI);
 838   return true;
 839 }
 840
 841 bool
 842 AMDGPUPeepholeOpt::expandBFI(CallInst *CI)
 843 {
 844   if (!CI) {
 845     return false;
 846   }
 847   Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
 848   if (!LHS->getName().startswith("__amdil_bfi")) {
 849     return false;
 850   }
 851   Type* type = CI->getOperand(0)->getType();
 852   Constant *negOneConst = NULL;
 853   if (type->isVectorTy()) {
 854     std::vector<Constant *> negOneVals;
 855     negOneConst = ConstantInt::get(CI->getContext(),
 856         APInt(32, StringRef("-1"), 10));
 857     for (size_t x = 0,
 858         y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
 859       negOneVals.push_back(negOneConst);
 860     }
 861     negOneConst = ConstantVector::get(negOneVals);
 862   } else {
 863     negOneConst = ConstantInt::get(CI->getContext(),
 864         APInt(32, StringRef("-1"), 10));
 865   }
 866   // __amdil_bfi => (A & B) | (~A & C)
 867   BinaryOperator *lhs =
 868     BinaryOperator::Create(Instruction::And, CI->getOperand(0),
 869         CI->getOperand(1), "bfi_and", CI);
 870   BinaryOperator *rhs =
 871     BinaryOperator::Create(Instruction::Xor, CI->getOperand(0), negOneConst,
 872         "bfi_not", CI);
 873   rhs = BinaryOperator::Create(Instruction::And, rhs, CI->getOperand(2),
 874       "bfi_and", CI);
 875   lhs = BinaryOperator::Create(Instruction::Or, lhs, rhs, "bfi_or", CI);
 876   CI->replaceAllUsesWith(lhs);
 877   return true;
 878 }
 879
 880 bool
 881 AMDGPUPeepholeOpt::expandBFM(CallInst *CI)
 882 {
 883   if (!CI) {
 884     return false;
 885   }
 886   Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
 887   if (!LHS->getName().startswith("__amdil_bfm")) {
 888     return false;
 889   }
 890   // __amdil_bfm => ((1 << (src0 & 0x1F)) - 1) << (src1 & 0x1f)
 891   Constant *newMaskConst = NULL;
 892   Constant *newShiftConst = NULL;
 893   Type* type = CI->getOperand(0)->getType();
 894   if (type->isVectorTy()) {
 895     std::vector<Constant*> newMaskVals, newShiftVals;
 896     newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
 897     newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
 898     for (size_t x = 0,
 899         y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
 900       newMaskVals.push_back(newMaskConst);
 901       newShiftVals.push_back(newShiftConst);
 902     }
 903     newMaskConst = ConstantVector::get(newMaskVals);
 904     newShiftConst = ConstantVector::get(newShiftVals);
 905   } else {
 906     newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
 907     newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
 908   }
 909   BinaryOperator *lhs =
 910     BinaryOperator::Create(Instruction::And, CI->getOperand(0),
 911         newMaskConst, "bfm_mask", CI);
 912   lhs = BinaryOperator::Create(Instruction::Shl, newShiftConst,
 913       lhs, "bfm_shl", CI);
 914   lhs = BinaryOperator::Create(Instruction::Sub, lhs,
 915       newShiftConst, "bfm_sub", CI);
 916   BinaryOperator *rhs =
 917     BinaryOperator::Create(Instruction::And, CI->getOperand(1),
 918         newMaskConst, "bfm_mask", CI);
 919   lhs = BinaryOperator::Create(Instruction::Shl, lhs, rhs, "bfm_shl", CI);
 920   CI->replaceAllUsesWith(lhs);
 921   return true;
 922 }
 923
 924 bool
 925 AMDGPUPeepholeOpt::instLevelOptimizations(BasicBlock::iterator *bbb)
 926 {
 927   Instruction *inst = (*bbb);
 928   if (optimizeCallInst(bbb)) {
 929     return true;
 930   }
 931   if (optimizeBitExtract(inst)) {
 932     return false;
 933   }
 934   if (optimizeBitInsert(inst)) {
 935     return false;
 936   }
 937   if (correctMisalignedMemOp(inst)) {
 938     return false;
 939   }
 940   return false;
 941 }
 942 bool
 943 AMDGPUPeepholeOpt::correctMisalignedMemOp(Instruction *inst)
 944 {
 945   LoadInst *linst = dyn_cast<LoadInst>(inst);
 946   StoreInst *sinst = dyn_cast<StoreInst>(inst);
 947   unsigned alignment;
 948   Type* Ty = inst->getType();
 949   if (linst) {
 950     alignment = linst->getAlignment();
 951     Ty = inst->getType();
 952   } else if (sinst) {
 953     alignment = sinst->getAlignment();
 954     Ty = sinst->getValueOperand()->getType();
 955   } else {
 956     return false;
 957   }
 958   unsigned size = getTypeSize(Ty);
 959   if (size == alignment || size < alignment) {
 960     return false;
 961   }
 962   if (!Ty->isStructTy()) {
 963     return false;
 964   }
 965   if (alignment < 4) {
 966     if (linst) {
 967       linst->setAlignment(0);
 968       return true;
 969     } else if (sinst) {
 970       sinst->setAlignment(0);
 971       return true;
 972     }
 973   }
 974   return false;
 975 }
 976 bool
 977 AMDGPUPeepholeOpt::isSigned24BitOps(CallInst *CI)
 978 {
 979   if (!CI) {
 980     return false;
 981   }
 982   Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
 983   std::string namePrefix = LHS->getName().substr(0, 14);
 984   if (namePrefix != "__amdil_imad24" && namePrefix != "__amdil_imul24"
 985       && namePrefix != "__amdil__imul24_high") {
 986     return false;
 987   }
 988   if (mSTM->device()->usesHardware(AMDGPUDeviceInfo::Signed24BitOps)) {
 989     return false;
 990   }
 991   return true;
 992 }
 993
 994 void
 995 AMDGPUPeepholeOpt::expandSigned24BitOps(CallInst *CI)
 996 {
 997   assert(isSigned24BitOps(CI) && "Must be a "
 998       "signed 24 bit operation to call this function!");
 999   Value *LHS = CI->getOperand(CI->getNumOperands()-1);
1000   // On 7XX and 8XX we do not have signed 24bit, so we need to
1001   // expand it to the following:
1002   // imul24 turns into 32bit imul
1003   // imad24 turns into 32bit imad
1004   // imul24_high turns into 32bit imulhigh
1005   if (LHS->getName().substr(0, 14) == "__amdil_imad24") {
1006     Type *aType = CI->getOperand(0)->getType();
1007     bool isVector = aType->isVectorTy();
1008     int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
1009     std::vector<Type*> callTypes;
1010     callTypes.push_back(CI->getOperand(0)->getType());
1011     callTypes.push_back(CI->getOperand(1)->getType());
1012     callTypes.push_back(CI->getOperand(2)->getType());
1013     FunctionType *funcType =
1014       FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
1015     std::string name = "__amdil_imad";
1016     if (isVector) {
1017       name += "_v" + itostr(numEle) + "i32";
1018     } else {
1019       name += "_i32";
1020     }
1021     Function *Func = dyn_cast<Function>(
1022                        CI->getParent()->getParent()->getParent()->
1023                        getOrInsertFunction(llvm::StringRef(name), funcType));
1024     Value *Operands[3] = {
1025       CI->getOperand(0),
1026       CI->getOperand(1),
1027       CI->getOperand(2)
1028     };
1029     CallInst *nCI = CallInst::Create(Func, Operands, "imad24");
1030     nCI->insertBefore(CI);
1031     CI->replaceAllUsesWith(nCI);
1032   } else if (LHS->getName().substr(0, 14) == "__amdil_imul24") {
1033     BinaryOperator *mulOp =
1034       BinaryOperator::Create(Instruction::Mul, CI->getOperand(0),
1035           CI->getOperand(1), "imul24", CI);
1036     CI->replaceAllUsesWith(mulOp);
1037   } else if (LHS->getName().substr(0, 19) == "__amdil_imul24_high") {
1038     Type *aType = CI->getOperand(0)->getType();
1039
1040     bool isVector = aType->isVectorTy();
1041     int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
1042     std::vector<Type*> callTypes;
1043     callTypes.push_back(CI->getOperand(0)->getType());
1044     callTypes.push_back(CI->getOperand(1)->getType());
1045     FunctionType *funcType =
1046       FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
1047     std::string name = "__amdil_imul_high";
1048     if (isVector) {
1049       name += "_v" + itostr(numEle) + "i32";
1050     } else {
1051       name += "_i32";
1052     }
1053     Function *Func = dyn_cast<Function>(
1054                        CI->getParent()->getParent()->getParent()->
1055                        getOrInsertFunction(llvm::StringRef(name), funcType));
1056     Value *Operands[2] = {
1057       CI->getOperand(0),
1058       CI->getOperand(1)
1059     };
1060     CallInst *nCI = CallInst::Create(Func, Operands, "imul24_high");
1061     nCI->insertBefore(CI);
1062     CI->replaceAllUsesWith(nCI);
1063   }
1064 }
1065
1066 bool
1067 AMDGPUPeepholeOpt::isRWGLocalOpt(CallInst *CI)
1068 {
1069   return (CI != NULL
1070           && CI->getOperand(CI->getNumOperands() - 1)->getName()
1071           == "__amdil_get_local_size_int");
1072 }
1073
1074 bool
1075 AMDGPUPeepholeOpt::convertAccurateDivide(CallInst *CI)
1076 {
1077   if (!CI) {
1078     return false;
1079   }
1080   if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD6XXX
1081       && (mSTM->getDeviceName() == "cayman")) {
1082     return false;
1083   }
1084   return CI->getOperand(CI->getNumOperands() - 1)->getName().substr(0, 20)
1085       == "__amdil_improved_div";
1086 }
1087
1088 void
1089 AMDGPUPeepholeOpt::expandAccurateDivide(CallInst *CI)
1090 {
1091   assert(convertAccurateDivide(CI)
1092          && "expanding accurate divide can only happen if it is expandable!");
1093   BinaryOperator *divOp =
1094     BinaryOperator::Create(Instruction::FDiv, CI->getOperand(0),
1095                            CI->getOperand(1), "fdiv32", CI);
1096   CI->replaceAllUsesWith(divOp);
1097 }
1098
1099 bool
1100 AMDGPUPeepholeOpt::propagateSamplerInst(CallInst *CI)
1101 {
1102   if (optLevel != CodeGenOpt::None) {
1103     return false;
1104   }
1105
1106   if (!CI) {
1107     return false;
1108   }
1109
1110   unsigned funcNameIdx = 0;
1111   funcNameIdx = CI->getNumOperands() - 1;
1112   StringRef calleeName = CI->getOperand(funcNameIdx)->getName();
1113   if (calleeName != "__amdil_image2d_read_norm"
1114    && calleeName != "__amdil_image2d_read_unnorm"
1115    && calleeName != "__amdil_image3d_read_norm"
1116    && calleeName != "__amdil_image3d_read_unnorm") {
1117     return false;
1118   }
1119
1120   unsigned samplerIdx = 2;
1121   samplerIdx = 1;
1122   Value *sampler = CI->getOperand(samplerIdx);
1123   LoadInst *lInst = dyn_cast<LoadInst>(sampler);
1124   if (!lInst) {
1125     return false;
1126   }
1127
1128   if (lInst->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1129     return false;
1130   }
1131
1132   GlobalVariable *gv = dyn_cast<GlobalVariable>(lInst->getPointerOperand());
1133   // If we are loading from what is not a global value, then we
1134   // fail and return.
1135   if (!gv) {
1136     return false;
1137   }
1138
1139   // If we don't have an initializer or we have an initializer and
1140   // the initializer is not a 32bit integer, we fail.
1141   if (!gv->hasInitializer()
1142       || !gv->getInitializer()->getType()->isIntegerTy(32)) {
1143       return false;
1144   }
1145
1146   // Now that we have the global variable initializer, lets replace
1147   // all uses of the load instruction with the samplerVal and
1148   // reparse the __amdil_is_constant() function.
1149   Constant *samplerVal = gv->getInitializer();
1150   lInst->replaceAllUsesWith(samplerVal);
1151   return true;
1152 }
1153
1154 bool
1155 AMDGPUPeepholeOpt::doInitialization(Module &M)
1156 {
1157   return false;
1158 }
1159
1160 bool
1161 AMDGPUPeepholeOpt::doFinalization(Module &M)
1162 {
1163   return false;
1164 }
1165
1166 void
1167 AMDGPUPeepholeOpt::getAnalysisUsage(AnalysisUsage &AU) const
1168 {
1169   AU.addRequired<MachineFunctionAnalysis>();
1170   FunctionPass::getAnalysisUsage(AU);
1171   AU.setPreservesAll();
1172 }
1173
1174 size_t AMDGPUPeepholeOpt::getTypeSize(Type * const T, bool dereferencePtr) {
1175   size_t size = 0;
1176   if (!T) {
1177     return size;
1178   }
1179   switch (T->getTypeID()) {
1180   case Type::X86_FP80TyID:
1181   case Type::FP128TyID:
1182   case Type::PPC_FP128TyID:
1183   case Type::LabelTyID:
1184     assert(0 && "These types are not supported by this backend");
1185   default:
1186   case Type::FloatTyID:
1187   case Type::DoubleTyID:
1188     size = T->getPrimitiveSizeInBits() >> 3;
1189     break;
1190   case Type::PointerTyID:
1191     size = getTypeSize(dyn_cast<PointerType>(T), dereferencePtr);
1192     break;
1193   case Type::IntegerTyID:
1194     size = getTypeSize(dyn_cast<IntegerType>(T), dereferencePtr);
1195     break;
1196   case Type::StructTyID:
1197     size = getTypeSize(dyn_cast<StructType>(T), dereferencePtr);
1198     break;
1199   case Type::ArrayTyID:
1200     size = getTypeSize(dyn_cast<ArrayType>(T), dereferencePtr);
1201     break;
1202   case Type::FunctionTyID:
1203     size = getTypeSize(dyn_cast<FunctionType>(T), dereferencePtr);
1204     break;
1205   case Type::VectorTyID:
1206     size = getTypeSize(dyn_cast<VectorType>(T), dereferencePtr);
1207     break;
1208   };
1209   return size;
1210 }
1211
1212 size_t AMDGPUPeepholeOpt::getTypeSize(StructType * const ST,
1213     bool dereferencePtr) {
1214   size_t size = 0;
1215   if (!ST) {
1216     return size;
1217   }
1218   Type *curType;
1219   StructType::element_iterator eib;
1220   StructType::element_iterator eie;
1221   for (eib = ST->element_begin(), eie = ST->element_end(); eib != eie; ++eib) {
1222     curType = *eib;
1223     size += getTypeSize(curType, dereferencePtr);
1224   }
1225   return size;
1226 }
1227
1228 size_t AMDGPUPeepholeOpt::getTypeSize(IntegerType * const IT,
1229     bool dereferencePtr) {
1230   return IT ? (IT->getBitWidth() >> 3) : 0;
1231 }
1232
1233 size_t AMDGPUPeepholeOpt::getTypeSize(FunctionType * const FT,
1234     bool dereferencePtr) {
1235     assert(0 && "Should not be able to calculate the size of an function type");
1236     return 0;
1237 }
1238
1239 size_t AMDGPUPeepholeOpt::getTypeSize(ArrayType * const AT,
1240     bool dereferencePtr) {
1241   return (size_t)(AT ? (getTypeSize(AT->getElementType(),
1242                                     dereferencePtr) * AT->getNumElements())
1243                      : 0);
1244 }
1245
1246 size_t AMDGPUPeepholeOpt::getTypeSize(VectorType * const VT,
1247     bool dereferencePtr) {
1248   return VT ? (VT->getBitWidth() >> 3) : 0;
1249 }
1250
1251 size_t AMDGPUPeepholeOpt::getTypeSize(PointerType * const PT,
1252     bool dereferencePtr) {
1253   if (!PT) {
1254     return 0;
1255   }
1256   Type *CT = PT->getElementType();
1257   if (CT->getTypeID() == Type::StructTyID &&
1258       PT->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
1259     return getTypeSize(dyn_cast<StructType>(CT));
1260   } else if (dereferencePtr) {
1261     size_t size = 0;
1262     for (size_t x = 0, y = PT->getNumContainedTypes(); x < y; ++x) {
1263       size += getTypeSize(PT->getContainedType(x), dereferencePtr);
1264     }
1265     return size;
1266   } else {
1267     return 4;
1268   }
1269 }
1270
1271 size_t AMDGPUPeepholeOpt::getTypeSize(OpaqueType * const OT,
1272     bool dereferencePtr) {
1273   //assert(0 && "Should not be able to calculate the size of an opaque type");
1274   return 4;
1275 }