src/gallium/drivers/radeon/AMDILPeepholeOptimizer.cpp

   1 //===-- AMDILPeepholeOptimizer.cpp - AMDIL Peephole optimizations ---------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //==-----------------------------------------------------------------------===//
   9
  10 #include "AMDILAlgorithms.tpp"
  11 #include "AMDILDevices.h"
  12 #include "AMDGPUInstrInfo.h"
  13 #include "llvm/ADT/Statistic.h"
  14 #include "llvm/ADT/StringExtras.h"
  15 #include "llvm/ADT/StringRef.h"
  16 #include "llvm/ADT/Twine.h"
  17 #include "llvm/Constants.h"
  18 #include "llvm/CodeGen/MachineFunction.h"
  19 #include "llvm/CodeGen/MachineFunctionAnalysis.h"
  20 #include "llvm/Function.h"
  21 #include "llvm/Instructions.h"
  22 #include "llvm/Module.h"
  23 #include "llvm/Support/Debug.h"
  24 #include "llvm/Support/MathExtras.h"
  25
  26 #include <sstream>
  27
  28 #if 0
  29 STATISTIC(PointerAssignments, "Number of dynamic pointer "
  30     "assigments discovered");
  31 STATISTIC(PointerSubtract, "Number of pointer subtractions discovered");
  32 #endif
  33
  34 using namespace llvm;
  35 // The Peephole optimization pass is used to do simple last minute optimizations
  36 // that are required for correct code or to remove redundant functions
  37 namespace {
  38
  39 class OpaqueType;
  40
  41 class LLVM_LIBRARY_VISIBILITY AMDILPeepholeOpt : public FunctionPass {
  42 public:
  43   TargetMachine &TM;
  44   static char ID;
  45   AMDILPeepholeOpt(TargetMachine &tm AMDIL_OPT_LEVEL_DECL);
  46   ~AMDILPeepholeOpt();
  47   const char *getPassName() const;
  48   bool runOnFunction(Function &F);
  49   bool doInitialization(Module &M);
  50   bool doFinalization(Module &M);
  51   void getAnalysisUsage(AnalysisUsage &AU) const;
  52 protected:
  53 private:
  54   // Function to initiate all of the instruction level optimizations.
  55   bool instLevelOptimizations(BasicBlock::iterator *inst);
  56   // Quick check to see if we need to dump all of the pointers into the
  57   // arena. If this is correct, then we set all pointers to exist in arena. This
  58   // is a workaround for aliasing of pointers in a struct/union.
  59   bool dumpAllIntoArena(Function &F);
  60   // Because I don't want to invalidate any pointers while in the
  61   // safeNestedForEachFunction. I push atomic conversions to a vector and handle
  62   // it later. This function does the conversions if required.
  63   void doAtomicConversionIfNeeded(Function &F);
  64   // Because __amdil_is_constant cannot be properly evaluated if
  65   // optimizations are disabled, the call's are placed in a vector
  66   // and evaluated after the __amdil_image* functions are evaluated
  67   // which should allow the __amdil_is_constant function to be
  68   // evaluated correctly.
  69   void doIsConstCallConversionIfNeeded();
  70   bool mChanged;
  71   bool mDebug;
  72   bool mConvertAtomics;
  73   CodeGenOpt::Level optLevel;
  74   // Run a series of tests to see if we can optimize a CALL instruction.
  75   bool optimizeCallInst(BasicBlock::iterator *bbb);
  76   // A peephole optimization to optimize bit extract sequences.
  77   bool optimizeBitExtract(Instruction *inst);
  78   // A peephole optimization to optimize bit insert sequences.
  79   bool optimizeBitInsert(Instruction *inst);
  80   bool setupBitInsert(Instruction *base,
  81                       Instruction *&src,
  82                       Constant *&mask,
  83                       Constant *&shift);
  84   // Expand the bit field insert instruction on versions of OpenCL that
  85   // don't support it.
  86   bool expandBFI(CallInst *CI);
  87   // Expand the bit field mask instruction on version of OpenCL that
  88   // don't support it.
  89   bool expandBFM(CallInst *CI);
  90   // On 7XX and 8XX operations, we do not have 24 bit signed operations. So in
  91   // this case we need to expand them. These functions check for 24bit functions
  92   // and then expand.
  93   bool isSigned24BitOps(CallInst *CI);
  94   void expandSigned24BitOps(CallInst *CI);
  95   // One optimization that can occur is that if the required workgroup size is
  96   // specified then the result of get_local_size is known at compile time and
  97   // can be returned accordingly.
  98   bool isRWGLocalOpt(CallInst *CI);
  99   // On northern island cards, the division is slightly less accurate than on
 100   // previous generations, so we need to utilize a more accurate division. So we
 101   // can translate the accurate divide to a normal divide on all other cards.
 102   bool convertAccurateDivide(CallInst *CI);
 103   void expandAccurateDivide(CallInst *CI);
 104   // If the alignment is set incorrectly, it can produce really inefficient
 105   // code. This checks for this scenario and fixes it if possible.
 106   bool correctMisalignedMemOp(Instruction *inst);
 107
 108   // If we are in no opt mode, then we need to make sure that
 109   // local samplers are properly propagated as constant propagation
 110   // doesn't occur and we need to know the value of kernel defined
 111   // samplers at compile time.
 112   bool propagateSamplerInst(CallInst *CI);
 113
 114   // Helper functions
 115
 116   // Group of functions that recursively calculate the size of a structure based
 117   // on it's sub-types.
 118   size_t getTypeSize(Type * const T, bool dereferencePtr = false);
 119   size_t getTypeSize(StructType * const ST, bool dereferencePtr = false);
 120   size_t getTypeSize(IntegerType * const IT, bool dereferencePtr = false);
 121   size_t getTypeSize(FunctionType * const FT,bool dereferencePtr = false);
 122   size_t getTypeSize(ArrayType * const AT, bool dereferencePtr = false);
 123   size_t getTypeSize(VectorType * const VT, bool dereferencePtr = false);
 124   size_t getTypeSize(PointerType * const PT, bool dereferencePtr = false);
 125   size_t getTypeSize(OpaqueType * const OT, bool dereferencePtr = false);
 126
 127   LLVMContext *mCTX;
 128   Function *mF;
 129   const AMDILSubtarget *mSTM;
 130   SmallVector< std::pair<CallInst *, Function *>, 16> atomicFuncs;
 131   SmallVector<CallInst *, 16> isConstVec;
 132 }; // class AMDILPeepholeOpt
 133   char AMDILPeepholeOpt::ID = 0;
 134 } // anonymous namespace
 135
 136 namespace llvm {
 137   FunctionPass *
 138   createAMDILPeepholeOpt(TargetMachine &tm AMDIL_OPT_LEVEL_DECL)
 139   {
 140     return new AMDILPeepholeOpt(tm AMDIL_OPT_LEVEL_VAR);
 141   }
 142 } // llvm namespace
 143
 144 AMDILPeepholeOpt::AMDILPeepholeOpt(TargetMachine &tm AMDIL_OPT_LEVEL_DECL)
 145   : FunctionPass(ID), TM(tm)
 146 {
 147   mDebug = false;
 148   optLevel = TM.getOptLevel();
 149
 150 }
 151
 152 AMDILPeepholeOpt::~AMDILPeepholeOpt()
 153 {
 154 }
 155
 156 const char *
 157 AMDILPeepholeOpt::getPassName() const
 158 {
 159   return "AMDIL PeepHole Optimization Pass";
 160 }
 161
 162 bool
 163 containsPointerType(Type *Ty)
 164 {
 165   if (!Ty) {
 166     return false;
 167   }
 168   switch(Ty->getTypeID()) {
 169   default:
 170     return false;
 171   case Type::StructTyID: {
 172     const StructType *ST = dyn_cast<StructType>(Ty);
 173     for (StructType::element_iterator stb = ST->element_begin(),
 174            ste = ST->element_end(); stb != ste; ++stb) {
 175       if (!containsPointerType(*stb)) {
 176         continue;
 177       }
 178       return true;
 179     }
 180     break;
 181   }
 182   case Type::VectorTyID:
 183   case Type::ArrayTyID:
 184     return containsPointerType(dyn_cast<SequentialType>(Ty)->getElementType());
 185   case Type::PointerTyID:
 186     return true;
 187   };
 188   return false;
 189 }
 190
 191 bool
 192 AMDILPeepholeOpt::dumpAllIntoArena(Function &F)
 193 {
 194   bool dumpAll = false;
 195   for (Function::const_arg_iterator cab = F.arg_begin(),
 196        cae = F.arg_end(); cab != cae; ++cab) {
 197     const Argument *arg = cab;
 198     const PointerType *PT = dyn_cast<PointerType>(arg->getType());
 199     if (!PT) {
 200       continue;
 201     }
 202     Type *DereferencedType = PT->getElementType();
 203     if (!dyn_cast<StructType>(DereferencedType)
 204         ) {
 205       continue;
 206     }
 207     if (!containsPointerType(DereferencedType)) {
 208       continue;
 209     }
 210     // FIXME: Because a pointer inside of a struct/union may be aliased to
 211     // another pointer we need to take the conservative approach and place all
 212     // pointers into the arena until more advanced detection is implemented.
 213     dumpAll = true;
 214   }
 215   return dumpAll;
 216 }
 217 void
 218 AMDILPeepholeOpt::doIsConstCallConversionIfNeeded()
 219 {
 220   if (isConstVec.empty()) {
 221     return;
 222   }
 223   for (unsigned x = 0, y = isConstVec.size(); x < y; ++x) {
 224     CallInst *CI = isConstVec[x];
 225     Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
 226     Type *aType = Type::getInt32Ty(*mCTX);
 227     Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
 228       : ConstantInt::get(aType, 0);
 229     CI->replaceAllUsesWith(Val);
 230     CI->eraseFromParent();
 231   }
 232   isConstVec.clear();
 233 }
 234 void
 235 AMDILPeepholeOpt::doAtomicConversionIfNeeded(Function &F)
 236 {
 237   // Don't do anything if we don't have any atomic operations.
 238   if (atomicFuncs.empty()) {
 239     return;
 240   }
 241   // Change the function name for the atomic if it is required
 242   uint32_t size = atomicFuncs.size();
 243   for (uint32_t x = 0; x < size; ++x) {
 244     atomicFuncs[x].first->setOperand(
 245         atomicFuncs[x].first->getNumOperands()-1,
 246         atomicFuncs[x].second);
 247
 248   }
 249   mChanged = true;
 250   if (mConvertAtomics) {
 251     return;
 252   }
 253 }
 254
 255 bool
 256 AMDILPeepholeOpt::runOnFunction(Function &MF)
 257 {
 258   mChanged = false;
 259   mF = &MF;
 260   mSTM = &TM.getSubtarget<AMDILSubtarget>();
 261   if (mDebug) {
 262     MF.dump();
 263   }
 264   mCTX = &MF.getType()->getContext();
 265   mConvertAtomics = true;
 266   safeNestedForEach(MF.begin(), MF.end(), MF.begin()->begin(),
 267      std::bind1st(std::mem_fun(&AMDILPeepholeOpt::instLevelOptimizations),
 268                   this));
 269
 270   doAtomicConversionIfNeeded(MF);
 271   doIsConstCallConversionIfNeeded();
 272
 273   if (mDebug) {
 274     MF.dump();
 275   }
 276   return mChanged;
 277 }
 278
 279 bool
 280 AMDILPeepholeOpt::optimizeCallInst(BasicBlock::iterator *bbb)
 281 {
 282   Instruction *inst = (*bbb);
 283   CallInst *CI = dyn_cast<CallInst>(inst);
 284   if (!CI) {
 285     return false;
 286   }
 287   if (isSigned24BitOps(CI)) {
 288     expandSigned24BitOps(CI);
 289     ++(*bbb);
 290     CI->eraseFromParent();
 291     return true;
 292   }
 293   if (propagateSamplerInst(CI)) {
 294     return false;
 295   }
 296   if (expandBFI(CI) || expandBFM(CI)) {
 297     ++(*bbb);
 298     CI->eraseFromParent();
 299     return true;
 300   }
 301   if (convertAccurateDivide(CI)) {
 302     expandAccurateDivide(CI);
 303     ++(*bbb);
 304     CI->eraseFromParent();
 305     return true;
 306   }
 307
 308   StringRef calleeName = CI->getOperand(CI->getNumOperands()-1)->getName();
 309   if (calleeName.startswith("__amdil_is_constant")) {
 310     // If we do not have optimizations, then this
 311     // cannot be properly evaluated, so we add the
 312     // call instruction to a vector and process
 313     // them at the end of processing after the
 314     // samplers have been correctly handled.
 315     if (optLevel == CodeGenOpt::None) {
 316       isConstVec.push_back(CI);
 317       return false;
 318     } else {
 319       Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
 320       Type *aType = Type::getInt32Ty(*mCTX);
 321       Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
 322         : ConstantInt::get(aType, 0);
 323       CI->replaceAllUsesWith(Val);
 324       ++(*bbb);
 325       CI->eraseFromParent();
 326       return true;
 327     }
 328   }
 329
 330   if (calleeName.equals("__amdil_is_asic_id_i32")) {
 331     ConstantInt *CV = dyn_cast<ConstantInt>(CI->getOperand(0));
 332     Type *aType = Type::getInt32Ty(*mCTX);
 333     Value *Val = CV;
 334     if (Val) {
 335       Val = ConstantInt::get(aType,
 336           mSTM->device()->getDeviceFlag() & CV->getZExtValue());
 337     } else {
 338       Val = ConstantInt::get(aType, 0);
 339     }
 340     CI->replaceAllUsesWith(Val);
 341     ++(*bbb);
 342     CI->eraseFromParent();
 343     return true;
 344   }
 345   Function *F = dyn_cast<Function>(CI->getOperand(CI->getNumOperands()-1));
 346   if (!F) {
 347     return false;
 348   }
 349   if (F->getName().startswith("__atom") && !CI->getNumUses()
 350       && F->getName().find("_xchg") == StringRef::npos) {
 351     std::string buffer(F->getName().str() + "_noret");
 352     F = dyn_cast<Function>(
 353           F->getParent()->getOrInsertFunction(buffer, F->getFunctionType()));
 354     atomicFuncs.push_back(std::make_pair <CallInst*, Function*>(CI, F));
 355   }
 356
 357   if (!mSTM->device()->isSupported(AMDILDeviceInfo::ArenaSegment)
 358       && !mSTM->device()->isSupported(AMDILDeviceInfo::MultiUAV)) {
 359     return false;
 360   }
 361   if (!mConvertAtomics) {
 362     return false;
 363   }
 364   StringRef name = F->getName();
 365   if (name.startswith("__atom") && name.find("_g") != StringRef::npos) {
 366     mConvertAtomics = false;
 367   }
 368   return false;
 369 }
 370
 371 bool
 372 AMDILPeepholeOpt::setupBitInsert(Instruction *base,
 373     Instruction *&src,
 374     Constant *&mask,
 375     Constant *&shift)
 376 {
 377   if (!base) {
 378     if (mDebug) {
 379       dbgs() << "Null pointer passed into function.\n";
 380     }
 381     return false;
 382   }
 383   bool andOp = false;
 384   if (base->getOpcode() == Instruction::Shl) {
 385     shift = dyn_cast<Constant>(base->getOperand(1));
 386   } else if (base->getOpcode() == Instruction::And) {
 387     mask = dyn_cast<Constant>(base->getOperand(1));
 388     andOp = true;
 389   } else {
 390     if (mDebug) {
 391       dbgs() << "Failed setup with no Shl or And instruction on base opcode!\n";
 392     }
 393     // If the base is neither a Shl or a And, we don't fit any of the patterns above.
 394     return false;
 395   }
 396   src = dyn_cast<Instruction>(base->getOperand(0));
 397   if (!src) {
 398     if (mDebug) {
 399       dbgs() << "Failed setup since the base operand is not an instruction!\n";
 400     }
 401     return false;
 402   }
 403   // If we find an 'and' operation, then we don't need to
 404   // find the next operation as we already know the
 405   // bits that are valid at this point.
 406   if (andOp) {
 407     return true;
 408   }
 409   if (src->getOpcode() == Instruction::Shl && !shift) {
 410     shift = dyn_cast<Constant>(src->getOperand(1));
 411     src = dyn_cast<Instruction>(src->getOperand(0));
 412   } else if (src->getOpcode() == Instruction::And && !mask) {
 413     mask = dyn_cast<Constant>(src->getOperand(1));
 414   }
 415   if (!mask && !shift) {
 416     if (mDebug) {
 417       dbgs() << "Failed setup since both mask and shift are NULL!\n";
 418     }
 419     // Did not find a constant mask or a shift.
 420     return false;
 421   }
 422   return true;
 423 }
 424 bool
 425 AMDILPeepholeOpt::optimizeBitInsert(Instruction *inst)
 426 {
 427   if (!inst) {
 428     return false;
 429   }
 430   if (!inst->isBinaryOp()) {
 431     return false;
 432   }
 433   if (inst->getOpcode() != Instruction::Or) {
 434     return false;
 435   }
 436   if (optLevel == CodeGenOpt::None) {
 437     return false;
 438   }
 439   // We want to do an optimization on a sequence of ops that in the end equals a
 440   // single ISA instruction.
 441   // The base pattern for this optimization is - ((A & B) << C) | ((D & E) << F)
 442   // Some simplified versions of this pattern are as follows:
 443   // (A & B) | (D & E) when B & E == 0 && C == 0 && F == 0
 444   // ((A & B) << C) | (D & E) when B ^ E == 0 && (1 << C) >= E
 445   // (A & B) | ((D & E) << F) when B ^ E == 0 && (1 << F) >= B
 446   // (A & B) | (D << F) when (1 << F) >= B
 447   // (A << C) | (D & E) when (1 << C) >= E
 448   if (mSTM->device()->getGeneration() == AMDILDeviceInfo::HD4XXX) {
 449     // The HD4XXX hardware doesn't support the ubit_insert instruction.
 450     return false;
 451   }
 452   Type *aType = inst->getType();
 453   bool isVector = aType->isVectorTy();
 454   int numEle = 1;
 455   // This optimization only works on 32bit integers.
 456   if (aType->getScalarType()
 457       != Type::getInt32Ty(inst->getContext())) {
 458     return false;
 459   }
 460   if (isVector) {
 461     const VectorType *VT = dyn_cast<VectorType>(aType);
 462     numEle = VT->getNumElements();
 463     // We currently cannot support more than 4 elements in a intrinsic and we
 464     // cannot support Vec3 types.
 465     if (numEle > 4 || numEle == 3) {
 466       return false;
 467     }
 468   }
 469   // TODO: Handle vectors.
 470   if (isVector) {
 471     if (mDebug) {
 472       dbgs() << "!!! Vectors are not supported yet!\n";
 473     }
 474     return false;
 475   }
 476   Instruction *LHSSrc = NULL, *RHSSrc = NULL;
 477   Constant *LHSMask = NULL, *RHSMask = NULL;
 478   Constant *LHSShift = NULL, *RHSShift = NULL;
 479   Instruction *LHS = dyn_cast<Instruction>(inst->getOperand(0));
 480   Instruction *RHS = dyn_cast<Instruction>(inst->getOperand(1));
 481   if (!setupBitInsert(LHS, LHSSrc, LHSMask, LHSShift)) {
 482     if (mDebug) {
 483       dbgs() << "Found an OR Operation that failed setup!\n";
 484       inst->dump();
 485       if (LHS) { LHS->dump(); }
 486       if (LHSSrc) { LHSSrc->dump(); }
 487       if (LHSMask) { LHSMask->dump(); }
 488       if (LHSShift) { LHSShift->dump(); }
 489     }
 490     // There was an issue with the setup for BitInsert.
 491     return false;
 492   }
 493   if (!setupBitInsert(RHS, RHSSrc, RHSMask, RHSShift)) {
 494     if (mDebug) {
 495       dbgs() << "Found an OR Operation that failed setup!\n";
 496       inst->dump();
 497       if (RHS) { RHS->dump(); }
 498       if (RHSSrc) { RHSSrc->dump(); }
 499       if (RHSMask) { RHSMask->dump(); }
 500       if (RHSShift) { RHSShift->dump(); }
 501     }
 502     // There was an issue with the setup for BitInsert.
 503     return false;
 504   }
 505   if (mDebug) {
 506     dbgs() << "Found an OR operation that can possible be optimized to ubit insert!\n";
 507     dbgs() << "Op:        "; inst->dump();
 508     dbgs() << "LHS:       "; if (LHS) { LHS->dump(); } else { dbgs() << "(None)\n"; }
 509     dbgs() << "LHS Src:   "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(None)\n"; }
 510     dbgs() << "LHS Mask:  "; if (LHSMask) { LHSMask->dump(); } else { dbgs() << "(None)\n"; }
 511     dbgs() << "LHS Shift: "; if (LHSShift) { LHSShift->dump(); } else { dbgs() << "(None)\n"; }
 512     dbgs() << "RHS:       "; if (RHS) { RHS->dump(); } else { dbgs() << "(None)\n"; }
 513     dbgs() << "RHS Src:   "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(None)\n"; }
 514     dbgs() << "RHS Mask:  "; if (RHSMask) { RHSMask->dump(); } else { dbgs() << "(None)\n"; }
 515     dbgs() << "RHS Shift: "; if (RHSShift) { RHSShift->dump(); } else { dbgs() << "(None)\n"; }
 516   }
 517   Constant *offset = NULL;
 518   Constant *width = NULL;
 519   int32_t lhsMaskVal = 0, rhsMaskVal = 0;
 520   int32_t lhsShiftVal = 0, rhsShiftVal = 0;
 521   int32_t lhsMaskWidth = 0, rhsMaskWidth = 0;
 522   int32_t lhsMaskOffset = 0, rhsMaskOffset = 0;
 523   lhsMaskVal = (int32_t)(LHSMask
 524       ? dyn_cast<ConstantInt>(LHSMask)->getZExtValue() : 0);
 525   rhsMaskVal = (int32_t)(RHSMask
 526       ? dyn_cast<ConstantInt>(RHSMask)->getZExtValue() : 0);
 527   lhsShiftVal = (int32_t)(LHSShift
 528       ? dyn_cast<ConstantInt>(LHSShift)->getZExtValue() : 0);
 529   rhsShiftVal = (int32_t)(RHSShift
 530       ? dyn_cast<ConstantInt>(RHSShift)->getZExtValue() : 0);
 531   lhsMaskWidth = lhsMaskVal ? CountPopulation_32(lhsMaskVal) : 32 - lhsShiftVal;
 532   rhsMaskWidth = rhsMaskVal ? CountPopulation_32(rhsMaskVal) : 32 - rhsShiftVal;
 533   lhsMaskOffset = lhsMaskVal ? CountTrailingZeros_32(lhsMaskVal) : lhsShiftVal;
 534   rhsMaskOffset = rhsMaskVal ? CountTrailingZeros_32(rhsMaskVal) : rhsShiftVal;
 535   // TODO: Handle the case of A & B | D & ~B(i.e. inverted masks).
 536   if (mDebug) {
 537       dbgs() << "Found pattern: \'((A" << (LHSMask ? " & B)" : ")");
 538       dbgs() << (LHSShift ? " << C)" : ")") << " | ((D" ;
 539       dbgs() << (RHSMask ? " & E)" : ")");
 540       dbgs() << (RHSShift ? " << F)\'\n" : ")\'\n");
 541       dbgs() << "A = LHSSrc\t\tD = RHSSrc \n";
 542       dbgs() << "B = " << lhsMaskVal << "\t\tE = " << rhsMaskVal << "\n";
 543       dbgs() << "C = " << lhsShiftVal << "\t\tF = " << rhsShiftVal << "\n";
 544       dbgs() << "width(B) = " << lhsMaskWidth;
 545       dbgs() << "\twidth(E) = " << rhsMaskWidth << "\n";
 546       dbgs() << "offset(B) = " << lhsMaskOffset;
 547       dbgs() << "\toffset(E) = " << rhsMaskOffset << "\n";
 548       dbgs() << "Constraints: \n";
 549       dbgs() << "\t(1) B ^ E == 0\n";
 550       dbgs() << "\t(2-LHS) B is a mask\n";
 551       dbgs() << "\t(2-LHS) E is a mask\n";
 552       dbgs() << "\t(3-LHS) (offset(B)) >= (width(E) + offset(E))\n";
 553       dbgs() << "\t(3-RHS) (offset(E)) >= (width(B) + offset(B))\n";
 554   }
 555   if ((lhsMaskVal || rhsMaskVal) && !(lhsMaskVal ^ rhsMaskVal)) {
 556     if (mDebug) {
 557       dbgs() << lhsMaskVal << " ^ " << rhsMaskVal;
 558       dbgs() << " = " << (lhsMaskVal ^ rhsMaskVal) << "\n";
 559       dbgs() << "Failed constraint 1!\n";
 560     }
 561     return false;
 562   }
 563   if (mDebug) {
 564     dbgs() << "LHS = " << lhsMaskOffset << "";
 565     dbgs() << " >= (" << rhsMaskWidth << " + " << rhsMaskOffset << ") = ";
 566     dbgs() << (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset));
 567     dbgs() << "\nRHS = " << rhsMaskOffset << "";
 568     dbgs() << " >= (" << lhsMaskWidth << " + " << lhsMaskOffset << ") = ";
 569     dbgs() << (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset));
 570     dbgs() << "\n";
 571   }
 572   if (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset)) {
 573     offset = ConstantInt::get(aType, lhsMaskOffset, false);
 574     width = ConstantInt::get(aType, lhsMaskWidth, false);
 575     RHSSrc = RHS;
 576     if (!isMask_32(lhsMaskVal) && !isShiftedMask_32(lhsMaskVal)) {
 577       if (mDebug) {
 578         dbgs() << "Value is not a Mask: " << lhsMaskVal << "\n";
 579         dbgs() << "Failed constraint 2!\n";
 580       }
 581       return false;
 582     }
 583     if (!LHSShift) {
 584       LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
 585           "MaskShr", LHS);
 586     } else if (lhsShiftVal != lhsMaskOffset) {
 587       LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
 588           "MaskShr", LHS);
 589     }
 590     if (mDebug) {
 591       dbgs() << "Optimizing LHS!\n";
 592     }
 593   } else if (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset)) {
 594     offset = ConstantInt::get(aType, rhsMaskOffset, false);
 595     width = ConstantInt::get(aType, rhsMaskWidth, false);
 596     LHSSrc = RHSSrc;
 597     RHSSrc = LHS;
 598     if (!isMask_32(rhsMaskVal) && !isShiftedMask_32(rhsMaskVal)) {
 599       if (mDebug) {
 600         dbgs() << "Non-Mask: " << rhsMaskVal << "\n";
 601         dbgs() << "Failed constraint 2!\n";
 602       }
 603       return false;
 604     }
 605     if (!RHSShift) {
 606       LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
 607           "MaskShr", RHS);
 608     } else if (rhsShiftVal != rhsMaskOffset) {
 609       LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
 610           "MaskShr", RHS);
 611     }
 612     if (mDebug) {
 613       dbgs() << "Optimizing RHS!\n";
 614     }
 615   } else {
 616     if (mDebug) {
 617       dbgs() << "Failed constraint 3!\n";
 618     }
 619     return false;
 620   }
 621   if (mDebug) {
 622     dbgs() << "Width:  "; if (width) { width->dump(); } else { dbgs() << "(0)\n"; }
 623     dbgs() << "Offset: "; if (offset) { offset->dump(); } else { dbgs() << "(0)\n"; }
 624     dbgs() << "LHSSrc: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(0)\n"; }
 625     dbgs() << "RHSSrc: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(0)\n"; }
 626   }
 627   if (!offset || !width) {
 628     if (mDebug) {
 629       dbgs() << "Either width or offset are NULL, failed detection!\n";
 630     }
 631     return false;
 632   }
 633   // Lets create the function signature.
 634   std::vector<Type *> callTypes;
 635   callTypes.push_back(aType);
 636   callTypes.push_back(aType);
 637   callTypes.push_back(aType);
 638   callTypes.push_back(aType);
 639   FunctionType *funcType = FunctionType::get(aType, callTypes, false);
 640   std::string name = "__amdil_ubit_insert";
 641   if (isVector) { name += "_v" + itostr(numEle) + "u32"; } else { name += "_u32"; }
 642   Function *Func =
 643     dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
 644         getOrInsertFunction(llvm::StringRef(name), funcType));
 645   Value *Operands[4] = {
 646     width,
 647     offset,
 648     LHSSrc,
 649     RHSSrc
 650   };
 651   CallInst *CI = CallInst::Create(Func, Operands, "BitInsertOpt");
 652   if (mDebug) {
 653     dbgs() << "Old Inst: ";
 654     inst->dump();
 655     dbgs() << "New Inst: ";
 656     CI->dump();
 657     dbgs() << "\n\n";
 658   }
 659   CI->insertBefore(inst);
 660   inst->replaceAllUsesWith(CI);
 661   return true;
 662 }
 663
 664 bool
 665 AMDILPeepholeOpt::optimizeBitExtract(Instruction *inst)
 666 {
 667   if (!inst) {
 668     return false;
 669   }
 670   if (!inst->isBinaryOp()) {
 671     return false;
 672   }
 673   if (inst->getOpcode() != Instruction::And) {
 674     return false;
 675   }
 676   if (optLevel == CodeGenOpt::None) {
 677     return false;
 678   }
 679   // We want to do some simple optimizations on Shift right/And patterns. The
 680   // basic optimization is to turn (A >> B) & C where A is a 32bit type, B is a
 681   // value smaller than 32 and C is a mask. If C is a constant value, then the
 682   // following transformation can occur. For signed integers, it turns into the
 683   // function call dst = __amdil_ibit_extract(log2(C), B, A) For unsigned
 684   // integers, it turns into the function call dst =
 685   // __amdil_ubit_extract(log2(C), B, A) The function __amdil_[u|i]bit_extract
 686   // can be found in Section 7.9 of the ATI IL spec of the stream SDK for
 687   // Evergreen hardware.
 688   if (mSTM->device()->getGeneration() == AMDILDeviceInfo::HD4XXX) {
 689     // This does not work on HD4XXX hardware.
 690     return false;
 691   }
 692   Type *aType = inst->getType();
 693   bool isVector = aType->isVectorTy();
 694
 695   // XXX Support vector types
 696   if (isVector) {
 697     return false;
 698   }
 699   int numEle = 1;
 700   // This only works on 32bit integers
 701   if (aType->getScalarType()
 702       != Type::getInt32Ty(inst->getContext())) {
 703     return false;
 704   }
 705   if (isVector) {
 706     const VectorType *VT = dyn_cast<VectorType>(aType);
 707     numEle = VT->getNumElements();
 708     // We currently cannot support more than 4 elements in a intrinsic and we
 709     // cannot support Vec3 types.
 710     if (numEle > 4 || numEle == 3) {
 711       return false;
 712     }
 713   }
 714   BinaryOperator *ShiftInst = dyn_cast<BinaryOperator>(inst->getOperand(0));
 715   // If the first operand is not a shift instruction, then we can return as it
 716   // doesn't match this pattern.
 717   if (!ShiftInst || !ShiftInst->isShift()) {
 718     return false;
 719   }
 720   // If we are a shift left, then we need don't match this pattern.
 721   if (ShiftInst->getOpcode() == Instruction::Shl) {
 722     return false;
 723   }
 724   bool isSigned = ShiftInst->isArithmeticShift();
 725   Constant *AndMask = dyn_cast<Constant>(inst->getOperand(1));
 726   Constant *ShrVal = dyn_cast<Constant>(ShiftInst->getOperand(1));
 727   // Lets make sure that the shift value and the and mask are constant integers.
 728   if (!AndMask || !ShrVal) {
 729     return false;
 730   }
 731   Constant *newMaskConst;
 732   Constant *shiftValConst;
 733   if (isVector) {
 734     // Handle the vector case
 735     std::vector<Constant *> maskVals;
 736     std::vector<Constant *> shiftVals;
 737     ConstantVector *AndMaskVec = dyn_cast<ConstantVector>(AndMask);
 738     ConstantVector *ShrValVec = dyn_cast<ConstantVector>(ShrVal);
 739     Type *scalarType = AndMaskVec->getType()->getScalarType();
 740     assert(AndMaskVec->getNumOperands() ==
 741            ShrValVec->getNumOperands() && "cannot have a "
 742            "combination where the number of elements to a "
 743            "shift and an and are different!");
 744     for (size_t x = 0, y = AndMaskVec->getNumOperands(); x < y; ++x) {
 745       ConstantInt *AndCI = dyn_cast<ConstantInt>(AndMaskVec->getOperand(x));
 746       ConstantInt *ShiftIC = dyn_cast<ConstantInt>(ShrValVec->getOperand(x));
 747       if (!AndCI || !ShiftIC) {
 748         return false;
 749       }
 750       uint32_t maskVal = (uint32_t)AndCI->getZExtValue();
 751       if (!isMask_32(maskVal)) {
 752         return false;
 753       }
 754       maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
 755       uint32_t shiftVal = (uint32_t)ShiftIC->getZExtValue();
 756       // If the mask or shiftval is greater than the bitcount, then break out.
 757       if (maskVal >= 32 || shiftVal >= 32) {
 758         return false;
 759       }
 760       // If the mask val is greater than the the number of original bits left
 761       // then this optimization is invalid.
 762       if (maskVal > (32 - shiftVal)) {
 763         return false;
 764       }
 765       maskVals.push_back(ConstantInt::get(scalarType, maskVal, isSigned));
 766       shiftVals.push_back(ConstantInt::get(scalarType, shiftVal, isSigned));
 767     }
 768     newMaskConst = ConstantVector::get(maskVals);
 769     shiftValConst = ConstantVector::get(shiftVals);
 770   } else {
 771     // Handle the scalar case
 772     uint32_t maskVal = (uint32_t)dyn_cast<ConstantInt>(AndMask)->getZExtValue();
 773     // This must be a mask value where all lower bits are set to 1 and then any
 774     // bit higher is set to 0.
 775     if (!isMask_32(maskVal)) {
 776       return false;
 777     }
 778     maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
 779     // Count the number of bits set in the mask, this is the width of the
 780     // resulting bit set that is extracted from the source value.
 781     uint32_t shiftVal = (uint32_t)dyn_cast<ConstantInt>(ShrVal)->getZExtValue();
 782     // If the mask or shift val is greater than the bitcount, then break out.
 783     if (maskVal >= 32 || shiftVal >= 32) {
 784       return false;
 785     }
 786     // If the mask val is greater than the the number of original bits left then
 787     // this optimization is invalid.
 788     if (maskVal > (32 - shiftVal)) {
 789       return false;
 790     }
 791     newMaskConst = ConstantInt::get(aType, maskVal, isSigned);
 792     shiftValConst = ConstantInt::get(aType, shiftVal, isSigned);
 793   }
 794   // Lets create the function signature.
 795   std::vector<Type *> callTypes;
 796   callTypes.push_back(aType);
 797   callTypes.push_back(aType);
 798   callTypes.push_back(aType);
 799   FunctionType *funcType = FunctionType::get(aType, callTypes, false);
 800   std::string name = "llvm.AMDIL.bit.extract.u32";
 801   if (isVector) {
 802     name += ".v" + itostr(numEle) + "i32";
 803   } else {
 804     name += ".";
 805   }
 806   // Lets create the function.
 807   Function *Func =
 808     dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
 809                        getOrInsertFunction(llvm::StringRef(name), funcType));
 810   Value *Operands[3] = {
 811     ShiftInst->getOperand(0),
 812     shiftValConst,
 813     newMaskConst
 814   };
 815   // Lets create the Call with the operands
 816   CallInst *CI = CallInst::Create(Func, Operands, "ByteExtractOpt");
 817   CI->setDoesNotAccessMemory();
 818   CI->insertBefore(inst);
 819   inst->replaceAllUsesWith(CI);
 820   return true;
 821 }
 822
 823 bool
 824 AMDILPeepholeOpt::expandBFI(CallInst *CI)
 825 {
 826   if (!CI || mSTM->calVersion() <= CAL_VERSION_SC_150) {
 827     return false;
 828   }
 829   Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
 830   if (!LHS->getName().startswith("__amdil_bfi")) {
 831     return false;
 832   }
 833   Type* type = CI->getOperand(0)->getType();
 834   Constant *negOneConst = NULL;
 835   if (type->isVectorTy()) {
 836     std::vector<Constant *> negOneVals;
 837     negOneConst = ConstantInt::get(CI->getContext(),
 838         APInt(32, StringRef("-1"), 10));
 839     for (size_t x = 0,
 840         y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
 841       negOneVals.push_back(negOneConst);
 842     }
 843     negOneConst = ConstantVector::get(negOneVals);
 844   } else {
 845     negOneConst = ConstantInt::get(CI->getContext(),
 846         APInt(32, StringRef("-1"), 10));
 847   }
 848   // __amdil_bfi => (A & B) | (~A & C)
 849   BinaryOperator *lhs =
 850     BinaryOperator::Create(Instruction::And, CI->getOperand(0),
 851         CI->getOperand(1), "bfi_and", CI);
 852   BinaryOperator *rhs =
 853     BinaryOperator::Create(Instruction::Xor, CI->getOperand(0), negOneConst,
 854         "bfi_not", CI);
 855   rhs = BinaryOperator::Create(Instruction::And, rhs, CI->getOperand(2),
 856       "bfi_and", CI);
 857   lhs = BinaryOperator::Create(Instruction::Or, lhs, rhs, "bfi_or", CI);
 858   CI->replaceAllUsesWith(lhs);
 859   return true;
 860 }
 861
 862 bool
 863 AMDILPeepholeOpt::expandBFM(CallInst *CI)
 864 {
 865   if (!CI || mSTM->calVersion() <= CAL_VERSION_SC_150) {
 866     return false;
 867   }
 868   Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
 869   if (!LHS->getName().startswith("__amdil_bfm")) {
 870     return false;
 871   }
 872   // __amdil_bfm => ((1 << (src0 & 0x1F)) - 1) << (src1 & 0x1f)
 873   Constant *newMaskConst = NULL;
 874   Constant *newShiftConst = NULL;
 875   Type* type = CI->getOperand(0)->getType();
 876   if (type->isVectorTy()) {
 877     std::vector<Constant*> newMaskVals, newShiftVals;
 878     newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
 879     newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
 880     for (size_t x = 0,
 881         y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
 882       newMaskVals.push_back(newMaskConst);
 883       newShiftVals.push_back(newShiftConst);
 884     }
 885     newMaskConst = ConstantVector::get(newMaskVals);
 886     newShiftConst = ConstantVector::get(newShiftVals);
 887   } else {
 888     newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
 889     newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
 890   }
 891   BinaryOperator *lhs =
 892     BinaryOperator::Create(Instruction::And, CI->getOperand(0),
 893         newMaskConst, "bfm_mask", CI);
 894   lhs = BinaryOperator::Create(Instruction::Shl, newShiftConst,
 895       lhs, "bfm_shl", CI);
 896   lhs = BinaryOperator::Create(Instruction::Sub, lhs,
 897       newShiftConst, "bfm_sub", CI);
 898   BinaryOperator *rhs =
 899     BinaryOperator::Create(Instruction::And, CI->getOperand(1),
 900         newMaskConst, "bfm_mask", CI);
 901   lhs = BinaryOperator::Create(Instruction::Shl, lhs, rhs, "bfm_shl", CI);
 902   CI->replaceAllUsesWith(lhs);
 903   return true;
 904 }
 905
 906 bool
 907 AMDILPeepholeOpt::instLevelOptimizations(BasicBlock::iterator *bbb)
 908 {
 909   Instruction *inst = (*bbb);
 910   if (optimizeCallInst(bbb)) {
 911     return true;
 912   }
 913   if (optimizeBitExtract(inst)) {
 914     return false;
 915   }
 916   if (optimizeBitInsert(inst)) {
 917     return false;
 918   }
 919   if (correctMisalignedMemOp(inst)) {
 920     return false;
 921   }
 922   return false;
 923 }
 924 bool
 925 AMDILPeepholeOpt::correctMisalignedMemOp(Instruction *inst)
 926 {
 927   LoadInst *linst = dyn_cast<LoadInst>(inst);
 928   StoreInst *sinst = dyn_cast<StoreInst>(inst);
 929   unsigned alignment;
 930   Type* Ty = inst->getType();
 931   if (linst) {
 932     alignment = linst->getAlignment();
 933     Ty = inst->getType();
 934   } else if (sinst) {
 935     alignment = sinst->getAlignment();
 936     Ty = sinst->getValueOperand()->getType();
 937   } else {
 938     return false;
 939   }
 940   unsigned size = getTypeSize(Ty);
 941   if (size == alignment || size < alignment) {
 942     return false;
 943   }
 944   if (!Ty->isStructTy()) {
 945     return false;
 946   }
 947   if (alignment < 4) {
 948     if (linst) {
 949       linst->setAlignment(0);
 950       return true;
 951     } else if (sinst) {
 952       sinst->setAlignment(0);
 953       return true;
 954     }
 955   }
 956   return false;
 957 }
 958 bool
 959 AMDILPeepholeOpt::isSigned24BitOps(CallInst *CI)
 960 {
 961   if (!CI) {
 962     return false;
 963   }
 964   Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
 965   std::string namePrefix = LHS->getName().substr(0, 14);
 966   if (namePrefix != "__amdil_imad24" && namePrefix != "__amdil_imul24"
 967       && namePrefix != "__amdil__imul24_high") {
 968     return false;
 969   }
 970   if (mSTM->device()->usesHardware(AMDILDeviceInfo::Signed24BitOps)) {
 971     return false;
 972   }
 973   return true;
 974 }
 975
 976 void
 977 AMDILPeepholeOpt::expandSigned24BitOps(CallInst *CI)
 978 {
 979   assert(isSigned24BitOps(CI) && "Must be a "
 980       "signed 24 bit operation to call this function!");
 981   Value *LHS = CI->getOperand(CI->getNumOperands()-1);
 982   // On 7XX and 8XX we do not have signed 24bit, so we need to
 983   // expand it to the following:
 984   // imul24 turns into 32bit imul
 985   // imad24 turns into 32bit imad
 986   // imul24_high turns into 32bit imulhigh
 987   if (LHS->getName().substr(0, 14) == "__amdil_imad24") {
 988     Type *aType = CI->getOperand(0)->getType();
 989     bool isVector = aType->isVectorTy();
 990     int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
 991     std::vector<Type*> callTypes;
 992     callTypes.push_back(CI->getOperand(0)->getType());
 993     callTypes.push_back(CI->getOperand(1)->getType());
 994     callTypes.push_back(CI->getOperand(2)->getType());
 995     FunctionType *funcType =
 996       FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
 997     std::string name = "__amdil_imad";
 998     if (isVector) {
 999       name += "_v" + itostr(numEle) + "i32";
1000     } else {
1001       name += "_i32";
1002     }
1003     Function *Func = dyn_cast<Function>(
1004                        CI->getParent()->getParent()->getParent()->
1005                        getOrInsertFunction(llvm::StringRef(name), funcType));
1006     Value *Operands[3] = {
1007       CI->getOperand(0),
1008       CI->getOperand(1),
1009       CI->getOperand(2)
1010     };
1011     CallInst *nCI = CallInst::Create(Func, Operands, "imad24");
1012     nCI->insertBefore(CI);
1013     CI->replaceAllUsesWith(nCI);
1014   } else if (LHS->getName().substr(0, 14) == "__amdil_imul24") {
1015     BinaryOperator *mulOp =
1016       BinaryOperator::Create(Instruction::Mul, CI->getOperand(0),
1017           CI->getOperand(1), "imul24", CI);
1018     CI->replaceAllUsesWith(mulOp);
1019   } else if (LHS->getName().substr(0, 19) == "__amdil_imul24_high") {
1020     Type *aType = CI->getOperand(0)->getType();
1021
1022     bool isVector = aType->isVectorTy();
1023     int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
1024     std::vector<Type*> callTypes;
1025     callTypes.push_back(CI->getOperand(0)->getType());
1026     callTypes.push_back(CI->getOperand(1)->getType());
1027     FunctionType *funcType =
1028       FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
1029     std::string name = "__amdil_imul_high";
1030     if (isVector) {
1031       name += "_v" + itostr(numEle) + "i32";
1032     } else {
1033       name += "_i32";
1034     }
1035     Function *Func = dyn_cast<Function>(
1036                        CI->getParent()->getParent()->getParent()->
1037                        getOrInsertFunction(llvm::StringRef(name), funcType));
1038     Value *Operands[2] = {
1039       CI->getOperand(0),
1040       CI->getOperand(1)
1041     };
1042     CallInst *nCI = CallInst::Create(Func, Operands, "imul24_high");
1043     nCI->insertBefore(CI);
1044     CI->replaceAllUsesWith(nCI);
1045   }
1046 }
1047
1048 bool
1049 AMDILPeepholeOpt::isRWGLocalOpt(CallInst *CI)
1050 {
1051   return (CI != NULL
1052           && CI->getOperand(CI->getNumOperands() - 1)->getName()
1053           == "__amdil_get_local_size_int");
1054 }
1055
1056 bool
1057 AMDILPeepholeOpt::convertAccurateDivide(CallInst *CI)
1058 {
1059   if (!CI) {
1060     return false;
1061   }
1062   if (mSTM->device()->getGeneration() == AMDILDeviceInfo::HD6XXX
1063       && (mSTM->getDeviceName() == "cayman")) {
1064     return false;
1065   }
1066   return CI->getOperand(CI->getNumOperands() - 1)->getName().substr(0, 20)
1067       == "__amdil_improved_div";
1068 }
1069
1070 void
1071 AMDILPeepholeOpt::expandAccurateDivide(CallInst *CI)
1072 {
1073   assert(convertAccurateDivide(CI)
1074          && "expanding accurate divide can only happen if it is expandable!");
1075   BinaryOperator *divOp =
1076     BinaryOperator::Create(Instruction::FDiv, CI->getOperand(0),
1077                            CI->getOperand(1), "fdiv32", CI);
1078   CI->replaceAllUsesWith(divOp);
1079 }
1080
1081 bool
1082 AMDILPeepholeOpt::propagateSamplerInst(CallInst *CI)
1083 {
1084   if (optLevel != CodeGenOpt::None) {
1085     return false;
1086   }
1087
1088   if (!CI) {
1089     return false;
1090   }
1091
1092   unsigned funcNameIdx = 0;
1093   funcNameIdx = CI->getNumOperands() - 1;
1094   StringRef calleeName = CI->getOperand(funcNameIdx)->getName();
1095   if (calleeName != "__amdil_image2d_read_norm"
1096    && calleeName != "__amdil_image2d_read_unnorm"
1097    && calleeName != "__amdil_image3d_read_norm"
1098    && calleeName != "__amdil_image3d_read_unnorm") {
1099     return false;
1100   }
1101
1102   unsigned samplerIdx = 2;
1103   samplerIdx = 1;
1104   Value *sampler = CI->getOperand(samplerIdx);
1105   LoadInst *lInst = dyn_cast<LoadInst>(sampler);
1106   if (!lInst) {
1107     return false;
1108   }
1109
1110   if (lInst->getPointerAddressSpace() != AMDILAS::PRIVATE_ADDRESS) {
1111     return false;
1112   }
1113
1114   GlobalVariable *gv = dyn_cast<GlobalVariable>(lInst->getPointerOperand());
1115   // If we are loading from what is not a global value, then we
1116   // fail and return.
1117   if (!gv) {
1118     return false;
1119   }
1120
1121   // If we don't have an initializer or we have an initializer and
1122   // the initializer is not a 32bit integer, we fail.
1123   if (!gv->hasInitializer()
1124       || !gv->getInitializer()->getType()->isIntegerTy(32)) {
1125       return false;
1126   }
1127
1128   // Now that we have the global variable initializer, lets replace
1129   // all uses of the load instruction with the samplerVal and
1130   // reparse the __amdil_is_constant() function.
1131   Constant *samplerVal = gv->getInitializer();
1132   lInst->replaceAllUsesWith(samplerVal);
1133   return true;
1134 }
1135
1136 bool
1137 AMDILPeepholeOpt::doInitialization(Module &M)
1138 {
1139   return false;
1140 }
1141
1142 bool
1143 AMDILPeepholeOpt::doFinalization(Module &M)
1144 {
1145   return false;
1146 }
1147
1148 void
1149 AMDILPeepholeOpt::getAnalysisUsage(AnalysisUsage &AU) const
1150 {
1151   AU.addRequired<MachineFunctionAnalysis>();
1152   FunctionPass::getAnalysisUsage(AU);
1153   AU.setPreservesAll();
1154 }
1155
1156 size_t AMDILPeepholeOpt::getTypeSize(Type * const T, bool dereferencePtr) {
1157   size_t size = 0;
1158   if (!T) {
1159     return size;
1160   }
1161   switch (T->getTypeID()) {
1162   case Type::X86_FP80TyID:
1163   case Type::FP128TyID:
1164   case Type::PPC_FP128TyID:
1165   case Type::LabelTyID:
1166     assert(0 && "These types are not supported by this backend");
1167   default:
1168   case Type::FloatTyID:
1169   case Type::DoubleTyID:
1170     size = T->getPrimitiveSizeInBits() >> 3;
1171     break;
1172   case Type::PointerTyID:
1173     size = getTypeSize(dyn_cast<PointerType>(T), dereferencePtr);
1174     break;
1175   case Type::IntegerTyID:
1176     size = getTypeSize(dyn_cast<IntegerType>(T), dereferencePtr);
1177     break;
1178   case Type::StructTyID:
1179     size = getTypeSize(dyn_cast<StructType>(T), dereferencePtr);
1180     break;
1181   case Type::ArrayTyID:
1182     size = getTypeSize(dyn_cast<ArrayType>(T), dereferencePtr);
1183     break;
1184   case Type::FunctionTyID:
1185     size = getTypeSize(dyn_cast<FunctionType>(T), dereferencePtr);
1186     break;
1187   case Type::VectorTyID:
1188     size = getTypeSize(dyn_cast<VectorType>(T), dereferencePtr);
1189     break;
1190   };
1191   return size;
1192 }
1193
1194 size_t AMDILPeepholeOpt::getTypeSize(StructType * const ST,
1195     bool dereferencePtr) {
1196   size_t size = 0;
1197   if (!ST) {
1198     return size;
1199   }
1200   Type *curType;
1201   StructType::element_iterator eib;
1202   StructType::element_iterator eie;
1203   for (eib = ST->element_begin(), eie = ST->element_end(); eib != eie; ++eib) {
1204     curType = *eib;
1205     size += getTypeSize(curType, dereferencePtr);
1206   }
1207   return size;
1208 }
1209
1210 size_t AMDILPeepholeOpt::getTypeSize(IntegerType * const IT,
1211     bool dereferencePtr) {
1212   return IT ? (IT->getBitWidth() >> 3) : 0;
1213 }
1214
1215 size_t AMDILPeepholeOpt::getTypeSize(FunctionType * const FT,
1216     bool dereferencePtr) {
1217     assert(0 && "Should not be able to calculate the size of an function type");
1218     return 0;
1219 }
1220
1221 size_t AMDILPeepholeOpt::getTypeSize(ArrayType * const AT,
1222     bool dereferencePtr) {
1223   return (size_t)(AT ? (getTypeSize(AT->getElementType(),
1224                                     dereferencePtr) * AT->getNumElements())
1225                      : 0);
1226 }
1227
1228 size_t AMDILPeepholeOpt::getTypeSize(VectorType * const VT,
1229     bool dereferencePtr) {
1230   return VT ? (VT->getBitWidth() >> 3) : 0;
1231 }
1232
1233 size_t AMDILPeepholeOpt::getTypeSize(PointerType * const PT,
1234     bool dereferencePtr) {
1235   if (!PT) {
1236     return 0;
1237   }
1238   Type *CT = PT->getElementType();
1239   if (CT->getTypeID() == Type::StructTyID &&
1240       PT->getAddressSpace() == AMDILAS::PRIVATE_ADDRESS) {
1241     return getTypeSize(dyn_cast<StructType>(CT));
1242   } else if (dereferencePtr) {
1243     size_t size = 0;
1244     for (size_t x = 0, y = PT->getNumContainedTypes(); x < y; ++x) {
1245       size += getTypeSize(PT->getContainedType(x), dereferencePtr);
1246     }
1247     return size;
1248   } else {
1249     return 4;
1250   }
1251 }
1252
1253 size_t AMDILPeepholeOpt::getTypeSize(OpaqueType * const OT,
1254     bool dereferencePtr) {
1255   //assert(0 && "Should not be able to calculate the size of an opaque type");
1256   return 4;
1257 }