radeon/llvm: Add flag operand to some instructions

[mesa.git] / src / gallium / drivers / radeon / AMDILPeepholeOptimizer.cpp
diff --git a/src/gallium/drivers/radeon/AMDILPeepholeOptimizer.cpp b/src/gallium/drivers/radeon/AMDILPeepholeOptimizer.cpp

index 4859fe9df51985d3d5ddb1ee5a2ea35c5cdec365..f869b332e53e0c71ab2c8d60b06ae56bce4c8cd1 100644 (file)
--- a/src/gallium/drivers/radeon/AMDILPeepholeOptimizer.cpp
+++ b/src/gallium/drivers/radeon/AMDILPeepholeOptimizer.cpp
@@ -1,4 +1,4 @@
-//===-- AMDILPeepholeOptimizer.cpp - TODO: Add brief description -------===//
+//===-- AMDILPeepholeOptimizer.cpp - AMDIL Peephole optimizations ---------===//
  //
  //                     The LLVM Compiler Infrastructure
  //
@@ -7,21 +7,13 @@
  //
  //==-----------------------------------------------------------------------===//
  
-#define DEBUG_TYPE "PeepholeOpt"
-#ifdef DEBUG
-#define DEBUGME (DebugFlag && isCurrentDebugType(DEBUG_TYPE))
-#else
-#define DEBUGME 0
-#endif
-
-#include "AMDILAlgorithms.tpp"
  #include "AMDILDevices.h"
-#include "AMDILMachineFunctionInfo.h"
-#include "AMDILUtilityFunctions.h"
+#include "AMDGPUInstrInfo.h"
  #include "llvm/ADT/Statistic.h"
  #include "llvm/ADT/StringExtras.h"
  #include "llvm/ADT/StringRef.h"
  #include "llvm/ADT/Twine.h"
+#include "llvm/Constants.h"
  #include "llvm/CodeGen/MachineFunction.h"
  #include "llvm/CodeGen/MachineFunctionAnalysis.h"
  #include "llvm/Function.h"
@@ -42,12 +34,15 @@ using namespace llvm;
  // The Peephole optimization pass is used to do simple last minute optimizations
  // that are required for correct code or to remove redundant functions
  namespace {
-class LLVM_LIBRARY_VISIBILITY AMDILPeepholeOpt : public FunctionPass {
+
+class OpaqueType;
+
+class LLVM_LIBRARY_VISIBILITY AMDGPUPeepholeOpt : public FunctionPass {
  public:
    TargetMachine &TM;
    static char ID;
-  AMDILPeepholeOpt(TargetMachine &tm AMDIL_OPT_LEVEL_DECL);
-  ~AMDILPeepholeOpt();
+  AMDGPUPeepholeOpt(TargetMachine &tm);
+  ~AMDGPUPeepholeOpt();
    const char *getPassName() const;
    bool runOnFunction(Function &F);
    bool doInitialization(Module &M);
@@ -115,39 +110,71 @@ private:
    // samplers at compile time.
    bool propagateSamplerInst(CallInst *CI);
  
+  // Helper functions
+
+  // Group of functions that recursively calculate the size of a structure based
+  // on its sub-types.
+  size_t getTypeSize(Type * const T, bool dereferencePtr = false);
+  size_t getTypeSize(StructType * const ST, bool dereferencePtr = false);
+  size_t getTypeSize(IntegerType * const IT, bool dereferencePtr = false);
+  size_t getTypeSize(FunctionType * const FT,bool dereferencePtr = false);
+  size_t getTypeSize(ArrayType * const AT, bool dereferencePtr = false);
+  size_t getTypeSize(VectorType * const VT, bool dereferencePtr = false);
+  size_t getTypeSize(PointerType * const PT, bool dereferencePtr = false);
+  size_t getTypeSize(OpaqueType * const OT, bool dereferencePtr = false);
+
    LLVMContext *mCTX;
    Function *mF;
-  const AMDILSubtarget *mSTM;
+  const AMDGPUSubtarget *mSTM;
    SmallVector< std::pair<CallInst *, Function *>, 16> atomicFuncs;
    SmallVector<CallInst *, 16> isConstVec;
-}; // class AMDILPeepholeOpt
-  char AMDILPeepholeOpt::ID = 0;
+}; // class AMDGPUPeepholeOpt
+  char AMDGPUPeepholeOpt::ID = 0;
+
+// Two-level for-each: for every element of [First, Last), walks that element's
+// begin()..end() range and calls F with a pointer to the inner iterator.
+template<class InputIterator, class SecondIterator, class Function>
+Function safeNestedForEach(InputIterator First, InputIterator Last,
+                              SecondIterator S, Function F) // S is never read in the body; it only lets SecondIterator be deduced
+{
+  for ( ; First != Last; ++First) {
+    SecondIterator sf, sl;
+    for (sf = First->begin(), sl = First->end(); // end() is read once per outer element
+         sf != sl; )  {
+      if (!F(&sf)) { // the iterator is advanced here only when F returns false
+        ++sf;
+      } // NOTE(review): presumably F repositions sf itself when it returns true - confirm
+    }
+  }
+  return F; // the functor is handed back to the caller by value
+}
+
  } // anonymous namespace
  
  namespace llvm {
    FunctionPass *
-  createAMDILPeepholeOpt(TargetMachine &tm AMDIL_OPT_LEVEL_DECL) 
+  createAMDGPUPeepholeOpt(TargetMachine &tm) 
    {
-    return new AMDILPeepholeOpt(tm AMDIL_OPT_LEVEL_VAR);
+    return new AMDGPUPeepholeOpt(tm);
    }
  } // llvm namespace
  
-AMDILPeepholeOpt::AMDILPeepholeOpt(TargetMachine &tm AMDIL_OPT_LEVEL_DECL)
+AMDGPUPeepholeOpt::AMDGPUPeepholeOpt(TargetMachine &tm)
    : FunctionPass(ID), TM(tm) 
  {
-  mDebug = DEBUGME;
+  mDebug = false;
    optLevel = TM.getOptLevel();
  
  }
  
-AMDILPeepholeOpt::~AMDILPeepholeOpt() 
+AMDGPUPeepholeOpt::~AMDGPUPeepholeOpt() 
  {
  }
  
  const char *
-AMDILPeepholeOpt::getPassName() const 
+AMDGPUPeepholeOpt::getPassName() const 
  {
-  return "AMDIL PeepHole Optimization Pass";
+  return "AMDGPU PeepHole Optimization Pass";
  }
  
  bool 
@@ -180,7 +207,7 @@ containsPointerType(Type *Ty)
  }
  
  bool 
-AMDILPeepholeOpt::dumpAllIntoArena(Function &F) 
+AMDGPUPeepholeOpt::dumpAllIntoArena(Function &F) 
  {
    bool dumpAll = false;
    for (Function::const_arg_iterator cab = F.arg_begin(),
@@ -206,7 +233,7 @@ AMDILPeepholeOpt::dumpAllIntoArena(Function &F)
    return dumpAll;
  }
  void
-AMDILPeepholeOpt::doIsConstCallConversionIfNeeded()
+AMDGPUPeepholeOpt::doIsConstCallConversionIfNeeded()
  {
    if (isConstVec.empty()) {
      return;
@@ -223,7 +250,7 @@ AMDILPeepholeOpt::doIsConstCallConversionIfNeeded()
    isConstVec.clear();
  }
  void 
-AMDILPeepholeOpt::doAtomicConversionIfNeeded(Function &F) 
+AMDGPUPeepholeOpt::doAtomicConversionIfNeeded(Function &F) 
  {
    // Don't do anything if we don't have any atomic operations.
    if (atomicFuncs.empty()) {
@@ -241,35 +268,21 @@ AMDILPeepholeOpt::doAtomicConversionIfNeeded(Function &F)
    if (mConvertAtomics) {
      return;
    }
-  // If we did not convert all of the atomics, then we need to make sure that
-  // the atomics that were not converted have their base pointers set to use the
-  // arena path.
-  Function::arg_iterator argB = F.arg_begin();
-  Function::arg_iterator argE = F.arg_end();
-  AMDILMachineFunctionInfo *mMFI = getAnalysis<MachineFunctionAnalysis>().getMF()
-    .getInfo<AMDILMachineFunctionInfo>();
-  for (; argB != argE; ++argB) {
-    if (mSTM->device()->isSupported(AMDILDeviceInfo::ArenaUAV)) {
-      mMFI->uav_insert(mSTM->device()->getResourceID(AMDILDevice::ARENA_UAV_ID));
-    } else {
-      mMFI->uav_insert(mSTM->device()->getResourceID(AMDILDevice::GLOBAL_ID));
-    }
-  }
  }
  
  bool 
-AMDILPeepholeOpt::runOnFunction(Function &MF) 
+AMDGPUPeepholeOpt::runOnFunction(Function &MF) 
  {
    mChanged = false;
    mF = &MF;
-  mSTM = &TM.getSubtarget<AMDILSubtarget>();
+  mSTM = &TM.getSubtarget<AMDGPUSubtarget>();
    if (mDebug) {
      MF.dump();
    }
    mCTX = &MF.getType()->getContext();
    mConvertAtomics = true;
    safeNestedForEach(MF.begin(), MF.end(), MF.begin()->begin(),
-     std::bind1st(std::mem_fun(&AMDILPeepholeOpt::instLevelOptimizations),
+     std::bind1st(std::mem_fun(&AMDGPUPeepholeOpt::instLevelOptimizations),
                    this));
  
    doAtomicConversionIfNeeded(MF);
@@ -282,7 +295,7 @@ AMDILPeepholeOpt::runOnFunction(Function &MF)
  }
  
  bool 
-AMDILPeepholeOpt::optimizeCallInst(BasicBlock::iterator *bbb) 
+AMDGPUPeepholeOpt::optimizeCallInst(BasicBlock::iterator *bbb) 
  {
    Instruction *inst = (*bbb);
    CallInst *CI = dyn_cast<CallInst>(inst);
@@ -359,8 +372,8 @@ AMDILPeepholeOpt::optimizeCallInst(BasicBlock::iterator *bbb)
      atomicFuncs.push_back(std::make_pair <CallInst*, Function*>(CI, F));
    }
    
-  if (!mSTM->device()->isSupported(AMDILDeviceInfo::ArenaSegment)
-      && !mSTM->device()->isSupported(AMDILDeviceInfo::MultiUAV)) {
+  if (!mSTM->device()->isSupported(AMDGPUDeviceInfo::ArenaSegment)
+      && !mSTM->device()->isSupported(AMDGPUDeviceInfo::MultiUAV)) {
      return false;
    }
    if (!mConvertAtomics) {
@@ -374,7 +387,7 @@ AMDILPeepholeOpt::optimizeCallInst(BasicBlock::iterator *bbb)
  }
  
  bool
-AMDILPeepholeOpt::setupBitInsert(Instruction *base, 
+AMDGPUPeepholeOpt::setupBitInsert(Instruction *base, 
      Instruction *&src, 
      Constant *&mask, 
      Constant *&shift)
@@ -427,7 +440,7 @@ AMDILPeepholeOpt::setupBitInsert(Instruction *base,
    return true;
  }
  bool
-AMDILPeepholeOpt::optimizeBitInsert(Instruction *inst) 
+AMDGPUPeepholeOpt::optimizeBitInsert(Instruction *inst) 
  {
    if (!inst) {
      return false;
@@ -450,7 +463,7 @@ AMDILPeepholeOpt::optimizeBitInsert(Instruction *inst)
    // (A & B) | ((D & E) << F) when B ^ E == 0 && (1 << F) >= B
    // (A & B) | (D << F) when (1 << F) >= B
    // (A << C) | (D & E) when (1 << C) >= E
-  if (mSTM->device()->getGeneration() == AMDILDeviceInfo::HD4XXX) {
+  if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) {
      // The HD4XXX hardware doesn't support the ubit_insert instruction.
      return false;
    }
@@ -667,7 +680,7 @@ AMDILPeepholeOpt::optimizeBitInsert(Instruction *inst)
  }
  
  bool 
-AMDILPeepholeOpt::optimizeBitExtract(Instruction *inst) 
+AMDGPUPeepholeOpt::optimizeBitExtract(Instruction *inst) 
  {
    if (!inst) {
      return false;
@@ -690,12 +703,17 @@ AMDILPeepholeOpt::optimizeBitExtract(Instruction *inst)
    // __amdil_ubit_extract(log2(C), B, A) The function __amdil_[u|i]bit_extract
    // can be found in Section 7.9 of the ATI IL spec of the stream SDK for
    // Evergreen hardware.
-  if (mSTM->device()->getGeneration() == AMDILDeviceInfo::HD4XXX) {
+  if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) {
      // This does not work on HD4XXX hardware.
      return false;
    }
    Type *aType = inst->getType();
    bool isVector = aType->isVectorTy();
+
+  // XXX Support vector types
+  if (isVector) {
+    return false;
+  }
    int numEle = 1;
    // This only works on 32bit integers
    if (aType->getScalarType()
@@ -797,32 +815,33 @@ AMDILPeepholeOpt::optimizeBitExtract(Instruction *inst)
    callTypes.push_back(aType);
    callTypes.push_back(aType);
    FunctionType *funcType = FunctionType::get(aType, callTypes, false);
-  std::string name = "__amdil_ubit_extract";
+  std::string name = "llvm.AMDIL.bit.extract.u32";
    if (isVector) {
-    name += "_v" + itostr(numEle) + "i32";
+    name += ".v" + itostr(numEle) + "i32";
    } else {
-    name += "_i32";
+    name += ".";
    }
    // Lets create the function.
    Function *Func = 
      dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
                         getOrInsertFunction(llvm::StringRef(name), funcType));
    Value *Operands[3] = {
-    newMaskConst,
+    ShiftInst->getOperand(0),
      shiftValConst,
-    ShiftInst->getOperand(0)
+    newMaskConst
    };
    // Lets create the Call with the operands
    CallInst *CI = CallInst::Create(Func, Operands, "ByteExtractOpt");
+  CI->setDoesNotAccessMemory();
    CI->insertBefore(inst);
    inst->replaceAllUsesWith(CI);
    return true;
  }
  
  bool
-AMDILPeepholeOpt::expandBFI(CallInst *CI)
+AMDGPUPeepholeOpt::expandBFI(CallInst *CI)
  {
-  if (!CI || mSTM->calVersion() <= CAL_VERSION_SC_150) {
+  if (!CI) {
      return false;
    }
    Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
@@ -859,9 +878,9 @@ AMDILPeepholeOpt::expandBFI(CallInst *CI)
  }
  
  bool
-AMDILPeepholeOpt::expandBFM(CallInst *CI)
+AMDGPUPeepholeOpt::expandBFM(CallInst *CI)
  {
-  if (!CI || mSTM->calVersion() <= CAL_VERSION_SC_150) {
+  if (!CI) {
      return false;
    }
    Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
@@ -903,7 +922,7 @@ AMDILPeepholeOpt::expandBFM(CallInst *CI)
  }
  
  bool
-AMDILPeepholeOpt::instLevelOptimizations(BasicBlock::iterator *bbb) 
+AMDGPUPeepholeOpt::instLevelOptimizations(BasicBlock::iterator *bbb) 
  {
    Instruction *inst = (*bbb);
    if (optimizeCallInst(bbb)) {
@@ -921,7 +940,7 @@ AMDILPeepholeOpt::instLevelOptimizations(BasicBlock::iterator *bbb)
    return false;
  }
  bool
-AMDILPeepholeOpt::correctMisalignedMemOp(Instruction *inst)
+AMDGPUPeepholeOpt::correctMisalignedMemOp(Instruction *inst)
  {
    LoadInst *linst = dyn_cast<LoadInst>(inst);
    StoreInst *sinst = dyn_cast<StoreInst>(inst);
@@ -955,7 +974,7 @@ AMDILPeepholeOpt::correctMisalignedMemOp(Instruction *inst)
    return false;
  }
  bool 
-AMDILPeepholeOpt::isSigned24BitOps(CallInst *CI) 
+AMDGPUPeepholeOpt::isSigned24BitOps(CallInst *CI) 
  {
    if (!CI) {
      return false;
@@ -966,14 +985,14 @@ AMDILPeepholeOpt::isSigned24BitOps(CallInst *CI)
        && namePrefix != "__amdil__imul24_high") {
      return false;
    }
-  if (mSTM->device()->usesHardware(AMDILDeviceInfo::Signed24BitOps)) {
+  if (mSTM->device()->usesHardware(AMDGPUDeviceInfo::Signed24BitOps)) {
      return false;
    }
    return true;
  }
  
  void 
-AMDILPeepholeOpt::expandSigned24BitOps(CallInst *CI) 
+AMDGPUPeepholeOpt::expandSigned24BitOps(CallInst *CI) 
  {
    assert(isSigned24BitOps(CI) && "Must be a "
        "signed 24 bit operation to call this function!");
@@ -1045,7 +1064,7 @@ AMDILPeepholeOpt::expandSigned24BitOps(CallInst *CI)
  }
  
  bool 
-AMDILPeepholeOpt::isRWGLocalOpt(CallInst *CI) 
+AMDGPUPeepholeOpt::isRWGLocalOpt(CallInst *CI) 
  {
    return (CI != NULL
            && CI->getOperand(CI->getNumOperands() - 1)->getName() 
@@ -1053,12 +1072,12 @@ AMDILPeepholeOpt::isRWGLocalOpt(CallInst *CI)
  }
  
  bool 
-AMDILPeepholeOpt::convertAccurateDivide(CallInst *CI) 
+AMDGPUPeepholeOpt::convertAccurateDivide(CallInst *CI) 
  {
    if (!CI) {
      return false;
    }
-  if (mSTM->device()->getGeneration() == AMDILDeviceInfo::HD6XXX
+  if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD6XXX
        && (mSTM->getDeviceName() == "cayman")) {
      return false;
    }
@@ -1067,7 +1086,7 @@ AMDILPeepholeOpt::convertAccurateDivide(CallInst *CI)
  }
  
  void 
-AMDILPeepholeOpt::expandAccurateDivide(CallInst *CI) 
+AMDGPUPeepholeOpt::expandAccurateDivide(CallInst *CI) 
  {
    assert(convertAccurateDivide(CI)
           && "expanding accurate divide can only happen if it is expandable!");
@@ -1078,7 +1097,7 @@ AMDILPeepholeOpt::expandAccurateDivide(CallInst *CI)
  }
  
  bool
-AMDILPeepholeOpt::propagateSamplerInst(CallInst *CI)
+AMDGPUPeepholeOpt::propagateSamplerInst(CallInst *CI)
  {
    if (optLevel != CodeGenOpt::None) {
      return false;
@@ -1106,7 +1125,7 @@ AMDILPeepholeOpt::propagateSamplerInst(CallInst *CI)
      return false;
    }
  
-  if (lInst->getPointerAddressSpace() != AMDILAS::PRIVATE_ADDRESS) {
+  if (lInst->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
      return false;
    }
  
@@ -1133,21 +1152,124 @@ AMDILPeepholeOpt::propagateSamplerInst(CallInst *CI)
  }
  
  bool 
-AMDILPeepholeOpt::doInitialization(Module &M) 
+AMDGPUPeepholeOpt::doInitialization(Module &M) 
  {
    return false;
  }
  
  bool 
-AMDILPeepholeOpt::doFinalization(Module &M) 
+AMDGPUPeepholeOpt::doFinalization(Module &M) 
  {
    return false;
  }
  
  void 
-AMDILPeepholeOpt::getAnalysisUsage(AnalysisUsage &AU) const 
+AMDGPUPeepholeOpt::getAnalysisUsage(AnalysisUsage &AU) const 
  {
    AU.addRequired<MachineFunctionAnalysis>();
    FunctionPass::getAnalysisUsage(AU);
    AU.setPreservesAll();
  }
+
+size_t AMDGPUPeepholeOpt::getTypeSize(Type * const T, bool dereferencePtr) { // size of T, chosen by switching on its type ID
+  size_t size = 0;
+  if (!T) {
+    return size; // a null type is reported as size 0
+  }
+  switch (T->getTypeID()) {
+  case Type::X86_FP80TyID:
+  case Type::FP128TyID:
+  case Type::PPC_FP128TyID:
+  case Type::LabelTyID:
+    assert(0 && "These types are not supported by this backend"); // no break: if the assert is compiled out, control falls into the default path
+  default: // every type ID without its own case below shares the float/double path
+  case Type::FloatTyID:
+  case Type::DoubleTyID:
+    size = T->getPrimitiveSizeInBits() >> 3; // bits / 8, remainder bits dropped
+    break;
+  case Type::PointerTyID:
+    size = getTypeSize(dyn_cast<PointerType>(T), dereferencePtr); // the remaining cases forward to the overload for the concrete type
+    break;
+  case Type::IntegerTyID:
+    size = getTypeSize(dyn_cast<IntegerType>(T), dereferencePtr);
+    break;
+  case Type::StructTyID:
+    size = getTypeSize(dyn_cast<StructType>(T), dereferencePtr);
+    break;
+  case Type::ArrayTyID:
+    size = getTypeSize(dyn_cast<ArrayType>(T), dereferencePtr);
+    break;
+  case Type::FunctionTyID:
+    size = getTypeSize(dyn_cast<FunctionType>(T), dereferencePtr);
+    break;
+  case Type::VectorTyID:
+    size = getTypeSize(dyn_cast<VectorType>(T), dereferencePtr);
+    break;
+  };
+  return size;
+}
+
+size_t AMDGPUPeepholeOpt::getTypeSize(StructType * const ST,
+    bool dereferencePtr) { // sum of the sizes of ST's elements; 0 for a null ST
+  size_t size = 0;
+  if (!ST) {
+    return size;
+  }
+  Type *curType;
+  StructType::element_iterator eib;
+  StructType::element_iterator eie;
+  for (eib = ST->element_begin(), eie = ST->element_end(); eib != eie; ++eib) {
+    curType = *eib;
+    size += getTypeSize(curType, dereferencePtr); // plain running sum: nothing is added between elements
+  }
+  return size;
+}
+
+size_t AMDGPUPeepholeOpt::getTypeSize(IntegerType * const IT,
+    bool dereferencePtr) { // dereferencePtr is not read by this overload
+  return IT ? (IT->getBitWidth() >> 3) : 0; // bit width / 8 rounded down (widths under 8 give 0); 0 for a null IT
+}
+
+size_t AMDGPUPeepholeOpt::getTypeSize(FunctionType * const FT,
+    bool dereferencePtr) { // neither parameter is read
+    assert(0 && "Should not be able to calculate the size of an function type"); // fires unconditionally when asserts are enabled
+    return 0; // reached only when the assert is compiled out
+}
+
+size_t AMDGPUPeepholeOpt::getTypeSize(ArrayType * const AT,
+    bool dereferencePtr) { // element size times element count; 0 for a null AT
+  return (size_t)(AT ? (getTypeSize(AT->getElementType(), // element size comes from the Type* overload
+                                    dereferencePtr) * AT->getNumElements())
+                     : 0);
+}
+
+size_t AMDGPUPeepholeOpt::getTypeSize(VectorType * const VT,
+    bool dereferencePtr) { // dereferencePtr is not read by this overload
+  return VT ? (VT->getBitWidth() >> 3) : 0; // whole-vector bit width / 8 rounded down; 0 for a null VT
+}
+
+size_t AMDGPUPeepholeOpt::getTypeSize(PointerType * const PT,
+    bool dereferencePtr) { // three outcomes: pointee struct size, summed contained types, or the constant 4
+  if (!PT) {
+    return 0;
+  }
+  Type *CT = PT->getElementType();
+  if (CT->getTypeID() == Type::StructTyID &&
+      PT->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
+    return getTypeSize(dyn_cast<StructType>(CT)); // dereferencePtr is not forwarded here, so the callee sees its default argument
+  } else if (dereferencePtr) {
+    size_t size = 0;
+    for (size_t x = 0, y = PT->getNumContainedTypes(); x < y; ++x) {
+      size += getTypeSize(PT->getContainedType(x), dereferencePtr); // accumulate the size of each contained type
+    }
+    return size;
+  } else {
+    return 4; // NOTE(review): presumably the pointer width in bytes on this target - confirm
+  }
+}
+
+size_t AMDGPUPeepholeOpt::getTypeSize(OpaqueType * const OT,
+    bool dereferencePtr) { // neither parameter is read
+  //assert(0 && "Should not be able to calculate the size of an opaque type");
+  return 4; // fixed result for every opaque type, now that the assert above is disabled
+}