nvir/gv100: enable support for tu1xx

[mesa.git] / src / gallium / drivers / nouveau / codegen / nv50_ir_ra.cpp
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp

index dd3beb701ce90010d7896a8ffadc23a5ea9b6998..4e5b21d917642a6ad2a36f37704dab5b6ff24a8b 100644 (file)
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
@@ -23,11 +23,25 @@
  #include "codegen/nv50_ir.h"
  #include "codegen/nv50_ir_target.h"
  
+#include <algorithm>
  #include <stack>
  #include <limits>
+#if __cplusplus >= 201103L
+#include <unordered_map>
+#else
+#include <tr1/unordered_map>
+#endif
  
  namespace nv50_ir {
  
+#if __cplusplus >= 201103L
+using std::hash;
+using std::unordered_map;
+#else
+using std::tr1::hash;
+using std::tr1::unordered_map;
+#endif
+
  #define MAX_REGISTER_FILE_SIZE 256
  
  class RegisterSet
@@ -41,7 +55,7 @@ public:
     void periodicMask(DataFile f, uint32_t lock, uint32_t unlock);
     void intersect(DataFile f, const RegisterSet *);
  
-   bool assign(int32_t& reg, DataFile f, unsigned int size);
+   bool assign(int32_t& reg, DataFile f, unsigned int size, unsigned int maxReg);
     void release(DataFile f, int32_t reg, unsigned int size);
     void occupy(DataFile f, int32_t reg, unsigned int size);
     void occupy(const Value *);
@@ -52,10 +66,8 @@ public:
  
     inline int getMaxAssigned(DataFile f) const { return fill[f]; }
  
-   inline unsigned int getFileSize(DataFile f, uint8_t regSize) const
+   inline unsigned int getFileSize(DataFile f) const
     {
-      if (restrictedGPR16Range && f == FILE_GPR && regSize == 2)
-         return (last[f] + 1) / 2;
        return last[f] + 1;
     }
  
@@ -85,7 +97,9 @@ public:
        return (size < 4) ? u : ((u << unit[f]) / 4);
     }
  
-   void print() const;
+   void print(DataFile f) const;
+
+   const bool restrictedGPR16Range;
  
  private:
     BitSet bits[LAST_REGISTER_FILE + 1];
@@ -94,8 +108,6 @@ private:
  
     int last[LAST_REGISTER_FILE + 1];
     int fill[LAST_REGISTER_FILE + 1];
-
-   const bool restrictedGPR16Range;
  };
  
  void
@@ -140,17 +152,17 @@ RegisterSet::intersect(DataFile f, const RegisterSet *set)
  }
  
  void
-RegisterSet::print() const
+RegisterSet::print(DataFile f) const
  {
     INFO("GPR:");
-   bits[FILE_GPR].print();
+   bits[f].print();
     INFO("\n");
  }
  
  bool
-RegisterSet::assign(int32_t& reg, DataFile f, unsigned int size)
+RegisterSet::assign(int32_t& reg, DataFile f, unsigned int size, unsigned int maxReg)
  {
-   reg = bits[f].findFreeRange(size);
+   reg = bits[f].findFreeRange(size, maxReg);
     if (reg < 0)
        return false;
     fill[f] = MAX2(fill[f], (int32_t)(reg + size - 1));
@@ -222,6 +234,7 @@ private:
     private:
        virtual bool visit(BasicBlock *);
        inline bool needNewElseBlock(BasicBlock *b, BasicBlock *p);
+      inline void splitEdges(BasicBlock *b);
     };
  
     class ArgumentMovesPass : public Pass {
@@ -242,9 +255,11 @@ private:
     private:
        virtual bool visit(BasicBlock *);
  
+      void insertConstraintMove(Instruction *, int s);
        bool insertConstraintMoves();
  
        void condenseDefs(Instruction *);
+      void condenseDefs(Instruction *, const int first, const int last);
        void condenseSrcs(Instruction *, const int first, const int last);
  
        void addHazard(Instruction *i, const ValueRef *src);
@@ -256,6 +271,10 @@ private:
        void texConstraintNV50(TexInstruction *);
        void texConstraintNVC0(TexInstruction *);
        void texConstraintNVE0(TexInstruction *);
+      void texConstraintGM107(TexInstruction *);
+
+      bool isScalarTexGM107(TexInstruction *);
+      void handleScalarTexGM107(TexInstruction *);
  
        std::list<Instruction *> constrList;
  
@@ -284,7 +303,7 @@ public:
     bool run(const std::list<ValuePair>&);
  
     Symbol *assignSlot(const Interval&, const unsigned int size);
-   Symbol *offsetSlot(Symbol *, const LValue *);
+   Value *offsetSlot(Value *, const LValue *);
     inline int32_t getStackSize() const { return stackSize; }
  
  private:
@@ -344,28 +363,55 @@ RegAlloc::PhiMovesPass::needNewElseBlock(BasicBlock *b, BasicBlock *p)
     return (n == 2);
  }
  
-// For each operand of each PHI in b, generate a new value by inserting a MOV
-// at the end of the block it is coming from and replace the operand with its
-// result. This eliminates liveness conflicts and enables us to let values be
-// copied to the right register if such a conflict exists nonetheless.
+struct PhiMapHash {
+   size_t operator()(const std::pair<Instruction *, BasicBlock *>& val) const {
+      return hash<Instruction*>()(val.first) * 31 +
+         hash<BasicBlock*>()(val.second);
+   }
+};
+
+typedef unordered_map<
+   std::pair<Instruction *, BasicBlock *>, Value *, PhiMapHash> PhiMap;
+
+// Critical edges need to be split up so that work can be inserted along
+// specific edge transitions. Unfortunately manipulating incident edges into a
+// BB invalidates all the PHI nodes since their sources are implicitly ordered
+// by incident edge order.
  //
-// These MOVs are also crucial in making sure the live intervals of phi srces
-// are extended until the end of the loop, since they are not included in the
-// live-in sets.
-bool
-RegAlloc::PhiMovesPass::visit(BasicBlock *bb)
+// TODO: Make it so that that is not the case, and PHI nodes store pointers to
+// the original BBs.
+void
+RegAlloc::PhiMovesPass::splitEdges(BasicBlock *bb)
  {
-   Instruction *phi, *mov;
     BasicBlock *pb, *pn;
-
+   Instruction *phi;
+   Graph::EdgeIterator ei;
     std::stack<BasicBlock *> stack;
+   int j = 0;
  
-   for (Graph::EdgeIterator ei = bb->cfg.incident(); !ei.end(); ei.next()) {
+   for (ei = bb->cfg.incident(); !ei.end(); ei.next()) {
        pb = BasicBlock::get(ei.getNode());
        assert(pb);
        if (needNewElseBlock(bb, pb))
           stack.push(pb);
     }
+
+   // No critical edges were found, no need to perform any work.
+   if (stack.empty())
+      return;
+
+   // We're about to, potentially, reorder the inbound edges. This means that
+   // we need to hold on to the (phi, bb) -> src mapping, and fix up the phi
+   // nodes after the graph has been modified.
+   PhiMap phis;
+
+   j = 0;
+   for (ei = bb->cfg.incident(); !ei.end(); ei.next(), j++) {
+      pb = BasicBlock::get(ei.getNode());
+      for (phi = bb->getPhi(); phi && phi->op == OP_PHI; phi = phi->next)
+         phis.insert(std::make_pair(std::make_pair(phi, pb), phi->getSrc(j)));
+   }
+
     while (!stack.empty()) {
        pb = stack.top();
        pn = new BasicBlock(func);
@@ -378,21 +424,57 @@ RegAlloc::PhiMovesPass::visit(BasicBlock *bb)
        assert(pb->getExit()->op != OP_CALL);
        if (pb->getExit()->asFlow()->target.bb == bb)
           pb->getExit()->asFlow()->target.bb = pn;
+
+      for (phi = bb->getPhi(); phi && phi->op == OP_PHI; phi = phi->next) {
+         PhiMap::iterator it = phis.find(std::make_pair(phi, pb));
+         assert(it != phis.end());
+         phis.insert(std::make_pair(std::make_pair(phi, pn), it->second));
+         phis.erase(it);
+      }
     }
  
+   // Now go through and fix up all of the phi node sources.
+   j = 0;
+   for (ei = bb->cfg.incident(); !ei.end(); ei.next(), j++) {
+      pb = BasicBlock::get(ei.getNode());
+      for (phi = bb->getPhi(); phi && phi->op == OP_PHI; phi = phi->next) {
+         PhiMap::const_iterator it = phis.find(std::make_pair(phi, pb));
+         assert(it != phis.end());
+
+         phi->setSrc(j, it->second);
+      }
+   }
+}
+
+// For each operand of each PHI in b, generate a new value by inserting a MOV
+// at the end of the block it is coming from and replace the operand with its
+// result. This eliminates liveness conflicts and enables us to let values be
+// copied to the right register if such a conflict exists nonetheless.
+//
+// These MOVs are also crucial in making sure the live intervals of phi srces
+// are extended until the end of the loop, since they are not included in the
+// live-in sets.
+bool
+RegAlloc::PhiMovesPass::visit(BasicBlock *bb)
+{
+   Instruction *phi, *mov;
+
+   splitEdges(bb);
+
     // insert MOVs (phi->src(j) should stem from j-th in-BB)
     int j = 0;
     for (Graph::EdgeIterator ei = bb->cfg.incident(); !ei.end(); ei.next()) {
-      pb = BasicBlock::get(ei.getNode());
+      BasicBlock *pb = BasicBlock::get(ei.getNode());
        if (!pb->isTerminated())
           pb->insertTail(new_FlowInstruction(func, OP_BRA, bb));
  
        for (phi = bb->getPhi(); phi && phi->op == OP_PHI; phi = phi->next) {
-         mov = new_Instruction(func, OP_MOV, TYPE_U32);
+         LValue *tmp = new_LValue(func, phi->getDef(0)->asLValue());
+         mov = new_Instruction(func, OP_MOV, typeOfSize(tmp->reg.size));
  
           mov->setSrc(0, phi->getSrc(j));
-         mov->setDef(0, new_LValue(func, phi->getDef(0)->asLValue()));
-         phi->setSrc(j, mov->getDef(0));
+         mov->setDef(0, tmp);
+         phi->setSrc(j, tmp);
  
           pb->insertBefore(pb->getExit(), mov);
        }
@@ -542,8 +624,6 @@ RegAlloc::BuildIntervalsPass::collectLiveValues(BasicBlock *bb)
        // trickery to save a loop of OR'ing liveSets
        // aliasing works fine with BitSet::setOr
        for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) {
-         if (ei.getType() == Graph::Edge::DUMMY)
-            continue;
           if (bbA) {
              bb->liveSet.setOr(&bbA->liveSet, &bbB->liveSet);
              bbA = bb;
@@ -665,6 +745,7 @@ private:
     public:
        uint32_t degree;
        uint16_t degreeLimit; // if deg < degLimit, node is trivially colourable
+      uint16_t maxReg;
        uint16_t colors;
  
        DataFile f;
@@ -690,7 +771,7 @@ private:
     bool coalesce(ArrayList&);
     bool doCoalesce(ArrayList&, unsigned int mask);
     void calculateSpillWeights();
-   void simplify();
+   bool simplify();
     bool selectRegisters();
     void cleanup(const bool success);
  
@@ -720,7 +801,21 @@ private:
     Function *func;
     Program *prog;
  
-   static uint8_t relDegree[17][17];
+   struct RelDegree {
+      uint8_t data[17][17];
+
+      RelDegree() {
+         for (int i = 1; i <= 16; ++i)
+            for (int j = 1; j <= 16; ++j)
+               data[i][j] = j * ((i + j - 1) / j);
+      }
+
+      const uint8_t* operator[](std::size_t i) const {
+         return data[i];
+      }
+   };
+
+   static const RelDegree relDegree;
  
     RegisterSet regs;
  
@@ -732,7 +827,7 @@ private:
     std::list<ValuePair> mustSpill;
  };
  
-uint8_t GCRA::relDegree[17][17];
+const GCRA::RelDegree GCRA::relDegree;
  
  GCRA::RIG_Node::RIG_Node() : Node(NULL), next(this), prev(this)
  {
@@ -759,6 +854,34 @@ GCRA::printNodeInfo() const
     }
  }
  
+static bool
+isShortRegOp(Instruction *insn)
+{
+   // Immediates are always in src1 (except zeroes, which end up getting
+   // replaced with a zero reg). Every other situation can be resolved by
+   // using a long encoding.
+   return insn->srcExists(1) && insn->src(1).getFile() == FILE_IMMEDIATE &&
+      insn->getSrc(1)->reg.data.u64;
+}
+
+// Check if this LValue is ever used in an instruction that can't be encoded
+// with long registers (i.e. > r63)
+static bool
+isShortRegVal(LValue *lval)
+{
+   if (lval->getInsn() == NULL)
+      return false;
+   for (Value::DefCIterator def = lval->defs.begin();
+        def != lval->defs.end(); ++def)
+      if (isShortRegOp((*def)->getInsn()))
+         return true;
+   for (Value::UseCIterator use = lval->uses.begin();
+        use != lval->uses.end(); ++use)
+      if (isShortRegOp((*use)->getInsn()))
+         return true;
+   return false;
+}
+
  void
  GCRA::RIG_Node::init(const RegisterSet& regs, LValue *lval)
  {
@@ -774,7 +897,12 @@ GCRA::RIG_Node::init(const RegisterSet& regs, LValue *lval)
  
     weight = std::numeric_limits<float>::infinity();
     degree = 0;
-   degreeLimit = regs.getFileSize(f, lval->reg.size);
+   maxReg = regs.getFileSize(f);
+   // On nv50, we lose a bit of gpr encoding when there's an embedded
+   // immediate.
+   if (regs.restrictedGPR16Range && f == FILE_GPR && (lval->reg.size == 2 || isShortRegVal(lval)))
+      maxReg /= 2;
+   degreeLimit = maxReg;
     degreeLimit -= relDegree[1][colors] - 1;
  
     livei.insert(lval->livei);
@@ -834,6 +962,8 @@ GCRA::coalesceValues(Value *dst, Value *src, bool force)
     // add val's definitions to rep and extend the live interval of its RIG node
     rep->defs.insert(rep->defs.end(), val->defs.begin(), val->defs.end());
     nRep->livei.unify(nVal->livei);
+   nRep->degreeLimit = MIN2(nRep->degreeLimit, nVal->degreeLimit);
+   nRep->maxReg = MIN2(nRep->maxReg, nVal->maxReg);
     return true;
  }
  
@@ -855,6 +985,11 @@ GCRA::coalesce(ArrayList& insns)
     case 0xe0:
     case 0xf0:
     case 0x100:
+   case 0x110:
+   case 0x120:
+   case 0x130:
+   case 0x140:
+   case 0x160:
        ret = doCoalesce(insns, JOIN_MASK_UNION);
        break;
     default:
@@ -981,7 +1116,7 @@ GCRA::doCoalesce(ArrayList& insns, unsigned int mask)
              break;
           i = NULL;
           if (!insn->getDef(0)->uses.empty())
-            i = insn->getDef(0)->uses.front()->getInsn();
+            i = (*insn->getDef(0)->uses.begin())->getInsn();
           // if this is a contraint-move there will only be a single use
           if (i && i->op == OP_MERGE) // do we really still need this ?
              break;
@@ -998,7 +1133,9 @@ GCRA::doCoalesce(ArrayList& insns, unsigned int mask)
        case OP_TXQ:
        case OP_TXD:
        case OP_TXG:
+      case OP_TXLQ:
        case OP_TEXCSAA:
+      case OP_TEXPREP:
           if (!(mask & JOIN_MASK_TEX))
              break;
           for (c = 0; insn->srcExists(c) && c != insn->predSrc; ++c)
@@ -1032,11 +1169,6 @@ GCRA::GCRA(Function *fn, SpillCodeInserter& spill) :
     spill(spill)
  {
     prog = func->getProgram();
-
-   // initialize relative degrees array - i takes away from j
-   for (int i = 1; i <= 16; ++i)
-      for (int j = 1; j <= 16; ++j)
-         relDegree[i][j] = j * ((i + j - 1) / j);
  }
  
  GCRA::~GCRA()
@@ -1188,7 +1320,7 @@ GCRA::simplifyNode(RIG_Node *node)
              (node->degree < node->degreeLimit) ? "" : "(spill)");
  }
  
-void
+bool
  GCRA::simplify()
  {
     for (;;) {
@@ -1202,22 +1334,26 @@ GCRA::simplify()
        } else
        if (!DLLIST_EMPTY(&hi)) {
           RIG_Node *best = hi.next;
+         unsigned bestMaxReg = best->maxReg;
           float bestScore = best->weight / (float)best->degree;
-         // spill candidate
+         // Spill candidate. First go through the ones with the highest max
+         // register, then the ones with lower. That way the ones with the
+         // lowest requirement will be allocated first, since it's a stack.
           for (RIG_Node *it = best->next; it != &hi; it = it->next) {
              float score = it->weight / (float)it->degree;
-            if (score < bestScore) {
+            if (score < bestScore || it->maxReg > bestMaxReg) {
                 best = it;
                 bestScore = score;
+               bestMaxReg = it->maxReg;
              }
           }
           if (isinf(bestScore)) {
              ERROR("no viable spill candidates left\n");
-            break;
+            return false;
           }
           simplifyNode(best);
        } else {
-         break;
+         return true;
        }
     }
  }
@@ -1308,8 +1444,8 @@ GCRA::selectRegisters()
           continue;
        LValue *lval = node->getValue();
        if (prog->dbgFlags & NV50_IR_DEBUG_REG_ALLOC)
-         regs.print();
-      bool ret = regs.assign(node->reg, node->f, node->colors);
+         regs.print(node->f);
+      bool ret = regs.assign(node->reg, node->f, node->colors, node->maxReg);
        if (ret) {
           INFO_DBG(prog->dbgFlags, REG_ALLOC, "assigned reg %i\n", node->reg);
           lval->compMask = node->getCompMask();
@@ -1350,6 +1486,38 @@ GCRA::allocateRegisters(ArrayList& insns)
        if (lval) {
           nodes[i].init(regs, lval);
           RIG.insert(&nodes[i]);
+
+         if (lval->inFile(FILE_GPR) && lval->getInsn() != NULL) {
+            Instruction *insn = lval->getInsn();
+            if (insn->op != OP_MAD && insn->op != OP_FMA && insn->op != OP_SAD)
+               continue;
+            // For both of the cases below, we only want to add the preference
+            // if all arguments are in registers.
+            if (insn->src(0).getFile() != FILE_GPR ||
+                insn->src(1).getFile() != FILE_GPR ||
+                insn->src(2).getFile() != FILE_GPR)
+               continue;
+            if (prog->getTarget()->getChipset() < 0xc0) {
+               // Outputting a flag is not supported with short encodings nor
+               // with immediate arguments.
+               // See handleMADforNV50.
+               if (insn->flagsDef >= 0)
+                  continue;
+            } else {
+               // We can only fold immediate arguments if dst == src2. This
+               // only matters if one of the first two arguments is an
+               // immediate. This form is also only supported for floats.
+               // See handleMADforNVC0.
+               ImmediateValue imm;
+               if (insn->dType != TYPE_F32)
+                  continue;
+               if (!insn->src(0).getImmediate(imm) &&
+                   !insn->src(1).getImmediate(imm))
+                  continue;
+            }
+
+            nodes[i].addRegPreference(getNode(insn->getSrc(2)->asLValue()));
+         }
        }
     }
  
@@ -1363,7 +1531,9 @@ GCRA::allocateRegisters(ArrayList& insns)
  
     buildRIG(insns);
     calculateSpillWeights();
-   simplify();
+   ret = simplify();
+   if (!ret)
+      goto out;
  
     ret = selectRegisters();
     if (!ret) {
@@ -1416,6 +1586,9 @@ GCRA::cleanup(const bool success)
  
     delete[] nodes;
     nodes = NULL;
+   hi.next = hi.prev = &hi;
+   lo[0].next = lo[0].prev = &lo[0];
+   lo[1].next = lo[1].prev = &lo[1];
  }
  
  Symbol *
@@ -1468,12 +1641,12 @@ SpillCodeInserter::assignSlot(const Interval &livei, const unsigned int size)
     return slot.sym;
  }
  
-Symbol *
-SpillCodeInserter::offsetSlot(Symbol *base, const LValue *lval)
+Value *
+SpillCodeInserter::offsetSlot(Value *base, const LValue *lval)
  {
-   if (!base || !lval->compound || (lval->compMask & 0x1))
+   if (!lval->compound || (lval->compMask & 0x1))
        return base;
-   Symbol *slot = cloneShallow(func, base);
+   Value *slot = cloneShallow(func, base);
  
     slot->reg.data.offset += (ffs(lval->compMask) - 1) * lval->reg.size;
     slot->reg.size = lval->reg.size;
@@ -1486,18 +1659,38 @@ SpillCodeInserter::spill(Instruction *defi, Value *slot, LValue *lval)
  {
     const DataType ty = typeOfSize(lval->reg.size);
  
-   slot = offsetSlot(slot->asSym(), lval);
+   slot = offsetSlot(slot, lval);
  
     Instruction *st;
     if (slot->reg.file == FILE_MEMORY_LOCAL) {
-      st = new_Instruction(func, OP_STORE, ty);
-      st->setSrc(0, slot);
-      st->setSrc(1, lval);
        lval->noSpill = 1;
+      if (ty != TYPE_B96) {
+         st = new_Instruction(func, OP_STORE, ty);
+         st->setSrc(0, slot);
+         st->setSrc(1, lval);
+      } else {
+         st = new_Instruction(func, OP_SPLIT, ty);
+         st->setSrc(0, lval);
+         for (int d = 0; d < lval->reg.size / 4; ++d)
+            st->setDef(d, new_LValue(func, FILE_GPR));
+
+         for (int d = lval->reg.size / 4 - 1; d >= 0; --d) {
+            Value *tmp = cloneShallow(func, slot);
+            tmp->reg.size = 4;
+            tmp->reg.data.offset += 4 * d;
+
+            Instruction *s = new_Instruction(func, OP_STORE, TYPE_U32);
+            s->setSrc(0, tmp);
+            s->setSrc(1, st->getDef(d));
+            defi->bb->insertAfter(defi, s);
+         }
+      }
     } else {
        st = new_Instruction(func, OP_CVT, ty);
        st->setDef(0, slot);
        st->setSrc(0, lval);
+      if (lval->reg.file == FILE_FLAGS)
+         st->flagsSrc = 0;
     }
     defi->bb->insertAfter(defi, st);
  }
@@ -1507,23 +1700,52 @@ SpillCodeInserter::unspill(Instruction *usei, LValue *lval, Value *slot)
  {
     const DataType ty = typeOfSize(lval->reg.size);
  
-   slot = offsetSlot(slot->asSym(), lval);
+   slot = offsetSlot(slot, lval);
     lval = cloneShallow(func, lval);
  
     Instruction *ld;
     if (slot->reg.file == FILE_MEMORY_LOCAL) {
        lval->noSpill = 1;
-      ld = new_Instruction(func, OP_LOAD, ty);
+      if (ty != TYPE_B96) {
+         ld = new_Instruction(func, OP_LOAD, ty);
+      } else {
+         ld = new_Instruction(func, OP_MERGE, ty);
+         for (int d = 0; d < lval->reg.size / 4; ++d) {
+            Value *tmp = cloneShallow(func, slot);
+            LValue *val;
+            tmp->reg.size = 4;
+            tmp->reg.data.offset += 4 * d;
+
+            Instruction *l = new_Instruction(func, OP_LOAD, TYPE_U32);
+            l->setDef(0, (val = new_LValue(func, FILE_GPR)));
+            l->setSrc(0, tmp);
+            usei->bb->insertBefore(usei, l);
+            ld->setSrc(d, val);
+            val->noSpill = 1;
+         }
+         ld->setDef(0, lval);
+         usei->bb->insertBefore(usei, ld);
+         return lval;
+      }
     } else {
        ld = new_Instruction(func, OP_CVT, ty);
     }
     ld->setDef(0, lval);
     ld->setSrc(0, slot);
+   if (lval->reg.file == FILE_FLAGS)
+      ld->flagsDef = 0;
  
     usei->bb->insertBefore(usei, ld);
     return lval;
  }
  
+static bool
+value_cmp(ValueRef *a, ValueRef *b) {
+   Instruction *ai = a->getInsn(), *bi = b->getInsn();
+   if (ai->bb != bi->bb)
+      return ai->bb->getId() < bi->bb->getId();
+   return ai->serial < bi->serial;
+}
  
  // For each value that is to be spilled, go through all its definitions.
  // A value can have multiple definitions if it has been coalesced before.
@@ -1542,6 +1764,11 @@ SpillCodeInserter::run(const std::list<ValuePair>& lst)
        LValue *lval = it->first->asLValue();
        Symbol *mem = it->second ? it->second->asSym() : NULL;
  
+      // Keep track of which instructions to delete later. Deleting them
+      // inside the loop is unsafe since a single instruction may have
+      // multiple destinations that all need to be spilled (like OP_SPLIT).
+      unordered_set<Instruction *> to_del;
+
        for (Value::DefIterator d = lval->defs.begin(); d != lval->defs.end();
             ++d) {
           Value *slot = mem ?
@@ -1552,18 +1779,25 @@ SpillCodeInserter::run(const std::list<ValuePair>& lst)
           LValue *dval = (*d)->get()->asLValue();
           Instruction *defi = (*d)->getInsn();
  
+         // Sort all the uses by BB/instruction so that we don't unspill
+         // multiple times in a row, and also remove a source of
+         // non-determinism.
+         std::vector<ValueRef *> refs(dval->uses.begin(), dval->uses.end());
+         std::sort(refs.begin(), refs.end(), value_cmp);
+
           // Unspill at each use *before* inserting spill instructions,
           // we don't want to have the spill instructions in the use list here.
-         while (!dval->uses.empty()) {
-            ValueRef *u = dval->uses.front();
+         for (std::vector<ValueRef*>::const_iterator it = refs.begin();
+              it != refs.end(); ++it) {
+            ValueRef *u = *it;
              Instruction *usei = u->getInsn();
              assert(usei);
              if (usei->isPseudo()) {
                 tmp = (slot->reg.file == FILE_MEMORY_LOCAL) ? NULL : slot;
                 last = NULL;
-            } else
-            if (!last || usei != last->next) { // TODO: sort uses
-               tmp = unspill(usei, dval, slot);
+            } else {
+               if (!last || (usei != last->next && usei != last))
+                  tmp = unspill(usei, dval, slot);
                 last = usei;
              }
              u->set(tmp);
@@ -1574,7 +1808,7 @@ SpillCodeInserter::run(const std::list<ValuePair>& lst)
              d = lval->defs.erase(d);
              --d;
              if (slot->reg.file == FILE_MEMORY_LOCAL)
-               delete_Instruction(func->getProgram(), defi);
+               to_del.insert(defi);
              else
                 defi->setDef(0, slot);
           } else {
@@ -1582,6 +1816,9 @@ SpillCodeInserter::run(const std::list<ValuePair>& lst)
           }
        }
  
+      for (unordered_set<Instruction *>::const_iterator it = to_del.begin();
+           it != to_del.end(); ++it)
+         delete_Instruction(func->getProgram(), *it);
     }
  
     // TODO: We're not trying to reuse old slots in a potential next iteration.
@@ -1652,6 +1889,10 @@ RegAlloc::execFunc()
             ret && i <= func->loopNestingBound;
             sequence = func->cfg.nextSequence(), ++i)
           ret = buildLiveSets(BasicBlock::get(func->cfg.getRoot()));
+      // reset marker
+      for (ArrayList::Iterator bi = func->allBBlocks.iterator();
+           !bi.end(); bi.next())
+         BasicBlock::get(bi)->liveSet.marker = false;
        if (!ret)
           break;
        func->orderInstructions(this->insns);
@@ -1697,6 +1938,16 @@ GCRA::resolveSplitsAndMerges()
           Value *v = merge->getSrc(s);
           v->reg.data.id = regs.bytesToId(v, reg);
           v->join = v;
+         // If the value is defined by a phi/union node, we also need to
+         // perform the same fixup on that node's sources, since after RA
+         // their registers should be identical.
+         if (v->getInsn()->op == OP_PHI || v->getInsn()->op == OP_UNION) {
+            Instruction *phi = v->getInsn();
+            for (int phis = 0; phi->srcExists(phis); ++phis) {
+               phi->getSrc(phis)->join = v;
+               phi->getSrc(phis)->reg.data.id = v->reg.data.id;
+            }
+         }
           reg += v->reg.size;
        }
     }
@@ -1817,24 +2068,35 @@ RegAlloc::InsertConstraintsPass::addHazard(Instruction *i, const ValueRef *src)
  void
  RegAlloc::InsertConstraintsPass::condenseDefs(Instruction *insn)
  {
-   uint8_t size = 0;
     int n;
-   for (n = 0; insn->defExists(n) && insn->def(n).getFile() == FILE_GPR; ++n)
-      size += insn->getDef(n)->reg.size;
-   if (n < 2)
+   for (n = 0; insn->defExists(n) && insn->def(n).getFile() == FILE_GPR; ++n);
+   condenseDefs(insn, 0, n - 1);
+}
+
+void
+RegAlloc::InsertConstraintsPass::condenseDefs(Instruction *insn,
+                                              const int a, const int b)
+{
+   uint8_t size = 0;
+   if (a >= b)
        return;
+   for (int s = a; s <= b; ++s)
+      size += insn->getDef(s)->reg.size;
+   if (!size)
+      return;
+
     LValue *lval = new_LValue(func, FILE_GPR);
     lval->reg.size = size;
  
     Instruction *split = new_Instruction(func, OP_SPLIT, typeOfSize(size));
     split->setSrc(0, lval);
-   for (int d = 0; d < n; ++d) {
-      split->setDef(d, insn->getDef(d));
+   for (int d = a; d <= b; ++d) {
+      split->setDef(d - a, insn->getDef(d));
        insn->setDef(d, NULL);
     }
-   insn->setDef(0, lval);
+   insn->setDef(a, lval);
  
-   for (int k = 1, d = n; insn->defExists(d); ++d, ++k) {
+   for (int k = a + 1, d = b + 1; insn->defExists(d); ++d, ++k) {
        insn->setDef(k, insn->getDef(d));
        insn->setDef(d, NULL);
     }
@@ -1844,6 +2106,7 @@ RegAlloc::InsertConstraintsPass::condenseDefs(Instruction *insn)
     insn->bb->insertAfter(insn, split);
     constrList.push_back(split);
  }
+
  void
  RegAlloc::InsertConstraintsPass::condenseSrcs(Instruction *insn,
                                                const int a, const int b)
@@ -1865,14 +2128,9 @@ RegAlloc::InsertConstraintsPass::condenseSrcs(Instruction *insn,
     merge->setDef(0, lval);
     for (int s = a, i = 0; s <= b; ++s, ++i) {
        merge->setSrc(i, insn->getSrc(s));
-      insn->setSrc(s, NULL);
     }
+   insn->moveSources(b + 1, a - b);
     insn->setSrc(a, lval);
-
-   for (int k = a + 1, s = b + 1; insn->srcExists(s); ++s, ++k) {
-      insn->setSrc(k, insn->getSrc(s));
-      insn->setSrc(s, NULL);
-   }
     insn->bb->insertBefore(insn, merge);
  
     insn->putExtraSources(0, save);
@@ -1880,6 +2138,244 @@ RegAlloc::InsertConstraintsPass::condenseSrcs(Instruction *insn,
     constrList.push_back(merge);
  }
  
+bool
+RegAlloc::InsertConstraintsPass::isScalarTexGM107(TexInstruction *tex)
+{
+   if (tex->tex.sIndirectSrc >= 0 ||
+       tex->tex.rIndirectSrc >= 0 ||
+       tex->tex.derivAll)
+      return false;
+
+   if (tex->tex.mask == 5 || tex->tex.mask == 6)
+      return false;
+
+   switch (tex->op) {
+   case OP_TEX:
+   case OP_TXF:
+   case OP_TXG:
+   case OP_TXL:
+      break;
+   default:
+      return false;
+   }
+
+   // legal variants:
+   // TEXS.1D.LZ
+   // TEXS.2D
+   // TEXS.2D.LZ
+   // TEXS.2D.LL
+   // TEXS.2D.DC
+   // TEXS.2D.LL.DC
+   // TEXS.2D.LZ.DC
+   // TEXS.A2D
+   // TEXS.A2D.LZ
+   // TEXS.A2D.LZ.DC
+   // TEXS.3D
+   // TEXS.3D.LZ
+   // TEXS.CUBE
+   // TEXS.CUBE.LL
+
+   // TLDS.1D.LZ
+   // TLDS.1D.LL
+   // TLDS.2D.LZ
+   // TLSD.2D.LZ.AOFFI
+   // TLDS.2D.LZ.MZ
+   // TLDS.2D.LL
+   // TLDS.2D.LL.AOFFI
+   // TLDS.A2D.LZ
+   // TLDS.3D.LZ
+
+   // TLD4S: all 2D/RECT variants and only offset
+
+   switch (tex->op) {
+   case OP_TEX:
+      if (tex->tex.useOffsets)
+         return false;
+
+      switch (tex->tex.target.getEnum()) {
+      case TEX_TARGET_1D:
+      case TEX_TARGET_2D_ARRAY_SHADOW:
+         return tex->tex.levelZero;
+      case TEX_TARGET_CUBE:
+         return !tex->tex.levelZero;
+      case TEX_TARGET_2D:
+      case TEX_TARGET_2D_ARRAY:
+      case TEX_TARGET_2D_SHADOW:
+      case TEX_TARGET_3D:
+      case TEX_TARGET_RECT:
+      case TEX_TARGET_RECT_SHADOW:
+         return true;
+      default:
+         return false;
+      }
+
+   case OP_TXL:
+      if (tex->tex.useOffsets)
+         return false;
+
+      switch (tex->tex.target.getEnum()) {
+      case TEX_TARGET_2D:
+      case TEX_TARGET_2D_SHADOW:
+      case TEX_TARGET_RECT:
+      case TEX_TARGET_RECT_SHADOW:
+      case TEX_TARGET_CUBE:
+         return true;
+      default:
+         return false;
+      }
+
+   case OP_TXF:
+      switch (tex->tex.target.getEnum()) {
+      case TEX_TARGET_1D:
+         return !tex->tex.useOffsets;
+      case TEX_TARGET_2D:
+      case TEX_TARGET_RECT:
+         return true;
+      case TEX_TARGET_2D_ARRAY:
+      case TEX_TARGET_2D_MS:
+      case TEX_TARGET_3D:
+         return !tex->tex.useOffsets && tex->tex.levelZero;
+      default:
+         return false;
+      }
+
+   case OP_TXG:
+      if (tex->tex.useOffsets > 1)
+         return false;
+      if (tex->tex.mask != 0x3 && tex->tex.mask != 0xf)
+         return false;
+
+      switch (tex->tex.target.getEnum()) {
+      case TEX_TARGET_2D:
+      case TEX_TARGET_2D_MS:
+      case TEX_TARGET_2D_SHADOW:
+      case TEX_TARGET_RECT:
+      case TEX_TARGET_RECT_SHADOW:
+         return true;
+      default:
+         return false;
+      }
+
+   default:
+      return false;
+   }
+}
+
+void
+RegAlloc::InsertConstraintsPass::handleScalarTexGM107(TexInstruction *tex)
+{
+   int defCount = tex->defCount(0xff);
+   int srcCount = tex->srcCount(0xff);
+
+   tex->tex.scalar = true;
+
+   // 1. handle defs
+   if (defCount > 3)
+      condenseDefs(tex, 2, 3);
+   if (defCount > 1)
+      condenseDefs(tex, 0, 1);
+
+   // 2. handle srcs
+   // special case for TXF.A2D
+   if (tex->op == OP_TXF && tex->tex.target == TEX_TARGET_2D_ARRAY) {
+      assert(srcCount >= 3);
+      condenseSrcs(tex, 1, 2);
+   } else {
+      if (srcCount > 3)
+         condenseSrcs(tex, 2, 3);
+      // only if we have more than 2 sources
+      if (srcCount > 2)
+         condenseSrcs(tex, 0, 1);
+   }
+
+   assert(!tex->defExists(2) && !tex->srcExists(2));
+}
+
+void
+RegAlloc::InsertConstraintsPass::texConstraintGM107(TexInstruction *tex)
+{
+   int n, s;
+
+   if (isTextureOp(tex->op))
+      textureMask(tex);
+
+   if (targ->getChipset() < NVISA_GV100_CHIPSET) {
+      if (isScalarTexGM107(tex)) {
+         handleScalarTexGM107(tex);
+         return;
+      }
+
+      assert(!tex->tex.scalar);
+      condenseDefs(tex);
+   } else {
+      if (isTextureOp(tex->op)) {
+         int defCount = tex->defCount(0xff);
+         if (defCount > 3)
+            condenseDefs(tex, 2, 3);
+         if (defCount > 1)
+            condenseDefs(tex, 0, 1);
+      } else {
+         condenseDefs(tex);
+      }
+   }
+
+   if (isSurfaceOp(tex->op)) {
+      int s = tex->tex.target.getDim() +
+         (tex->tex.target.isArray() || tex->tex.target.isCube());
+      int n = 0;
+
+      switch (tex->op) {
+      case OP_SUSTB:
+      case OP_SUSTP:
+         n = 4;
+         break;
+      case OP_SUREDB:
+      case OP_SUREDP:
+         if (tex->subOp == NV50_IR_SUBOP_ATOM_CAS)
+            n = 2;
+         break;
+      default:
+         break;
+      }
+
+      if (s > 1)
+         condenseSrcs(tex, 0, s - 1);
+      if (n > 1)
+         condenseSrcs(tex, 1, n); // do not condense the tex handle
+   } else
+   if (isTextureOp(tex->op)) {
+      if (tex->op != OP_TXQ) {
+         s = tex->tex.target.getArgCount() - tex->tex.target.isMS();
+         if (tex->op == OP_TXD) {
+            // Indirect handle belongs in the first arg
+            if (tex->tex.rIndirectSrc >= 0)
+               s++;
+            if (!tex->tex.target.isArray() && tex->tex.useOffsets)
+               s++;
+         }
+         n = tex->srcCount(0xff, true) - s;
+         // TODO: Is this necessary? Perhaps just has to be aligned to the
+         // level that the first arg is, not necessarily to 4. This
+         // requirement has not been rigorously verified, as it has been on
+         // Kepler.
+         if (n > 0 && n < 3) {
+            if (tex->srcExists(n + s)) // move potential predicate out of the way
+               tex->moveSources(n + s, 3 - n);
+            while (n < 3)
+               tex->setSrc(s + n++, new_LValue(func, FILE_GPR));
+         }
+      } else {
+         s = tex->srcCount(0xff, true);
+         n = 0;
+      }
+
+      if (s > 1)
+         condenseSrcs(tex, 0, s - 1);
+      if (n > 1) // NOTE: first call modified positions already
+         condenseSrcs(tex, 1, n);
+   }
+}
+
  void
  RegAlloc::InsertConstraintsPass::texConstraintNVE0(TexInstruction *tex)
  {
@@ -1888,18 +2384,22 @@ RegAlloc::InsertConstraintsPass::texConstraintNVE0(TexInstruction *tex)
     condenseDefs(tex);
  
     if (tex->op == OP_SUSTB || tex->op == OP_SUSTP) {
-      condenseSrcs(tex, 3, (3 + typeSizeof(tex->dType) / 4) - 1);
+      condenseSrcs(tex, 3, 6);
     } else
     if (isTextureOp(tex->op)) {
        int n = tex->srcCount(0xff, true);
-      if (n > 4) {
-         condenseSrcs(tex, 0, 3);
-         if (n > 5) // NOTE: first call modified positions already
-            condenseSrcs(tex, 4 - (4 - 1), n - 1 - (4 - 1));
-      } else
-      if (n > 1) {
-         condenseSrcs(tex, 0, n - 1);
+      int s = n > 4 ? 4 : n;
+      if (n > 4 && n < 7) {
+         if (tex->srcExists(n)) // move potential predicate out of the way
+            tex->moveSources(n, 7 - n);
+
+         while (n < 7)
+            tex->setSrc(n++, new_LValue(func, FILE_GPR));
        }
+      if (s > 1)
+         condenseSrcs(tex, 0, s - 1);
+      if (n > 4)
+         condenseSrcs(tex, 1, n - s);
     }
  }
  
@@ -1908,13 +2408,20 @@ RegAlloc::InsertConstraintsPass::texConstraintNVC0(TexInstruction *tex)
  {
     int n, s;
  
-   textureMask(tex);
+   if (isTextureOp(tex->op))
+      textureMask(tex);
  
     if (tex->op == OP_TXQ) {
        s = tex->srcCount(0xff);
        n = 0;
+   } else if (isSurfaceOp(tex->op)) {
+      s = tex->tex.target.getDim() + (tex->tex.target.isArray() || tex->tex.target.isCube());
+      if (tex->op == OP_SUSTB || tex->op == OP_SUSTP)
+         n = 4;
+      else
+         n = 0;
     } else {
-      s = tex->tex.target.getArgCount();
+      s = tex->tex.target.getArgCount() - tex->tex.target.isMS();
        if (!tex->tex.target.isArray() &&
            (tex->tex.rIndirectSrc >= 0 || tex->tex.sIndirectSrc >= 0))
           ++s;
@@ -1947,6 +2454,8 @@ RegAlloc::InsertConstraintsPass::texConstraintNV50(TexInstruction *tex)
     for (c = 0; tex->srcExists(c) || tex->defExists(c); ++c) {
        if (!tex->srcExists(c))
           tex->setSrc(c, new_LValue(func, tex->getSrc(0)->asLValue()));
+      else
+         insertConstraintMove(tex, c);
        if (!tex->defExists(c))
           tex->setDef(c, new_LValue(func, tex->getDef(0)->asLValue()));
     }
@@ -1987,6 +2496,13 @@ RegAlloc::InsertConstraintsPass::visit(BasicBlock *bb)
           case 0x100:
              texConstraintNVE0(tex);
              break;
+         case 0x110:
+         case 0x120:
+         case 0x130:
+         case 0x140:
+         case 0x160:
+            texConstraintGM107(tex);
+            break;
           default:
              break;
           }
@@ -2002,6 +2518,8 @@ RegAlloc::InsertConstraintsPass::visit(BasicBlock *bb)
           condenseDefs(i);
           if (i->src(0).isIndirect(0) && typeSizeof(i->dType) >= 8)
              addHazard(i, i->src(0).getIndirect(0));
+         if (i->src(0).isIndirect(1) && typeSizeof(i->dType) >= 8)
+            addHazard(i, i->src(0).getIndirect(1));
        } else
        if (i->op == OP_UNION ||
            i->op == OP_MERGE ||
@@ -2012,6 +2530,54 @@ RegAlloc::InsertConstraintsPass::visit(BasicBlock *bb)
     return true;
  }
  
+void
+RegAlloc::InsertConstraintsPass::insertConstraintMove(Instruction *cst, int s)
+{
+   const uint8_t size = cst->src(s).getSize();
+
+   assert(cst->getSrc(s)->defs.size() == 1); // still SSA
+
+   Instruction *defi = cst->getSrc(s)->defs.front()->getInsn();
+
+   bool imm = defi->op == OP_MOV &&
+      defi->src(0).getFile() == FILE_IMMEDIATE;
+   bool load = defi->op == OP_LOAD &&
+      defi->src(0).getFile() == FILE_MEMORY_CONST &&
+      !defi->src(0).isIndirect(0);
+   // catch some cases where don't really need MOVs
+   if (cst->getSrc(s)->refCount() == 1 && !defi->constrainedDefs()) {
+      if (imm || load) {
+         // Move the defi right before the cst. No point in expanding
+         // the range.
+         defi->bb->remove(defi);
+         cst->bb->insertBefore(cst, defi);
+      }
+      return;
+   }
+
+   LValue *lval = new_LValue(func, cst->src(s).getFile());
+   lval->reg.size = size;
+
+   Instruction *mov = new_Instruction(func, OP_MOV, typeOfSize(size));
+   mov->setDef(0, lval);
+   mov->setSrc(0, cst->getSrc(s));
+
+   if (load) {
+      mov->op = OP_LOAD;
+      mov->setSrc(0, defi->getSrc(0));
+   } else if (imm) {
+      mov->setSrc(0, defi->getSrc(0));
+   }
+
+   if (defi->getPredicate())
+      mov->setPredicate(defi->cc, defi->getPredicate());
+
+   cst->setSrc(s, mov->getDef(0));
+   cst->bb->insertBefore(cst, mov);
+
+   cst->getDef(0)->asLValue()->noSpill = 1; // doesn't help
+}
+
  // Insert extra moves so that, if multiple register constraints on a value are
  // in conflict, these conflicts can be resolved.
  bool
@@ -2052,26 +2618,8 @@ RegAlloc::InsertConstraintsPass::insertConstraintMoves()
                 cst->bb->insertBefore(cst, mov);
                 continue;
              }
-            assert(cst->getSrc(s)->defs.size() == 1); // still SSA
-
-            Instruction *defi = cst->getSrc(s)->defs.front()->getInsn();
-            // catch some cases where don't really need MOVs
-            if (cst->getSrc(s)->refCount() == 1 && !defi->constrainedDefs())
-               continue;
-
-            LValue *lval = new_LValue(func, cst->src(s).getFile());
-            lval->reg.size = size;
-
-            mov = new_Instruction(func, OP_MOV, typeOfSize(size));
-            mov->setDef(0, lval);
-            mov->setSrc(0, cst->getSrc(s));
-            cst->setSrc(s, mov->getDef(0));
-            cst->bb->insertBefore(cst, mov);
-
-            cst->getDef(0)->asLValue()->noSpill = 1; // doesn't help
  
-            if (cst->op == OP_UNION)
-               mov->setPredicate(defi->cc, defi->getPredicate());
+            insertConstraintMove(cst, s);
           }
        }
     }