Merge remote-tracking branch 'mesa-public/master' into vulkan

[mesa.git] / src / gallium / drivers / nouveau / codegen / nv50_ir_lowering_nvc0.cpp
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp

index c7e9063fe65a3c974e70e6432bf9799e1691971b..0f575f2eeddd42035ff8d296fcaf854e4a55f0e6 100644 (file)
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -24,6 +24,7 @@
  #include "codegen/nv50_ir_build_util.h"
  
  #include "codegen/nv50_ir_target_nvc0.h"
+#include "codegen/nv50_ir_lowering_nvc0.h"
  
  #include <limits>
  
@@ -39,20 +40,6 @@ namespace nv50_ir {
     ((QOP_##q << 6) | (QOP_##r << 4) |           \
      (QOP_##s << 2) | (QOP_##t << 0))
  
-class NVC0LegalizeSSA : public Pass
-{
-private:
-   virtual bool visit(BasicBlock *);
-   virtual bool visit(Function *);
-
-   // we want to insert calls to the builtin library only after optimization
-   void handleDIV(Instruction *); // integer division, modulus
-   void handleRCPRSQ(Instruction *); // double precision float recip/rsqrt
-
-private:
-   BuildUtil bld;
-};
-
  void
  NVC0LegalizeSSA::handleDIV(Instruction *i)
  {
@@ -83,7 +70,49 @@ NVC0LegalizeSSA::handleDIV(Instruction *i)
  void
  NVC0LegalizeSSA::handleRCPRSQ(Instruction *i)
  {
-   // TODO
+   assert(i->dType == TYPE_F64);
+   // There are instructions that will compute the high 32 bits of the 64-bit
+   // float. We will just stick 0 in the bottom 32 bits.
+
+   bld.setPosition(i, false);
+
+   // 1. Take the source and split it up.
+   Value *src[2], *dst[2], *def = i->getDef(0);
+   bld.mkSplit(src, 4, i->getSrc(0));
+
+   // 2. We don't care about the low 32 bits of the destination. Stick a 0 in.
+   dst[0] = bld.loadImm(NULL, 0);
+   dst[1] = bld.getSSA();
+
+   // 3. The new version of the instruction takes the high 32 bits of the
+   // source and outputs the high 32 bits of the destination.
+   i->setSrc(0, src[1]);
+   i->setDef(0, dst[1]);
+   i->setType(TYPE_F32);
+   i->subOp = NV50_IR_SUBOP_RCPRSQ_64H;
+
+   // 4. Recombine the two dst pieces back into the original destination.
+   bld.setPosition(i, true);
+   bld.mkOp2(OP_MERGE, TYPE_U64, def, dst[0], dst[1]);
+}
+
+void
+NVC0LegalizeSSA::handleFTZ(Instruction *i)
+{
+   // Only want to flush float inputs
+   assert(i->sType == TYPE_F32);
+
+   // If we're already flushing denorms (and NaNs) to zero, no need for this.
+   if (i->dnz)
+      return;
+
+   // Only certain classes of operations can flush
+   OpClass cls = prog->getTarget()->getOpClass(i->op);
+   if (cls != OPCLASS_ARITH && cls != OPCLASS_COMPARE &&
+       cls != OPCLASS_CONVERT)
+      return;
+
+   i->ftz = true;
  }
  
  bool
@@ -99,8 +128,11 @@ NVC0LegalizeSSA::visit(BasicBlock *bb)
     Instruction *next;
     for (Instruction *i = bb->getEntry(); i; i = next) {
        next = i->next;
-      if (i->dType == TYPE_F32)
+      if (i->sType == TYPE_F32) {
+         if (prog->getType() != Program::TYPE_COMPUTE)
+            handleFTZ(i);
           continue;
+      }
        switch (i->op) {
        case OP_DIV:
        case OP_MOD:
@@ -118,49 +150,6 @@ NVC0LegalizeSSA::visit(BasicBlock *bb)
     return true;
  }
  
-class NVC0LegalizePostRA : public Pass
-{
-public:
-   NVC0LegalizePostRA(const Program *);
-
-private:
-   virtual bool visit(Function *);
-   virtual bool visit(BasicBlock *);
-
-   void replaceZero(Instruction *);
-   bool tryReplaceContWithBra(BasicBlock *);
-   void propagateJoin(BasicBlock *);
-
-   struct TexUse
-   {
-      TexUse(Instruction *use, const Instruction *tex)
-         : insn(use), tex(tex), level(-1) { }
-      Instruction *insn;
-      const Instruction *tex; // or split / mov
-      int level;
-   };
-   struct Limits
-   {
-      Limits() { }
-      Limits(int min, int max) : min(min), max(max) { }
-      int min, max;
-   };
-   bool insertTextureBarriers(Function *);
-   inline bool insnDominatedBy(const Instruction *, const Instruction *) const;
-   void findFirstUses(const Instruction *tex, const Instruction *def,
-                      std::list<TexUse>&);
-   void findOverwritingDefs(const Instruction *tex, Instruction *insn,
-                            const BasicBlock *term,
-                            std::list<TexUse>&);
-   void addTexUse(std::list<TexUse>&, Instruction *, const Instruction *);
-   const Instruction *recurseDef(const Instruction *);
-
-private:
-   LValue *rZero;
-   LValue *carry;
-   const bool needTexBar;
-};
-
  NVC0LegalizePostRA::NVC0LegalizePostRA(const Program *prog)
     : rZero(NULL),
       carry(NULL),
@@ -179,7 +168,7 @@ NVC0LegalizePostRA::insnDominatedBy(const Instruction *later,
  
  void
  NVC0LegalizePostRA::addTexUse(std::list<TexUse> &uses,
-                              Instruction *usei, const Instruction *insn)
+                              Instruction *usei, const Instruction *texi)
  {
     bool add = true;
     for (std::list<TexUse>::iterator it = uses.begin();
@@ -194,7 +183,7 @@ NVC0LegalizePostRA::addTexUse(std::list<TexUse> &uses,
           ++it;
     }
     if (add)
-      uses.push_back(TexUse(usei, insn));
+      uses.push_back(TexUse(usei, texi));
  }
  
  void
@@ -206,7 +195,8 @@ NVC0LegalizePostRA::findOverwritingDefs(const Instruction *texi,
     while (insn->op == OP_MOV && insn->getDef(0)->equals(insn->getSrc(0)))
        insn = insn->getSrc(0)->getUniqueInsn();
  
-   if (!insn || !insn->bb->reachableBy(texi->bb, term))
+   // NOTE: the tex itself is, of course, not an overwriting definition
+   if (insn == texi || !insn->bb->reachableBy(texi->bb, term))
        return;
  
     switch (insn->op) {
@@ -230,17 +220,36 @@ NVC0LegalizePostRA::findOverwritingDefs(const Instruction *texi,
  }
  
  void
-NVC0LegalizePostRA::findFirstUses(const Instruction *texi,
-                                  const Instruction *insn,
-                                  std::list<TexUse> &uses)
+NVC0LegalizePostRA::findFirstUses(
+      const Instruction *texi,
+      const Instruction *insn,
+      std::list<TexUse> &uses,
+      unordered_set<const Instruction *>& visited)
  {
     for (int d = 0; insn->defExists(d); ++d) {
        Value *v = insn->getDef(d);
        for (Value::UseIterator u = v->uses.begin(); u != v->uses.end(); ++u) {
           Instruction *usei = (*u)->getInsn();
  
+         // NOTE: In case of a loop that overwrites a value but never uses
+         // it, it can happen that we have a cycle of uses that consists only
+         // of phis and no-op moves and will thus cause an infinite loop here
+         // since these are not considered actual uses.
+         // The most obvious (and perhaps the only) way to prevent this is to
+         // remember which instructions we've already visited.
+
+         if (visited.find(usei) != visited.end())
+            continue;
+
+         visited.insert(usei);
+
           if (usei->op == OP_PHI || usei->op == OP_UNION) {
-            // need a barrier before WAW cases
+            // need a barrier before WAW cases, like:
+            //   %r0 = tex
+            //   if ...
+            //     texbar <- is required or tex might replace x again
+            //     %r1 = x <- overwriting def
+            //   %r2 = phi %r0, %r1
              for (int s = 0; usei->srcExists(s); ++s) {
                 Instruction *defi = usei->getSrc(s)->getUniqueInsn();
                 if (defi && &usei->src(s) != *u)
@@ -253,13 +262,13 @@ NVC0LegalizePostRA::findFirstUses(const Instruction *texi,
               usei->op == OP_PHI ||
               usei->op == OP_UNION) {
              // these uses don't manifest in the machine code
-            findFirstUses(texi, usei, uses);
+            findFirstUses(texi, usei, uses, visited);
           } else
           if (usei->op == OP_MOV && usei->getDef(0)->equals(usei->getSrc(0)) &&
               usei->subOp != NV50_IR_SUBOP_MOV_FINAL) {
-            findFirstUses(texi, usei, uses);
+            findFirstUses(texi, usei, uses, visited);
           } else {
-            addTexUse(uses, usei, insn);
+            addTexUse(uses, usei, texi);
           }
        }
     }
@@ -313,8 +322,10 @@ NVC0LegalizePostRA::insertTextureBarriers(Function *fn)
     uses = new std::list<TexUse>[texes.size()];
     if (!uses)
        return false;
-   for (size_t i = 0; i < texes.size(); ++i)
-      findFirstUses(texes[i], texes[i], uses[i]);
+   for (size_t i = 0; i < texes.size(); ++i) {
+      unordered_set<const Instruction *> visited;
+      findFirstUses(texes[i], texes[i], uses[i], visited);
+   }
  
     // determine the barrier level at each use
     for (size_t i = 0; i < texes.size(); ++i) {
@@ -349,7 +360,6 @@ NVC0LegalizePostRA::insertTextureBarriers(Function *fn)
        }
     }
     delete[] uses;
-   uses = NULL;
  
     // insert the barriers
     for (size_t i = 0; i < useVec.size(); ++i) {
@@ -370,11 +380,8 @@ NVC0LegalizePostRA::insertTextureBarriers(Function *fn)
        }
     }
  
-   if (fn->getProgram()->optLevel < 3) {
-      if (uses)
-         delete[] uses;
+   if (fn->getProgram()->optLevel < 3)
        return true;
-   }
  
     std::vector<Limits> limitT, limitB, limitS; // entry, exit, single
  
@@ -459,8 +466,6 @@ NVC0LegalizePostRA::insertTextureBarriers(Function *fn)
              prev = i;
        }
     }
-   if (uses)
-      delete[] uses;
     return true;
  }
  
@@ -550,9 +555,16 @@ NVC0LegalizePostRA::visit(BasicBlock *bb)
              i->setDef(0, NULL);
           if (i->src(0).getFile() == FILE_IMMEDIATE)
              i->setSrc(0, rZero); // initial value must be 0
+         replaceZero(i);
        } else
        if (i->isNop()) {
           bb->remove(i);
+      } else
+      if (i->op == OP_BAR && i->subOp == NV50_IR_SUBOP_BAR_SYNC &&
+          prog->getType() != Program::TYPE_COMPUTE) {
+         // It seems like barriers are never required for tessellation since
+         // the warp size is 32, and there are always at most 32 tcs threads.
+         bb->remove(i);
        } else {
           // TODO: Move this to before register allocation for operations that
           // need the $c register !
@@ -576,53 +588,6 @@ NVC0LegalizePostRA::visit(BasicBlock *bb)
     return true;
  }
  
-class NVC0LoweringPass : public Pass
-{
-public:
-   NVC0LoweringPass(Program *);
-
-private:
-   virtual bool visit(Function *);
-   virtual bool visit(BasicBlock *);
-   virtual bool visit(Instruction *);
-
-   bool handleRDSV(Instruction *);
-   bool handleWRSV(Instruction *);
-   bool handleEXPORT(Instruction *);
-   bool handleOUT(Instruction *);
-   bool handleDIV(Instruction *);
-   bool handleMOD(Instruction *);
-   bool handleSQRT(Instruction *);
-   bool handlePOW(Instruction *);
-   bool handleTEX(TexInstruction *);
-   bool handleTXD(TexInstruction *);
-   bool handleTXQ(TexInstruction *);
-   bool handleManualTXD(TexInstruction *);
-   bool handleTXLQ(TexInstruction *);
-   bool handleATOM(Instruction *);
-   bool handleCasExch(Instruction *, bool needCctl);
-   void handleSurfaceOpNVE4(TexInstruction *);
-
-   void checkPredicate(Instruction *);
-
-   void readTessCoord(LValue *dst, int c);
-
-   Value *loadResInfo32(Value *ptr, uint32_t off);
-   Value *loadMsInfo32(Value *ptr, uint32_t off);
-   Value *loadTexHandle(Value *ptr, unsigned int slot);
-
-   void adjustCoordinatesMS(TexInstruction *);
-   void processSurfaceCoordsNVE4(TexInstruction *);
-
-private:
-   const Target *const targ;
-
-   BuildUtil bld;
-
-   Symbol *gMemBase;
-   LValue *gpEmitAddress;
-};
-
  NVC0LoweringPass::NVC0LoweringPass(Program *prog) : targ(prog->getTarget())
  {
     bld.setProgram(prog);
@@ -669,11 +634,59 @@ NVC0LoweringPass::handleTEX(TexInstruction *i)
     const int lyr = arg - (i->tex.target.isMS() ? 2 : 1);
     const int chipset = prog->getTarget()->getChipset();
  
+   // Arguments to the TEX instruction are a little insane. Even though the
+   // encoding is identical between SM20 and SM30, the arguments mean
+   // different things between Fermi and Kepler+. A lot of arguments are
+   // optional based on flags passed to the instruction. This summarizes the
+   // order of things.
+   //
+   // Fermi:
+   //  array/indirect
+   //  coords
+   //  sample
+   //  lod bias
+   //  depth compare
+   //  offsets:
+   //    - tg4: 8 bits each, either 2 (1 offset reg) or 8 (2 offset reg)
+   //    - other: 4 bits each, single reg
+   //
+   // Kepler+:
+   //  indirect handle
+   //  array (+ offsets for txd in upper 16 bits)
+   //  coords
+   //  sample
+   //  lod bias
+   //  depth compare
+   //  offsets (same as fermi, except txd which takes it with array)
+   //
+   // Maxwell (tex):
+   //  array
+   //  coords
+   //  indirect handle
+   //  sample
+   //  lod bias
+   //  depth compare
+   //  offsets
+   //
+   // Maxwell (txd):
+   //  indirect handle
+   //  coords
+   //  array + offsets
+   //  derivatives
+
     if (chipset >= NVISA_GK104_CHIPSET) {
        if (i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
-         WARN("indirect TEX not implemented\n");
-      }
-      if (i->tex.r == i->tex.s) {
+         // XXX this ignores tsc, and assumes a 1:1 mapping
+         assert(i->tex.rIndirectSrc >= 0);
+         Value *hnd = loadTexHandle(
+               bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
+                          i->getIndirectR(), bld.mkImm(2)),
+               i->tex.r);
+         i->tex.r = 0xff;
+         i->tex.s = 0x1f;
+         i->setIndirectR(hnd);
+         i->setIndirectS(NULL);
+      } else if (i->tex.r == i->tex.s || i->op == OP_TXF) {
           i->tex.r += prog->driver->io.texBindBase / 4;
           i->tex.s  = 0; // only a single cX[] value possible here
        } else {
@@ -693,23 +706,51 @@ NVC0LoweringPass::handleTEX(TexInstruction *i)
           const int sat = (i->op == OP_TXF) ? 1 : 0;
           DataType sTy = (i->op == OP_TXF) ? TYPE_U32 : TYPE_F32;
           bld.mkCvt(OP_CVT, TYPE_U16, layer, sTy, src)->saturate = sat;
-         for (int s = dim; s >= 1; --s)
-            i->setSrc(s, i->getSrc(s - 1));
-         i->setSrc(0, layer);
+         if (i->op != OP_TXD || chipset < NVISA_GM107_CHIPSET) {
+            for (int s = dim; s >= 1; --s)
+               i->setSrc(s, i->getSrc(s - 1));
+            i->setSrc(0, layer);
+         } else {
+            i->setSrc(dim, layer);
+         }
+      }
+      // Move the indirect reference to the first place
+      if (i->tex.rIndirectSrc >= 0 && (
+                i->op == OP_TXD || chipset < NVISA_GM107_CHIPSET)) {
+         Value *hnd = i->getIndirectR();
+
+         i->setIndirectR(NULL);
+         i->moveSources(0, 1);
+         i->setSrc(0, hnd);
+         i->tex.rIndirectSrc = 0;
+         i->tex.sIndirectSrc = -1;
        }
     } else
     // (nvc0) generate and move the tsc/tic/array source to the front
     if (i->tex.target.isArray() || i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
        LValue *src = new_LValue(func, FILE_GPR); // 0xttxsaaaa
  
+      Value *ticRel = i->getIndirectR();
+      Value *tscRel = i->getIndirectS();
+
+      if (ticRel) {
+         i->setSrc(i->tex.rIndirectSrc, NULL);
+         if (i->tex.r)
+            ticRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
+                                ticRel, bld.mkImm(i->tex.r));
+      }
+      if (tscRel) {
+         i->setSrc(i->tex.sIndirectSrc, NULL);
+         if (i->tex.s)
+            tscRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
+                                tscRel, bld.mkImm(i->tex.s));
+      }
+
        Value *arrayIndex = i->tex.target.isArray() ? i->getSrc(lyr) : NULL;
        for (int s = dim; s >= 1; --s)
           i->setSrc(s, i->getSrc(s - 1));
        i->setSrc(0, arrayIndex);
  
-      Value *ticRel = i->getIndirectR();
-      Value *tscRel = i->getIndirectS();
-
        if (arrayIndex) {
           int sat = (i->op == OP_TXF) ? 1 : 0;
           DataType sTy = (i->op == OP_TXF) ? TYPE_U32 : TYPE_F32;
@@ -718,14 +759,10 @@ NVC0LoweringPass::handleTEX(TexInstruction *i)
           bld.loadImm(src, 0);
        }
  
-      if (ticRel) {
-         i->setSrc(i->tex.rIndirectSrc, NULL);
+      if (ticRel)
           bld.mkOp3(OP_INSBF, TYPE_U32, src, ticRel, bld.mkImm(0x0917), src);
-      }
-      if (tscRel) {
-         i->setSrc(i->tex.sIndirectSrc, NULL);
+      if (tscRel)
           bld.mkOp3(OP_INSBF, TYPE_U32, src, tscRel, bld.mkImm(0x0710), src);
-      }
  
        i->setSrc(0, src);
     }
@@ -737,23 +774,66 @@ NVC0LoweringPass::handleTEX(TexInstruction *i)
     assert(chipset >= NVISA_GK104_CHIPSET ||
            !i->tex.useOffsets || !i->tex.target.isMS());
  
-   // offset is last source (lod 1st, dc 2nd)
+   // offset is between lod and dc
     if (i->tex.useOffsets) {
-      uint32_t value = 0;
        int n, c;
        int s = i->srcCount(0xff, true);
-      if (i->srcExists(s)) // move potential predicate out of the way
-         i->moveSources(s, 1);
+      if (i->op != OP_TXD || chipset < NVISA_GK104_CHIPSET) {
+         if (i->tex.target.isShadow())
+            s--;
+         if (i->srcExists(s)) // move potential predicate out of the way
+            i->moveSources(s, 1);
+         if (i->tex.useOffsets == 4 && i->srcExists(s + 1))
+            i->moveSources(s + 1, 1);
+      }
        if (i->op == OP_TXG) {
-         assert(i->tex.useOffsets == 1);
-         for (c = 0; c < 3; ++c)
-            value |= (i->tex.offset[0][c] & 0xff) << (c * 8);
+         // Either there is 1 offset, which goes into the 2 low bytes of the
+         // first source, or there are 4 offsets, which go into 2 sources (8
+         // values, 1 byte each).
+         Value *offs[2] = {NULL, NULL};
+         for (n = 0; n < i->tex.useOffsets; n++) {
+            for (c = 0; c < 2; ++c) {
+               if ((n % 2) == 0 && c == 0)
+                  offs[n / 2] = i->offset[n][c].get();
+               else
+                  bld.mkOp3(OP_INSBF, TYPE_U32,
+                            offs[n / 2],
+                            i->offset[n][c].get(),
+                            bld.mkImm(0x800 | ((n * 16 + c * 8) % 32)),
+                            offs[n / 2]);
+            }
+         }
+         i->setSrc(s, offs[0]);
+         if (offs[1])
+            i->setSrc(s + 1, offs[1]);
        } else {
-         for (n = 0; n < i->tex.useOffsets; ++n)
-            for (c = 0; c < 3; ++c)
-               value |= (i->tex.offset[n][c] & 0xf) << (n * 12 + c * 4);
+         unsigned imm = 0;
+         assert(i->tex.useOffsets == 1);
+         for (c = 0; c < 3; ++c) {
+            ImmediateValue val;
+            if (!i->offset[0][c].getImmediate(val))
+               assert(!"non-immediate offset passed to non-TXG");
+            imm |= (val.reg.data.u32 & 0xf) << (c * 4);
+         }
+         if (i->op == OP_TXD && chipset >= NVISA_GK104_CHIPSET) {
+            // The offset goes into the upper 16 bits of the array index. So
+            // create it if it's not already there, and INSBF it if it already
+            // is.
+            s = (i->tex.rIndirectSrc >= 0) ? 1 : 0;
+            if (chipset >= NVISA_GM107_CHIPSET)
+               s += dim;
+            if (i->tex.target.isArray()) {
+               bld.mkOp3(OP_INSBF, TYPE_U32, i->getSrc(s),
+                         bld.loadImm(NULL, imm), bld.mkImm(0xc10),
+                         i->getSrc(s));
+            } else {
+               i->moveSources(s, 1);
+               i->setSrc(s, bld.loadImm(NULL, imm << 16));
+            }
+         } else {
+            i->setSrc(s, bld.loadImm(NULL, imm));
+         }
        }
-      i->setSrc(s, bld.loadImm(NULL, value));
     }
  
     if (chipset >= NVISA_GK104_CHIPSET) {
@@ -791,6 +871,7 @@ NVC0LoweringPass::handleManualTXD(TexInstruction *i)
     Value *zero = bld.loadImm(bld.getSSA(), 0);
     int l, c;
     const int dim = i->tex.target.getDim();
+   const int array = i->tex.target.isArray();
  
     i->op = OP_TEX; // no need to clone dPdx/dPdy later
  
@@ -801,7 +882,7 @@ NVC0LoweringPass::handleManualTXD(TexInstruction *i)
     for (l = 0; l < 4; ++l) {
        // mov coordinates from lane l to all lanes
        for (c = 0; c < dim; ++c)
-         bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero);
+         bld.mkQuadop(0x00, crd[c], l, i->getSrc(c + array), zero);
        // add dPdx from lane l to lanes dx
        for (c = 0; c < dim; ++c)
           bld.mkQuadop(qOps[l][0], crd[c], l, i->dPdx[c].get(), crd[c]);
@@ -811,7 +892,7 @@ NVC0LoweringPass::handleManualTXD(TexInstruction *i)
        // texture
        bld.insert(tex = cloneForward(func, i));
        for (c = 0; c < dim; ++c)
-         tex->setSrc(c, crd[c]);
+         tex->setSrc(c + array, crd[c]);
        // save results
        for (c = 0; i->defExists(c); ++c) {
           Instruction *mov;
@@ -837,19 +918,38 @@ bool
  NVC0LoweringPass::handleTXD(TexInstruction *txd)
  {
     int dim = txd->tex.target.getDim();
-   int arg = txd->tex.target.getArgCount();
+   unsigned arg = txd->tex.target.getArgCount();
+   unsigned expected_args = arg;
+   const int chipset = prog->getTarget()->getChipset();
+
+   if (chipset >= NVISA_GK104_CHIPSET) {
+      if (!txd->tex.target.isArray() && txd->tex.useOffsets)
+         expected_args++;
+      if (txd->tex.rIndirectSrc >= 0 || txd->tex.sIndirectSrc >= 0)
+         expected_args++;
+   } else {
+      if (txd->tex.useOffsets)
+         expected_args++;
+      if (!txd->tex.target.isArray() && (
+                txd->tex.rIndirectSrc >= 0 || txd->tex.sIndirectSrc >= 0))
+         expected_args++;
+   }
+
+   if (expected_args > 4 ||
+       dim > 2 ||
+       txd->tex.target.isShadow() ||
+       txd->tex.target.isCube())
+      txd->op = OP_TEX;
  
     handleTEX(txd);
     while (txd->srcExists(arg))
        ++arg;
  
     txd->tex.derivAll = true;
-   if (dim > 2 ||
-       txd->tex.target.isCube() ||
-       arg > 4 ||
-       txd->tex.target.isShadow())
+   if (txd->op == OP_TEX)
        return handleManualTXD(txd);
  
+   assert(arg == expected_args);
     for (int c = 0; c < dim; ++c) {
        txd->setSrc(arg + c * 2 + 0, txd->dPdx[c]);
        txd->setSrc(arg + c * 2 + 1, txd->dPdy[c]);
@@ -862,7 +962,46 @@ NVC0LoweringPass::handleTXD(TexInstruction *txd)
  bool
  NVC0LoweringPass::handleTXQ(TexInstruction *txq)
  {
-   // TODO: indirect resource/sampler index
+   const int chipset = prog->getTarget()->getChipset();
+   if (chipset >= NVISA_GK104_CHIPSET && txq->tex.rIndirectSrc < 0)
+      txq->tex.r += prog->driver->io.texBindBase / 4;
+
+   if (txq->tex.rIndirectSrc < 0)
+      return true;
+
+   Value *ticRel = txq->getIndirectR();
+
+   txq->setIndirectS(NULL);
+   txq->tex.sIndirectSrc = -1;
+
+   assert(ticRel);
+
+   if (chipset < NVISA_GK104_CHIPSET) {
+      LValue *src = new_LValue(func, FILE_GPR); // 0xttxsaaaa
+
+      txq->setSrc(txq->tex.rIndirectSrc, NULL);
+      if (txq->tex.r)
+         ticRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
+                             ticRel, bld.mkImm(txq->tex.r));
+
+      bld.mkOp2(OP_SHL, TYPE_U32, src, ticRel, bld.mkImm(0x17));
+
+      txq->moveSources(0, 1);
+      txq->setSrc(0, src);
+   } else {
+      Value *hnd = loadTexHandle(
+            bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
+                       txq->getIndirectR(), bld.mkImm(2)),
+            txq->tex.r);
+      txq->tex.r = 0xff;
+      txq->tex.s = 0x1f;
+
+      txq->setIndirectR(NULL);
+      txq->moveSources(0, 1);
+      txq->setSrc(0, hnd);
+      txq->tex.rIndirectSrc = 0;
+   }
+
     return true;
  }
  
@@ -1391,21 +1530,33 @@ NVC0LoweringPass::handleRDSV(Instruction *i)
           i->op = OP_MOV;
           i->setSrc(0, bld.mkImm((sv == SV_NTID || sv == SV_NCTAID) ? 1 : 0));
        }
+      if (sv == SV_VERTEX_COUNT) {
+         bld.setPosition(i, true);
+         bld.mkOp2(OP_EXTBF, TYPE_U32, i->getDef(0), i->getDef(0), bld.mkImm(0x808));
+      }
        return true;
     }
  
     switch (sv) {
     case SV_POSITION:
        assert(prog->getType() == Program::TYPE_FRAGMENT);
-      bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);
+      if (i->srcExists(1)) {
+         // Pass offset through to the interpolation logic
+         ld = bld.mkInterp(NV50_IR_INTERP_LINEAR | NV50_IR_INTERP_OFFSET,
+                           i->getDef(0), addr, NULL);
+         ld->setSrc(1, i->getSrc(1));
+      } else {
+         bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);
+      }
        break;
     case SV_FACE:
     {
        Value *face = i->getDef(0);
        bld.mkInterp(NV50_IR_INTERP_FLAT, face, addr, NULL);
        if (i->dType == TYPE_F32) {
-         bld.mkOp2(OP_AND, TYPE_U32, face, face, bld.mkImm(0x80000000));
-         bld.mkOp2(OP_XOR, TYPE_U32, face, face, bld.mkImm(0xbf800000));
+         bld.mkOp2(OP_OR, TYPE_U32, face, face, bld.mkImm(0x00000001));
+         bld.mkOp1(OP_NEG, TYPE_S32, face, face);
+         bld.mkCvt(OP_CVT, TYPE_F32, face, TYPE_S32, face);
        }
     }
        break;
@@ -1452,7 +1603,7 @@ NVC0LoweringPass::handleRDSV(Instruction *i)
        ld->subOp = NV50_IR_SUBOP_PIXLD_COVMASK;
        break;
     default:
-      if (prog->getType() == Program::TYPE_TESSELLATION_EVAL)
+      if (prog->getType() == Program::TYPE_TESSELLATION_EVAL && !i->perPatch)
           vtx = bld.mkOp1v(OP_PFETCH, TYPE_U32, bld.getSSA(), bld.mkImm(0));
        ld = bld.mkFetch(i->getDef(0), i->dType,
                         FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), vtx);
@@ -1469,7 +1620,7 @@ NVC0LoweringPass::handleDIV(Instruction *i)
     if (!isFloatType(i->dType))
        return true;
     bld.setPosition(i, false);
-   Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(), i->getSrc(1));
+   Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(typeSizeof(i->dType)), i->getSrc(1));
     i->op = OP_MUL;
     i->setSrc(1, rcp->getDef(0));
     return true;
@@ -1478,13 +1629,13 @@ NVC0LoweringPass::handleDIV(Instruction *i)
  bool
  NVC0LoweringPass::handleMOD(Instruction *i)
  {
-   if (i->dType != TYPE_F32)
+   if (!isFloatType(i->dType))
        return true;
-   LValue *value = bld.getScratch();
-   bld.mkOp1(OP_RCP, TYPE_F32, value, i->getSrc(1));
-   bld.mkOp2(OP_MUL, TYPE_F32, value, i->getSrc(0), value);
-   bld.mkOp1(OP_TRUNC, TYPE_F32, value, value);
-   bld.mkOp2(OP_MUL, TYPE_F32, value, i->getSrc(1), value);
+   LValue *value = bld.getScratch(typeSizeof(i->dType));
+   bld.mkOp1(OP_RCP, i->dType, value, i->getSrc(1));
+   bld.mkOp2(OP_MUL, i->dType, value, i->getSrc(0), value);
+   bld.mkOp1(OP_TRUNC, i->dType, value, value);
+   bld.mkOp2(OP_MUL, i->dType, value, i->getSrc(1), value);
     i->op = OP_SUB;
     i->setSrc(1, value);
     return true;
@@ -1493,10 +1644,22 @@ NVC0LoweringPass::handleMOD(Instruction *i)
  bool
  NVC0LoweringPass::handleSQRT(Instruction *i)
  {
-   Instruction *rsq = bld.mkOp1(OP_RSQ, TYPE_F32,
-                                bld.getSSA(), i->getSrc(0));
+   Value *pred = bld.getSSA(1, FILE_PREDICATE);
+   Value *zero = bld.getSSA();
+   Instruction *rsq;
+
+   bld.mkOp1(OP_MOV, TYPE_U32, zero, bld.mkImm(0));
+   if (i->dType == TYPE_F64)
+      zero = bld.mkOp2v(OP_MERGE, TYPE_U64, bld.getSSA(8), zero, zero);
+   bld.mkCmp(OP_SET, CC_LE, i->dType, pred, i->dType, i->getSrc(0), zero);
+   bld.mkOp1(OP_MOV, i->dType, i->getDef(0), zero)->setPredicate(CC_P, pred);
+   rsq = bld.mkOp1(OP_RSQ, i->dType,
+                   bld.getSSA(typeSizeof(i->dType)), i->getSrc(0));
+   rsq->setPredicate(CC_NOT_P, pred);
     i->op = OP_MUL;
     i->setSrc(1, rsq->getDef(0));
+   i->setPredicate(CC_NOT_P, pred);
+
  
     return true;
  }
@@ -1543,14 +1706,21 @@ NVC0LoweringPass::handleEXPORT(Instruction *i)
  bool
  NVC0LoweringPass::handleOUT(Instruction *i)
  {
-   if (i->op == OP_RESTART && i->prev && i->prev->op == OP_EMIT) {
+   Instruction *prev = i->prev;
+   ImmediateValue stream, prevStream;
+
+   // Only merge if the stream ids match. Also, note that the previous
+   // instruction would have already been lowered, so we take arg1 from it.
+   if (i->op == OP_RESTART && prev && prev->op == OP_EMIT &&
+       i->src(0).getImmediate(stream) &&
+       prev->src(1).getImmediate(prevStream) &&
+       stream.reg.data.u32 == prevStream.reg.data.u32) {
        i->prev->subOp = NV50_IR_SUBOP_EMIT_RESTART;
        delete_Instruction(prog, i);
     } else {
        assert(gpEmitAddress);
        i->setDef(0, gpEmitAddress);
-      if (i->srcExists(0))
-         i->setSrc(1, i->getSrc(0));
+      i->setSrc(1, i->getSrc(0));
        i->setSrc(0, gpEmitAddress);
     }
     return true;
@@ -1584,6 +1754,7 @@ NVC0LoweringPass::checkPredicate(Instruction *insn)
  bool
  NVC0LoweringPass::visit(Instruction *i)
  {
+   bool ret = true;
     bld.setPosition(i, false);
  
     if (i->cc != CC_ALWAYS)
@@ -1615,7 +1786,8 @@ NVC0LoweringPass::visit(Instruction *i)
     case OP_SQRT:
        return handleSQRT(i);
     case OP_EXPORT:
-      return handleEXPORT(i);
+      ret = handleEXPORT(i);
+      break;
     case OP_EMIT:
     case OP_RESTART:
        return handleOUT(i);
@@ -1635,10 +1807,28 @@ NVC0LoweringPass::visit(Instruction *i)
              Value *ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
                                      i->getIndirect(0, 0), bld.mkImm(4));
              i->setIndirect(0, 0, ptr);
+            i->op = OP_VFETCH;
           } else {
              i->op = OP_VFETCH;
              assert(prog->getType() != Program::TYPE_FRAGMENT); // INTERP
           }
+      } else if (i->src(0).getFile() == FILE_MEMORY_CONST) {
+         if (i->src(0).isIndirect(1)) {
+            Value *ptr;
+            if (i->src(0).isIndirect(0))
+               ptr = bld.mkOp3v(OP_INSBF, TYPE_U32, bld.getSSA(),
+                                i->getIndirect(0, 1), bld.mkImm(0x1010),
+                                i->getIndirect(0, 0));
+            else
+               ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
+                                i->getIndirect(0, 1), bld.mkImm(16));
+            i->setIndirect(0, 1, NULL);
+            i->setIndirect(0, 0, ptr);
+            i->subOp = NV50_IR_SUBOP_LDC_IS;
+         }
+      } else if (i->src(0).getFile() == FILE_SHADER_OUTPUT) {
+         assert(prog->getType() == Program::TYPE_TESSELLATION_CONTROL);
+         i->op = OP_VFETCH;
        }
        break;
     case OP_ATOM:
@@ -1660,7 +1850,20 @@ NVC0LoweringPass::visit(Instruction *i)
     default:
        break;
     }
-   return true;
+
+   /* Kepler+ has a special opcode to compute a new base address to be used
+    * for indirect loads.
+    */
+   if (targ->getChipset() >= NVISA_GK104_CHIPSET && !i->perPatch &&
+       (i->op == OP_VFETCH || i->op == OP_EXPORT) && i->src(0).isIndirect(0)) {
+      Instruction *afetch = bld.mkOp1(OP_AFETCH, TYPE_U32, bld.getSSA(),
+                                      cloneShallow(func, i->getSrc(0)));
+      afetch->setIndirect(0, 0, i->getIndirect(0, 0));
+      i->src(0).get()->reg.data.offset = 0;
+      i->setIndirect(0, 0, afetch->getDef(0));
+   }
+
+   return ret;
  }
  
  bool