gm107/ir: fix indirect txq emission

[mesa.git] / src / gallium / drivers / nouveau / codegen / nv50_ir_lowering_nvc0.cpp
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp

index 92f9a156f7c7b8db02cd3dcda240d0c1a99c89b2..e71fa113d9957b21ccd0954d364db051dc3ebd7b 100644 (file)
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -70,7 +70,49 @@ NVC0LegalizeSSA::handleDIV(Instruction *i)
  void
  NVC0LegalizeSSA::handleRCPRSQ(Instruction *i)
  {
-   // TODO
+   assert(i->dType == TYPE_F64);
+   // There are instructions that will compute the high 32 bits of the 64-bit
+   // float. We will just stick 0 in the bottom 32 bits.
+
+   bld.setPosition(i, false);
+
+   // 1. Take the source and split it up.
+   Value *src[2], *dst[2], *def = i->getDef(0);
+   bld.mkSplit(src, 4, i->getSrc(0));
+
+   // 2. We don't care about the low 32 bits of the destination. Stick a 0 in.
+   dst[0] = bld.loadImm(NULL, 0);
+   dst[1] = bld.getSSA();
+
+   // 3. The new version of the instruction takes the high 32 bits of the
+   // source and outputs the high 32 bits of the destination.
+   i->setSrc(0, src[1]);
+   i->setDef(0, dst[1]);
+   i->setType(TYPE_F32);
+   i->subOp = NV50_IR_SUBOP_RCPRSQ_64H;
+
+   // 4. Recombine the two dst pieces back into the original destination.
+   bld.setPosition(i, true);
+   bld.mkOp2(OP_MERGE, TYPE_U64, def, dst[0], dst[1]);
+}
+
+void
+NVC0LegalizeSSA::handleFTZ(Instruction *i)
+{
+   // Only want to flush float inputs
+   assert(i->sType == TYPE_F32);
+
+   // If we're already flushing denorms (and NaNs) to zero, no need for this.
+   if (i->dnz)
+      return;
+
+   // Only certain classes of operations can flush
+   OpClass cls = prog->getTarget()->getOpClass(i->op);
+   if (cls != OPCLASS_ARITH && cls != OPCLASS_COMPARE &&
+       cls != OPCLASS_CONVERT)
+      return;
+
+   i->ftz = true;
  }
  
  bool
@@ -86,8 +128,11 @@ NVC0LegalizeSSA::visit(BasicBlock *bb)
     Instruction *next;
     for (Instruction *i = bb->getEntry(); i; i = next) {
        next = i->next;
-      if (i->dType == TYPE_F32)
+      if (i->sType == TYPE_F32) {
+         if (prog->getType() != Program::TYPE_COMPUTE)
+            handleFTZ(i);
           continue;
+      }
        switch (i->op) {
        case OP_DIV:
        case OP_MOD:
@@ -123,7 +168,7 @@ NVC0LegalizePostRA::insnDominatedBy(const Instruction *later,
  
  void
  NVC0LegalizePostRA::addTexUse(std::list<TexUse> &uses,
-                              Instruction *usei, const Instruction *insn)
+                              Instruction *usei, const Instruction *texi)
  {
     bool add = true;
     for (std::list<TexUse>::iterator it = uses.begin();
@@ -138,7 +183,7 @@ NVC0LegalizePostRA::addTexUse(std::list<TexUse> &uses,
           ++it;
     }
     if (add)
-      uses.push_back(TexUse(usei, insn));
+      uses.push_back(TexUse(usei, texi));
  }
  
  void
@@ -150,7 +195,8 @@ NVC0LegalizePostRA::findOverwritingDefs(const Instruction *texi,
     while (insn->op == OP_MOV && insn->getDef(0)->equals(insn->getSrc(0)))
        insn = insn->getSrc(0)->getUniqueInsn();
  
-   if (!insn || !insn->bb->reachableBy(texi->bb, term))
+   // NOTE: the tex itself is, of course, not an overwriting definition
+   if (insn == texi || !insn->bb->reachableBy(texi->bb, term))
        return;
  
     switch (insn->op) {
@@ -185,22 +231,25 @@ NVC0LegalizePostRA::findFirstUses(
        for (Value::UseIterator u = v->uses.begin(); u != v->uses.end(); ++u) {
           Instruction *usei = (*u)->getInsn();
  
-         /* XXX HACK ALERT XXX
-          *
-          * This shouldn't have to be here, we should always be making forward
-          * progress by looking at the uses. However this somehow does not
-          * appear to be the case. Probably because this is being done right
-          * after RA, when the defs/uses lists have been messed with by node
-          * merging. This should probably be moved to being done right before
-          * RA. But this will do for now.
-          */
+         // NOTE: In case of a loop that overwrites a value but never uses
+         // it, it can happen that we have a cycle of uses that consists only
+         // of phis and no-op moves and will thus cause an infinite loop here
+         // since these are not considered actual uses.
+         // The most obvious (and perhaps the only) way to prevent this is to
+         // remember which instructions we've already visited.
+
           if (visited.find(usei) != visited.end())
              continue;
  
           visited.insert(usei);
  
           if (usei->op == OP_PHI || usei->op == OP_UNION) {
-            // need a barrier before WAW cases
+            // need a barrier before WAW cases, like:
+            //   %r0 = tex
+            //   if ...
+            //     texbar <- is required or tex might replace x again
+            //     %r1 = x <- overwriting def
+            //   %r2 = phi %r0, %r1
              for (int s = 0; usei->srcExists(s); ++s) {
                 Instruction *defi = usei->getSrc(s)->getUniqueInsn();
                 if (defi && &usei->src(s) != *u)
@@ -219,7 +268,7 @@ NVC0LegalizePostRA::findFirstUses(
               usei->subOp != NV50_IR_SUBOP_MOV_FINAL) {
              findFirstUses(texi, usei, uses, visited);
           } else {
-            addTexUse(uses, usei, insn);
+            addTexUse(uses, usei, texi);
           }
        }
     }
@@ -311,7 +360,6 @@ NVC0LegalizePostRA::insertTextureBarriers(Function *fn)
        }
     }
     delete[] uses;
-   uses = NULL;
  
     // insert the barriers
     for (size_t i = 0; i < useVec.size(); ++i) {
@@ -332,11 +380,8 @@ NVC0LegalizePostRA::insertTextureBarriers(Function *fn)
        }
     }
  
-   if (fn->getProgram()->optLevel < 3) {
-      if (uses)
-         delete[] uses;
+   if (fn->getProgram()->optLevel < 3)
        return true;
-   }
  
     std::vector<Limits> limitT, limitB, limitS; // entry, exit, single
  
@@ -421,8 +466,6 @@ NVC0LegalizePostRA::insertTextureBarriers(Function *fn)
              prev = i;
        }
     }
-   if (uses)
-      delete[] uses;
     return true;
  }
  
@@ -609,6 +652,21 @@ NVC0LoweringPass::handleTEX(TexInstruction *i)
     //  lod bias
     //  depth compare
     //  offsets (same as fermi, except txd which takes it with array)
+   //
+   // Maxwell (tex):
+   //  array
+   //  coords
+   //  indirect handle
+   //  sample
+   //  lod bias
+   //  depth compare
+   //  offsets
+   //
+   // Maxwell (txd):
+   //  indirect handle
+   //  coords
+   //  array + offsets
+   //  derivatives
  
     if (chipset >= NVISA_GK104_CHIPSET) {
        if (i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
@@ -642,12 +700,17 @@ NVC0LoweringPass::handleTEX(TexInstruction *i)
           const int sat = (i->op == OP_TXF) ? 1 : 0;
           DataType sTy = (i->op == OP_TXF) ? TYPE_U32 : TYPE_F32;
           bld.mkCvt(OP_CVT, TYPE_U16, layer, sTy, src)->saturate = sat;
-         for (int s = dim; s >= 1; --s)
-            i->setSrc(s, i->getSrc(s - 1));
-         i->setSrc(0, layer);
+         if (i->op != OP_TXD || chipset < NVISA_GM107_CHIPSET) {
+            for (int s = dim; s >= 1; --s)
+               i->setSrc(s, i->getSrc(s - 1));
+            i->setSrc(0, layer);
+         } else {
+            i->setSrc(dim, layer);
+         }
        }
        // Move the indirect reference to the first place
-      if (i->tex.rIndirectSrc >= 0) {
+      if (i->tex.rIndirectSrc >= 0 && (
+                i->op == OP_TXD || chipset < NVISA_GM107_CHIPSET)) {
           Value *hnd = i->getIndirectR();
  
           i->setIndirectR(NULL);
@@ -742,7 +805,8 @@ NVC0LoweringPass::handleTEX(TexInstruction *i)
           assert(i->tex.useOffsets == 1);
           for (c = 0; c < 3; ++c) {
              ImmediateValue val;
-            assert(i->offset[0][c].getImmediate(val));
+            if (!i->offset[0][c].getImmediate(val))
+               assert(!"non-immediate offset passed to non-TXG");
              imm |= (val.reg.data.u32 & 0xf) << (c * 4);
           }
           if (i->op == OP_TXD && chipset >= NVISA_GK104_CHIPSET) {
@@ -750,8 +814,10 @@ NVC0LoweringPass::handleTEX(TexInstruction *i)
              // create it if it's not already there, and INSBF it if it already
              // is.
              s = (i->tex.rIndirectSrc >= 0) ? 1 : 0;
+            if (chipset >= NVISA_GM107_CHIPSET)
+               s += dim;
              if (i->tex.target.isArray()) {
-               bld.mkOp3(OP_INSBF, TYPE_U32, i->getSrc(0),
+               bld.mkOp3(OP_INSBF, TYPE_U32, i->getSrc(s),
                           bld.loadImm(NULL, imm), bld.mkImm(0xc10),
                           i->getSrc(s));
              } else {
@@ -890,7 +956,47 @@ NVC0LoweringPass::handleTXD(TexInstruction *txd)
  bool
  NVC0LoweringPass::handleTXQ(TexInstruction *txq)
  {
-   // TODO: indirect resource/sampler index
+   if (txq->tex.rIndirectSrc < 0)
+      return true;
+
+   Value *ticRel = txq->getIndirectR();
+   const int chipset = prog->getTarget()->getChipset();
+
+   txq->setIndirectS(NULL);
+   txq->tex.sIndirectSrc = -1;
+
+   assert(ticRel);
+
+   if (chipset < NVISA_GK104_CHIPSET) {
+      LValue *src = new_LValue(func, FILE_GPR); // 0xttxsaaaa
+
+      txq->setSrc(txq->tex.rIndirectSrc, NULL);
+      if (txq->tex.r)
+         ticRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
+                             ticRel, bld.mkImm(txq->tex.r));
+
+      bld.mkOp2(OP_SHL, TYPE_U32, src, ticRel, bld.mkImm(0x17));
+
+      txq->moveSources(0, 1);
+      txq->setSrc(0, src);
+   } else {
+      Value *hnd = loadTexHandle(
+            bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
+                       txq->getIndirectR(), bld.mkImm(2)),
+            txq->tex.r);
+      txq->tex.r = 0xff;
+      txq->tex.s = 0x1f;
+
+      if (chipset < NVISA_GM107_CHIPSET) {
+         txq->setIndirectR(NULL);
+         txq->moveSources(0, 1);
+         txq->setSrc(0, hnd);
+         txq->tex.rIndirectSrc = 0;
+      } else {
+         txq->setIndirectR(hnd);
+      }
+   }
+
     return true;
  }
  
@@ -1439,8 +1545,9 @@ NVC0LoweringPass::handleRDSV(Instruction *i)
        Value *face = i->getDef(0);
        bld.mkInterp(NV50_IR_INTERP_FLAT, face, addr, NULL);
        if (i->dType == TYPE_F32) {
-         bld.mkOp2(OP_AND, TYPE_U32, face, face, bld.mkImm(0x80000000));
-         bld.mkOp2(OP_XOR, TYPE_U32, face, face, bld.mkImm(0xbf800000));
+         bld.mkOp2(OP_OR, TYPE_U32, face, face, bld.mkImm(0x00000001));
+         bld.mkOp1(OP_NEG, TYPE_S32, face, face);
+         bld.mkCvt(OP_CVT, TYPE_F32, face, TYPE_S32, face);
        }
     }
        break;
@@ -1504,7 +1611,7 @@ NVC0LoweringPass::handleDIV(Instruction *i)
     if (!isFloatType(i->dType))
        return true;
     bld.setPosition(i, false);
-   Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(), i->getSrc(1));
+   Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(typeSizeof(i->dType)), i->getSrc(1));
     i->op = OP_MUL;
     i->setSrc(1, rcp->getDef(0));
     return true;
@@ -1513,13 +1620,13 @@ NVC0LoweringPass::handleDIV(Instruction *i)
  bool
  NVC0LoweringPass::handleMOD(Instruction *i)
  {
-   if (i->dType != TYPE_F32)
+   if (!isFloatType(i->dType))
        return true;
-   LValue *value = bld.getScratch();
-   bld.mkOp1(OP_RCP, TYPE_F32, value, i->getSrc(1));
-   bld.mkOp2(OP_MUL, TYPE_F32, value, i->getSrc(0), value);
-   bld.mkOp1(OP_TRUNC, TYPE_F32, value, value);
-   bld.mkOp2(OP_MUL, TYPE_F32, value, i->getSrc(1), value);
+   LValue *value = bld.getScratch(typeSizeof(i->dType));
+   bld.mkOp1(OP_RCP, i->dType, value, i->getSrc(1));
+   bld.mkOp2(OP_MUL, i->dType, value, i->getSrc(0), value);
+   bld.mkOp1(OP_TRUNC, i->dType, value, value);
+   bld.mkOp2(OP_MUL, i->dType, value, i->getSrc(1), value);
     i->op = OP_SUB;
     i->setSrc(1, value);
     return true;
@@ -1528,10 +1635,22 @@ NVC0LoweringPass::handleMOD(Instruction *i)
  bool
  NVC0LoweringPass::handleSQRT(Instruction *i)
  {
-   Instruction *rsq = bld.mkOp1(OP_RSQ, TYPE_F32,
-                                bld.getSSA(), i->getSrc(0));
+   Value *pred = bld.getSSA(1, FILE_PREDICATE);
+   Value *zero = bld.getSSA();
+   Instruction *rsq;
+
+   bld.mkOp1(OP_MOV, TYPE_U32, zero, bld.mkImm(0));
+   if (i->dType == TYPE_F64)
+      zero = bld.mkOp2v(OP_MERGE, TYPE_U64, bld.getSSA(8), zero, zero);
+   bld.mkCmp(OP_SET, CC_LE, i->dType, pred, i->dType, i->getSrc(0), zero);
+   bld.mkOp1(OP_MOV, i->dType, i->getDef(0), zero)->setPredicate(CC_P, pred);
+   rsq = bld.mkOp1(OP_RSQ, i->dType,
+                   bld.getSSA(typeSizeof(i->dType)), i->getSrc(0));
+   rsq->setPredicate(CC_NOT_P, pred);
     i->op = OP_MUL;
     i->setSrc(1, rsq->getDef(0));
+   i->setPredicate(CC_NOT_P, pred);
+
  
     return true;
  }
@@ -1677,6 +1796,7 @@ NVC0LoweringPass::visit(Instruction *i)
              Value *ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
                                      i->getIndirect(0, 0), bld.mkImm(4));
              i->setIndirect(0, 0, ptr);
+            i->op = OP_VFETCH;
           } else {
              i->op = OP_VFETCH;
              assert(prog->getType() != Program::TYPE_FRAGMENT); // INTERP