#include "codegen/nv50_ir_build_util.h"
#include "codegen/nv50_ir_target_nvc0.h"
+#include "codegen/nv50_ir_lowering_nvc0.h"
#include <limits>
((QOP_##q << 6) | (QOP_##r << 4) | \
(QOP_##s << 2) | (QOP_##t << 0))
-class NVC0LegalizeSSA : public Pass
-{
-private:
- virtual bool visit(BasicBlock *);
- virtual bool visit(Function *);
-
- // we want to insert calls to the builtin library only after optimization
- void handleDIV(Instruction *); // integer division, modulus
- void handleRCPRSQ(Instruction *); // double precision float recip/rsqrt
-
-private:
- BuildUtil bld;
-};
-
void
NVC0LegalizeSSA::handleDIV(Instruction *i)
{
void
NVC0LegalizeSSA::handleRCPRSQ(Instruction *i)
{
- // TODO
+ assert(i->dType == TYPE_F64);
+ // Lower an F64 RCP/RSQ: there are instructions that will compute the high 32
+ // bits of the 64-bit float result. We will just stick 0 in the bottom 32 bits.
+
+ bld.setPosition(i, false);
+
+ // 1. Take the source and split it up into its low (src[0]) and high (src[1]) halves.
+ Value *src[2], *dst[2], *def = i->getDef(0);
+ bld.mkSplit(src, 4, i->getSrc(0));
+
+ // 2. We don't care about the low 32 bits of the destination. Stick a 0 in.
+ dst[0] = bld.loadImm(NULL, 0);
+ dst[1] = bld.getSSA();
+
+ // 3. The new version of the instruction takes the high 32 bits of the
+ // source and outputs the high 32 bits of the destination.
+ i->setSrc(0, src[1]);
+ i->setDef(0, dst[1]);
+ i->setType(TYPE_F32); // the rewritten op now works on 32-bit values
+ i->subOp = NV50_IR_SUBOP_RCPRSQ_64H; // presumably selects the "64-bit high half" variant, per the subop name
+
+ // 4. Recombine the two dst pieces back into the original destination.
+ bld.setPosition(i, true);
+ bld.mkOp2(OP_MERGE, TYPE_U64, def, dst[0], dst[1]);
+}
+
+void
+NVC0LegalizeSSA::handleFTZ(Instruction *i)
+{
+ // Sets the ftz flag on eligible instructions. Only want to flush float inputs, so the caller must pass an F32-source instruction.
+ assert(i->sType == TYPE_F32);
+
+ // If we're already flushing denorms (and NaNs) to zero, no need for this.
+ if (i->dnz)
+ return;
+
+ // Only certain classes of operations can flush: arithmetic, compare and convert.
+ OpClass cls = prog->getTarget()->getOpClass(i->op);
+ if (cls != OPCLASS_ARITH && cls != OPCLASS_COMPARE &&
+ cls != OPCLASS_CONVERT)
+ return;
+
+ i->ftz = true; // request flushing for this instruction
}
bool
Instruction *next;
for (Instruction *i = bb->getEntry(); i; i = next) {
next = i->next;
- if (i->dType == TYPE_F32)
+ if (i->sType == TYPE_F32) {
+ if (prog->getType() != Program::TYPE_COMPUTE)
+ handleFTZ(i);
continue;
+ }
switch (i->op) {
case OP_DIV:
case OP_MOD:
return true;
}
-class NVC0LegalizePostRA : public Pass
-{
-public:
- NVC0LegalizePostRA(const Program *);
-
-private:
- virtual bool visit(Function *);
- virtual bool visit(BasicBlock *);
-
- void replaceZero(Instruction *);
- bool tryReplaceContWithBra(BasicBlock *);
- void propagateJoin(BasicBlock *);
-
- struct TexUse
- {
- TexUse(Instruction *use, const Instruction *tex)
- : insn(use), tex(tex), level(-1) { }
- Instruction *insn;
- const Instruction *tex; // or split / mov
- int level;
- };
- struct Limits
- {
- Limits() { }
- Limits(int min, int max) : min(min), max(max) { }
- int min, max;
- };
- bool insertTextureBarriers(Function *);
- inline bool insnDominatedBy(const Instruction *, const Instruction *) const;
- void findFirstUses(const Instruction *tex, const Instruction *def,
- std::list<TexUse>&);
- void findOverwritingDefs(const Instruction *tex, Instruction *insn,
- const BasicBlock *term,
- std::list<TexUse>&);
- void addTexUse(std::list<TexUse>&, Instruction *, const Instruction *);
- const Instruction *recurseDef(const Instruction *);
-
-private:
- LValue *rZero;
- LValue *carry;
- const bool needTexBar;
-};
-
NVC0LegalizePostRA::NVC0LegalizePostRA(const Program *prog)
: rZero(NULL),
carry(NULL),
void
NVC0LegalizePostRA::addTexUse(std::list<TexUse> &uses,
- Instruction *usei, const Instruction *insn)
+ Instruction *usei, const Instruction *texi)
{
bool add = true;
for (std::list<TexUse>::iterator it = uses.begin();
++it;
}
if (add)
- uses.push_back(TexUse(usei, insn));
+ uses.push_back(TexUse(usei, texi));
}
void
while (insn->op == OP_MOV && insn->getDef(0)->equals(insn->getSrc(0)))
insn = insn->getSrc(0)->getUniqueInsn();
- if (!insn || !insn->bb->reachableBy(texi->bb, term))
+ // NOTE: the tex itself is, of course, not an overwriting definition
+ if (insn == texi || !insn->bb->reachableBy(texi->bb, term))
return;
switch (insn->op) {
}
void
-NVC0LegalizePostRA::findFirstUses(const Instruction *texi,
- const Instruction *insn,
- std::list<TexUse> &uses)
+NVC0LegalizePostRA::findFirstUses(
+ const Instruction *texi,
+ const Instruction *insn,
+ std::list<TexUse> &uses,
+ std::tr1::unordered_set<const Instruction *>& visited)
{
for (int d = 0; insn->defExists(d); ++d) {
Value *v = insn->getDef(d);
for (Value::UseIterator u = v->uses.begin(); u != v->uses.end(); ++u) {
Instruction *usei = (*u)->getInsn();
+ // NOTE: In case of a loop that overwrites a value but never uses
+ // it, it can happen that we have a cycle of uses that consists only
+ // of phis and no-op moves and will thus cause an infinite loop here
+ // since these are not considered actual uses.
+ // The most obvious (and perhaps the only) way to prevent this is to
+ // remember which instructions we've already visited.
+
+ if (visited.find(usei) != visited.end())
+ continue;
+
+ visited.insert(usei);
+
if (usei->op == OP_PHI || usei->op == OP_UNION) {
- // need a barrier before WAW cases
+ // need a barrier before WAW cases, like:
+ // %r0 = tex
+ // if ...
+ // texbar <- is required or tex might replace x again
+ // %r1 = x <- overwriting def
+ // %r2 = phi %r0, %r1
for (int s = 0; usei->srcExists(s); ++s) {
Instruction *defi = usei->getSrc(s)->getUniqueInsn();
if (defi && &usei->src(s) != *u)
usei->op == OP_PHI ||
usei->op == OP_UNION) {
// these uses don't manifest in the machine code
- findFirstUses(texi, usei, uses);
+ findFirstUses(texi, usei, uses, visited);
} else
if (usei->op == OP_MOV && usei->getDef(0)->equals(usei->getSrc(0)) &&
usei->subOp != NV50_IR_SUBOP_MOV_FINAL) {
- findFirstUses(texi, usei, uses);
+ findFirstUses(texi, usei, uses, visited);
} else {
- addTexUse(uses, usei, insn);
+ addTexUse(uses, usei, texi);
}
}
}
uses = new std::list<TexUse>[texes.size()];
if (!uses)
return false;
- for (size_t i = 0; i < texes.size(); ++i)
- findFirstUses(texes[i], texes[i], uses[i]);
+ for (size_t i = 0; i < texes.size(); ++i) {
+ std::tr1::unordered_set<const Instruction *> visited;
+ findFirstUses(texes[i], texes[i], uses[i], visited);
+ }
// determine the barrier level at each use
for (size_t i = 0; i < texes.size(); ++i) {
}
}
delete[] uses;
- uses = NULL;
// insert the barriers
for (size_t i = 0; i < useVec.size(); ++i) {
}
}
- if (fn->getProgram()->optLevel < 3) {
- if (uses)
- delete[] uses;
+ if (fn->getProgram()->optLevel < 3)
return true;
- }
std::vector<Limits> limitT, limitB, limitS; // entry, exit, single
prev = i;
}
}
- if (uses)
- delete[] uses;
return true;
}
i->setDef(0, NULL);
if (i->src(0).getFile() == FILE_IMMEDIATE)
i->setSrc(0, rZero); // initial value must be 0
+ replaceZero(i);
} else
if (i->isNop()) {
bb->remove(i);
return true;
}
-class NVC0LoweringPass : public Pass
-{
-public:
- NVC0LoweringPass(Program *);
-
-private:
- virtual bool visit(Function *);
- virtual bool visit(BasicBlock *);
- virtual bool visit(Instruction *);
-
- bool handleRDSV(Instruction *);
- bool handleWRSV(Instruction *);
- bool handleEXPORT(Instruction *);
- bool handleOUT(Instruction *);
- bool handleDIV(Instruction *);
- bool handleMOD(Instruction *);
- bool handleSQRT(Instruction *);
- bool handlePOW(Instruction *);
- bool handleTEX(TexInstruction *);
- bool handleTXD(TexInstruction *);
- bool handleTXQ(TexInstruction *);
- bool handleManualTXD(TexInstruction *);
- bool handleTXLQ(TexInstruction *);
- bool handleATOM(Instruction *);
- bool handleCasExch(Instruction *, bool needCctl);
- void handleSurfaceOpNVE4(TexInstruction *);
-
- void checkPredicate(Instruction *);
-
- void readTessCoord(LValue *dst, int c);
-
- Value *loadResInfo32(Value *ptr, uint32_t off);
- Value *loadMsInfo32(Value *ptr, uint32_t off);
- Value *loadTexHandle(Value *ptr, unsigned int slot);
-
- void adjustCoordinatesMS(TexInstruction *);
- void processSurfaceCoordsNVE4(TexInstruction *);
-
-private:
- const Target *const targ;
-
- BuildUtil bld;
-
- Symbol *gMemBase;
- LValue *gpEmitAddress;
-};
-
NVC0LoweringPass::NVC0LoweringPass(Program *prog) : targ(prog->getTarget())
{
bld.setProgram(prog);
const int lyr = arg - (i->tex.target.isMS() ? 2 : 1);
const int chipset = prog->getTarget()->getChipset();
+ // Arguments to the TEX instruction are a little insane. Even though the
+ // encoding is identical between SM20 and SM30, the arguments mean
+ // different things between Fermi and Kepler+. A lot of arguments are
+ // optional based on flags passed to the instruction. This summarizes the
+ // order of things.
+ //
+ // Fermi:
+ // array/indirect
+ // coords
+ // sample
+ // lod bias
+ // depth compare
+ // offsets:
+ // - tg4: 8 bits each, either 2 (1 offset reg) or 8 (2 offset reg)
+ // - other: 4 bits each, single reg
+ //
+ // Kepler+:
+ // indirect handle
+ // array (+ offsets for txd in upper 16 bits)
+ // coords
+ // sample
+ // lod bias
+ // depth compare
+ // offsets (same as Fermi, except txd which takes them with array)
+ //
+ // Maxwell (tex):
+ // array
+ // coords
+ // indirect handle
+ // sample
+ // lod bias
+ // depth compare
+ // offsets
+ //
+ // Maxwell (txd):
+ // indirect handle
+ // coords
+ // array + offsets
+ // derivatives
+
if (chipset >= NVISA_GK104_CHIPSET) {
if (i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
- WARN("indirect TEX not implemented\n");
- }
- if (i->tex.r == i->tex.s) {
+ // XXX this ignores tsc, and assumes a 1:1 mapping
+ assert(i->tex.rIndirectSrc >= 0);
+ Value *hnd = loadTexHandle(
+ bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
+ i->getIndirectR(), bld.mkImm(2)),
+ i->tex.r);
+ i->tex.r = 0xff;
+ i->tex.s = 0x1f;
+ i->setIndirectR(hnd);
+ i->setIndirectS(NULL);
+ } else if (i->tex.r == i->tex.s) {
i->tex.r += prog->driver->io.texBindBase / 4;
i->tex.s = 0; // only a single cX[] value possible here
} else {
const int sat = (i->op == OP_TXF) ? 1 : 0;
DataType sTy = (i->op == OP_TXF) ? TYPE_U32 : TYPE_F32;
bld.mkCvt(OP_CVT, TYPE_U16, layer, sTy, src)->saturate = sat;
- for (int s = dim; s >= 1; --s)
- i->setSrc(s, i->getSrc(s - 1));
- i->setSrc(0, layer);
+ if (i->op != OP_TXD || chipset < NVISA_GM107_CHIPSET) {
+ for (int s = dim; s >= 1; --s)
+ i->setSrc(s, i->getSrc(s - 1));
+ i->setSrc(0, layer);
+ } else {
+ i->setSrc(dim, layer);
+ }
+ }
+ // Move the indirect reference to the first place
+ if (i->tex.rIndirectSrc >= 0 && (
+ i->op == OP_TXD || chipset < NVISA_GM107_CHIPSET)) {
+ Value *hnd = i->getIndirectR();
+
+ i->setIndirectR(NULL);
+ i->moveSources(0, 1);
+ i->setSrc(0, hnd);
+ i->tex.rIndirectSrc = 0;
+ i->tex.sIndirectSrc = -1;
}
} else
// (nvc0) generate and move the tsc/tic/array source to the front
if (i->tex.target.isArray() || i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
LValue *src = new_LValue(func, FILE_GPR); // 0xttxsaaaa
+ Value *ticRel = i->getIndirectR();
+ Value *tscRel = i->getIndirectS();
+
+ if (ticRel) {
+ i->setSrc(i->tex.rIndirectSrc, NULL);
+ if (i->tex.r)
+ ticRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
+ ticRel, bld.mkImm(i->tex.r));
+ }
+ if (tscRel) {
+ i->setSrc(i->tex.sIndirectSrc, NULL);
+ if (i->tex.s)
+ tscRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
+ tscRel, bld.mkImm(i->tex.s));
+ }
+
Value *arrayIndex = i->tex.target.isArray() ? i->getSrc(lyr) : NULL;
for (int s = dim; s >= 1; --s)
i->setSrc(s, i->getSrc(s - 1));
i->setSrc(0, arrayIndex);
- Value *ticRel = i->getIndirectR();
- Value *tscRel = i->getIndirectS();
-
if (arrayIndex) {
int sat = (i->op == OP_TXF) ? 1 : 0;
DataType sTy = (i->op == OP_TXF) ? TYPE_U32 : TYPE_F32;
bld.loadImm(src, 0);
}
- if (ticRel) {
- i->setSrc(i->tex.rIndirectSrc, NULL);
+ if (ticRel)
bld.mkOp3(OP_INSBF, TYPE_U32, src, ticRel, bld.mkImm(0x0917), src);
- }
- if (tscRel) {
- i->setSrc(i->tex.sIndirectSrc, NULL);
+ if (tscRel)
bld.mkOp3(OP_INSBF, TYPE_U32, src, tscRel, bld.mkImm(0x0710), src);
- }
i->setSrc(0, src);
}
assert(chipset >= NVISA_GK104_CHIPSET ||
!i->tex.useOffsets || !i->tex.target.isMS());
- // offset is last source (lod 1st, dc 2nd)
+ // offset is between lod and dc
if (i->tex.useOffsets) {
int n, c;
int s = i->srcCount(0xff, true);
- if (i->srcExists(s)) // move potential predicate out of the way
- i->moveSources(s, 1);
- if (i->tex.useOffsets == 4 && i->srcExists(s + 1))
- i->moveSources(s + 1, 1);
+ if (i->op != OP_TXD || chipset < NVISA_GK104_CHIPSET) {
+ if (i->tex.target.isShadow())
+ s--;
+ if (i->srcExists(s)) // move potential predicate out of the way
+ i->moveSources(s, 1);
+ if (i->tex.useOffsets == 4 && i->srcExists(s + 1))
+ i->moveSources(s + 1, 1);
+ }
if (i->op == OP_TXG) {
// Either there is 1 offset, which goes into the 2 low bytes of the
// first source, or there are 4 offsets, which go into 2 sources (8
assert(i->tex.useOffsets == 1);
for (c = 0; c < 3; ++c) {
ImmediateValue val;
- assert(i->offset[0][c].getImmediate(val));
+ if (!i->offset[0][c].getImmediate(val))
+ assert(!"non-immediate offset passed to non-TXG");
imm |= (val.reg.data.u32 & 0xf) << (c * 4);
}
- i->setSrc(s, bld.loadImm(NULL, imm));
+ if (i->op == OP_TXD && chipset >= NVISA_GK104_CHIPSET) {
+ // The offset goes into the upper 16 bits of the array index. So
+ // create it if it's not already there, and INSBF it if it already
+ // is.
+ s = (i->tex.rIndirectSrc >= 0) ? 1 : 0;
+ if (chipset >= NVISA_GM107_CHIPSET)
+ s += dim;
+ if (i->tex.target.isArray()) {
+ bld.mkOp3(OP_INSBF, TYPE_U32, i->getSrc(s),
+ bld.loadImm(NULL, imm), bld.mkImm(0xc10),
+ i->getSrc(s));
+ } else {
+ i->moveSources(s, 1);
+ i->setSrc(s, bld.loadImm(NULL, imm << 16));
+ }
+ } else {
+ i->setSrc(s, bld.loadImm(NULL, imm));
+ }
}
}
Value *zero = bld.loadImm(bld.getSSA(), 0);
int l, c;
const int dim = i->tex.target.getDim();
+ const int array = i->tex.target.isArray();
i->op = OP_TEX; // no need to clone dPdx/dPdy later
for (l = 0; l < 4; ++l) {
// mov coordinates from lane l to all lanes
for (c = 0; c < dim; ++c)
- bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero);
+ bld.mkQuadop(0x00, crd[c], l, i->getSrc(c + array), zero);
// add dPdx from lane l to lanes dx
for (c = 0; c < dim; ++c)
bld.mkQuadop(qOps[l][0], crd[c], l, i->dPdx[c].get(), crd[c]);
// texture
bld.insert(tex = cloneForward(func, i));
for (c = 0; c < dim; ++c)
- tex->setSrc(c, crd[c]);
+ tex->setSrc(c + array, crd[c]);
// save results
for (c = 0; i->defExists(c); ++c) {
Instruction *mov;
NVC0LoweringPass::handleTXD(TexInstruction *txd)
{
int dim = txd->tex.target.getDim();
- int arg = txd->tex.target.getArgCount();
+ unsigned arg = txd->tex.target.getArgCount();
+ unsigned expected_args = arg;
+ const int chipset = prog->getTarget()->getChipset();
+
+ if (chipset >= NVISA_GK104_CHIPSET) {
+ if (!txd->tex.target.isArray() && txd->tex.useOffsets)
+ expected_args++;
+ if (txd->tex.rIndirectSrc >= 0 || txd->tex.sIndirectSrc >= 0)
+ expected_args++;
+ } else {
+ if (txd->tex.useOffsets)
+ expected_args++;
+ if (!txd->tex.target.isArray() && (
+ txd->tex.rIndirectSrc >= 0 || txd->tex.sIndirectSrc >= 0))
+ expected_args++;
+ }
+
+ if (expected_args > 4 ||
+ dim > 2 ||
+ txd->tex.target.isShadow() ||
+ txd->tex.target.isCube())
+ txd->op = OP_TEX;
handleTEX(txd);
while (txd->srcExists(arg))
++arg;
txd->tex.derivAll = true;
- if (dim > 2 ||
- txd->tex.target.isCube() ||
- arg > 4 ||
- txd->tex.target.isShadow())
+ if (txd->op == OP_TEX)
return handleManualTXD(txd);
+ assert(arg == expected_args);
for (int c = 0; c < dim; ++c) {
txd->setSrc(arg + c * 2 + 0, txd->dPdx[c]);
txd->setSrc(arg + c * 2 + 1, txd->dPdy[c]);
bool
NVC0LoweringPass::handleTXQ(TexInstruction *txq)
{
- // TODO: indirect resource/sampler index
+ if (txq->tex.rIndirectSrc < 0) // no indirect resource index: nothing to lower
+ return true;
+
+ Value *ticRel = txq->getIndirectR();
+ const int chipset = prog->getTarget()->getChipset();
+
+ txq->setIndirectS(NULL); // the indirect sampler source is dropped unconditionally
+ txq->tex.sIndirectSrc = -1;
+
+ assert(ticRel);
+
+ if (chipset < NVISA_GK104_CHIPSET) { // pre-GK104: pack the index into a GPR source
+ LValue *src = new_LValue(func, FILE_GPR); // 0xttxsaaaa
+
+ txq->setSrc(txq->tex.rIndirectSrc, NULL);
+ if (txq->tex.r) // fold the static resource index into the indirect one
+ ticRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
+ ticRel, bld.mkImm(txq->tex.r));
+
+ bld.mkOp2(OP_SHL, TYPE_U32, src, ticRel, bld.mkImm(0x17)); // shift the index left by 0x17 (23) bits
+
+ txq->moveSources(0, 1); // presumably shifts sources up to free slot 0 for the packed value
+ txq->setSrc(0, src);
+ } else { // GK104 and later: go through a loaded texture handle instead
+ Value *hnd = loadTexHandle(
+ bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
+ txq->getIndirectR(), bld.mkImm(2)), // handle pointer argument = indirect index << 2
+ txq->tex.r);
+ txq->tex.r = 0xff; // NOTE(review): 0xff / 0x1f look like "use the handle" sentinels; confirm
+ txq->tex.s = 0x1f;
+
+ if (chipset < NVISA_GM107_CHIPSET) { // before GM107 the handle becomes source 0
+ txq->setIndirectR(NULL);
+ txq->moveSources(0, 1);
+ txq->setSrc(0, hnd);
+ txq->tex.rIndirectSrc = 0;
+ } else { // GM107 and later: the handle stays in the indirect-R slot
+ txq->setIndirectR(hnd);
+ }
+ }
+
return true;
}
switch (sv) {
case SV_POSITION:
assert(prog->getType() == Program::TYPE_FRAGMENT);
- bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);
+ if (i->srcExists(1)) {
+ // Pass offset through to the interpolation logic
+ ld = bld.mkInterp(NV50_IR_INTERP_LINEAR | NV50_IR_INTERP_OFFSET,
+ i->getDef(0), addr, NULL);
+ ld->setSrc(1, i->getSrc(1));
+ } else {
+ bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);
+ }
break;
case SV_FACE:
{
Value *face = i->getDef(0);
bld.mkInterp(NV50_IR_INTERP_FLAT, face, addr, NULL);
if (i->dType == TYPE_F32) {
- bld.mkOp2(OP_AND, TYPE_U32, face, face, bld.mkImm(0x80000000));
- bld.mkOp2(OP_XOR, TYPE_U32, face, face, bld.mkImm(0xbf800000));
+ bld.mkOp2(OP_OR, TYPE_U32, face, face, bld.mkImm(0x00000001));
+ bld.mkOp1(OP_NEG, TYPE_S32, face, face);
+ bld.mkCvt(OP_CVT, TYPE_F32, face, TYPE_S32, face);
}
}
break;
if (!isFloatType(i->dType))
return true;
bld.setPosition(i, false);
- Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(), i->getSrc(1));
+ Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(typeSizeof(i->dType)), i->getSrc(1));
i->op = OP_MUL;
i->setSrc(1, rcp->getDef(0));
return true;
bool
NVC0LoweringPass::handleMOD(Instruction *i)
{
- if (i->dType != TYPE_F32)
+ if (!isFloatType(i->dType)) // only float MOD is lowered here; other types are left untouched
return true;
- LValue *value = bld.getScratch();
- bld.mkOp1(OP_RCP, TYPE_F32, value, i->getSrc(1));
- bld.mkOp2(OP_MUL, TYPE_F32, value, i->getSrc(0), value);
- bld.mkOp1(OP_TRUNC, TYPE_F32, value, value);
- bld.mkOp2(OP_MUL, TYPE_F32, value, i->getSrc(1), value);
+ LValue *value = bld.getScratch(typeSizeof(i->dType)); // scratch sized for the result type; the steps below build src1 * trunc(src0 / src1)
+ bld.mkOp1(OP_RCP, i->dType, value, i->getSrc(1)); // value = 1 / src1
+ bld.mkOp2(OP_MUL, i->dType, value, i->getSrc(0), value); // value = src0 * (1 / src1)
+ bld.mkOp1(OP_TRUNC, i->dType, value, value); // value = trunc(src0 / src1)
+ bld.mkOp2(OP_MUL, i->dType, value, i->getSrc(1), value); // value = src1 * trunc(src0 / src1)
i->op = OP_SUB;
i->setSrc(1, value);
return true;
bool
NVC0LoweringPass::handleSQRT(Instruction *i)
{
- Instruction *rsq = bld.mkOp1(OP_RSQ, TYPE_F32,
- bld.getSSA(), i->getSrc(0));
+ Value *pred = bld.getSSA(1, FILE_PREDICATE); // SQRT becomes src0 * rsq(src0); pred guards the src0 <= 0 case, which yields 0
+ Value *zero = bld.getSSA();
+ Instruction *rsq;
+
+ bld.mkOp1(OP_MOV, TYPE_U32, zero, bld.mkImm(0)); // zero = 32-bit immediate 0
+ if (i->dType == TYPE_F64) // doubles need a 64-bit zero, built by merging two 32-bit zeros
+ zero = bld.mkOp2v(OP_MERGE, TYPE_U64, bld.getSSA(8), zero, zero);
+ bld.mkCmp(OP_SET, CC_LE, i->dType, pred, i->dType, i->getSrc(0), zero); // pred = (src0 <= 0)
+ bld.mkOp1(OP_MOV, i->dType, i->getDef(0), zero)->setPredicate(CC_P, pred); // if pred: result = 0
+ rsq = bld.mkOp1(OP_RSQ, i->dType,
+ bld.getSSA(typeSizeof(i->dType)), i->getSrc(0));
+ rsq->setPredicate(CC_NOT_P, pred); // the RSQ only executes when pred is false
i->op = OP_MUL;
i->setSrc(1, rsq->getDef(0));
+ i->setPredicate(CC_NOT_P, pred); // likewise the final MUL, so the predicated MOV above supplies the result otherwise
+
return true;
}
bool
NVC0LoweringPass::handleOUT(Instruction *i)
{
- if (i->op == OP_RESTART && i->prev && i->prev->op == OP_EMIT) {
+ Instruction *prev = i->prev;
+ ImmediateValue stream, prevStream;
+
+ // Fold a RESTART into the EMIT just before it. Only merge if the stream ids match. Also, note that the previous
+ // instruction would have already been lowered, so we take arg1 from it.
+ if (i->op == OP_RESTART && prev && prev->op == OP_EMIT &&
+ i->src(0).getImmediate(stream) &&
+ prev->src(1).getImmediate(prevStream) && // lowering moved the EMIT's stream id to src1 (see the else branch)
+ stream.reg.data.u32 == prevStream.reg.data.u32) {
i->prev->subOp = NV50_IR_SUBOP_EMIT_RESTART;
delete_Instruction(prog, i);
} else {
assert(gpEmitAddress);
i->setDef(0, gpEmitAddress);
- if (i->srcExists(0))
- i->setSrc(1, i->getSrc(0));
+ i->setSrc(1, i->getSrc(0)); // original src0 moves to slot 1; slot 0 is taken by gpEmitAddress below
i->setSrc(0, gpEmitAddress);
}
return true;
Value *ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
i->getIndirect(0, 0), bld.mkImm(4));
i->setIndirect(0, 0, ptr);
+ i->op = OP_VFETCH;
} else {
i->op = OP_VFETCH;
assert(prog->getType() != Program::TYPE_FRAGMENT); // INTERP
}
+ } else if (i->src(0).getFile() == FILE_MEMORY_CONST) {
+ if (i->src(0).isIndirect(1)) {
+ Value *ptr;
+ if (i->src(0).isIndirect(0))
+ ptr = bld.mkOp3v(OP_INSBF, TYPE_U32, bld.getSSA(),
+ i->getIndirect(0, 1), bld.mkImm(0x1010),
+ i->getIndirect(0, 0));
+ else
+ ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
+ i->getIndirect(0, 1), bld.mkImm(16));
+ i->setIndirect(0, 1, NULL);
+ i->setIndirect(0, 0, ptr);
+ i->subOp = NV50_IR_SUBOP_LDC_IS;
+ }
}
break;
case OP_ATOM: