// ah*bl 00
//
// fffe0001 + fffe0001
+//
+// Note that this sort of splitting doesn't work for signed values, so we
+// compute the sign on those manually and then perform an unsigned multiply.
static bool
expandIntegerMUL(BuildUtil *bld, Instruction *mul)
{
const bool highResult = mul->subOp == NV50_IR_SUBOP_MUL_HIGH;
+ ImmediateValue src1;
+ bool src1imm = mul->src(1).getImmediate(src1);
+
+ DataType fTy; // full type
+ switch (mul->sType) {
+ case TYPE_S32: fTy = TYPE_U32; break;
+ case TYPE_S64: fTy = TYPE_U64; break;
+ default: fTy = mul->sType; break;
+ }
- DataType fTy = mul->sType; // full type
- DataType hTy;
+ DataType hTy; // half type
switch (fTy) {
- case TYPE_S32: hTy = TYPE_S16; break;
case TYPE_U32: hTy = TYPE_U16; break;
case TYPE_U64: hTy = TYPE_U32; break;
- case TYPE_S64: hTy = TYPE_S32; break;
default:
return false;
}
bld->setPosition(mul, true);
+ Value *s[2];
Value *a[2], *b[2];
- Value *c[2];
Value *t[4];
for (int j = 0; j < 4; ++j)
t[j] = bld->getSSA(fullSize);
+ if (isSignedType(mul->sType) && highResult) {
+ s[0] = bld->getSSA(fullSize);
+ s[1] = bld->getSSA(fullSize);
+ bld->mkOp1(OP_ABS, mul->sType, s[0], mul->getSrc(0));
+ bld->mkOp1(OP_ABS, mul->sType, s[1], mul->getSrc(1));
+ src1.reg.data.s32 = abs(src1.reg.data.s32);
+ } else {
+ s[0] = mul->getSrc(0);
+ s[1] = mul->getSrc(1);
+ }
+
// split sources into halves
- i[0] = bld->mkSplit(a, halfSize, mul->getSrc(0));
- i[1] = bld->mkSplit(b, halfSize, mul->getSrc(1));
+ i[0] = bld->mkSplit(a, halfSize, s[0]);
+ i[1] = bld->mkSplit(b, halfSize, s[1]);
- i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0], b[1]);
- i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]);
+ if (src1imm && (src1.reg.data.u32 & 0xffff0000) == 0) {
+ i[2] = i[3] = bld->mkOp2(OP_MUL, fTy, t[1], a[1],
+ bld->mkImm(src1.reg.data.u32 & 0xffff));
+ } else {
+ i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0],
+ src1imm ? bld->mkImm(src1.reg.data.u32 >> 16) : b[1]);
+ if (src1imm && (src1.reg.data.u32 & 0x0000ffff) == 0) {
+ i[3] = i[2];
+ t[1] = t[0];
+ } else {
+ i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]);
+ }
+ }
i[7] = bld->mkOp2(OP_SHL, fTy, t[2], t[1], bld->mkImm(halfSize * 8));
- i[4] = bld->mkOp3(OP_MAD, fTy, t[3], a[0], b[0], t[2]);
+ if (src1imm && (src1.reg.data.u32 & 0x0000ffff) == 0) {
+ i[4] = i[3];
+ t[3] = t[2];
+ } else {
+ i[4] = bld->mkOp3(OP_MAD, fTy, t[3], a[0], b[0], t[2]);
+ }
if (highResult) {
- Value *r[3];
+ Value *c[2];
+ Value *r[5];
Value *imm = bld->loadImm(NULL, 1 << (halfSize * 8));
c[0] = bld->getSSA(1, FILE_FLAGS);
c[1] = bld->getSSA(1, FILE_FLAGS);
- for (int j = 0; j < 3; ++j)
+ for (int j = 0; j < 5; ++j)
r[j] = bld->getSSA(fullSize);
i[8] = bld->mkOp2(OP_SHR, fTy, r[0], t[1], bld->mkImm(halfSize * 8));
i[6] = bld->mkOp2(OP_ADD, fTy, r[1], r[0], imm);
- bld->mkOp2(OP_UNION, TYPE_U32, r[2], r[1], r[0]);
- i[5] = bld->mkOp3(OP_MAD, fTy, mul->getDef(0), a[1], b[1], r[2]);
+ bld->mkMov(r[3], r[0])->setPredicate(CC_NC, c[0]);
+ bld->mkOp2(OP_UNION, TYPE_U32, r[2], r[1], r[3]);
+ i[5] = bld->mkOp3(OP_MAD, fTy, r[4], a[1], b[1], r[2]);
// set carry defs / sources
i[3]->setFlagsDef(1, c[0]);
- i[4]->setFlagsDef(0, c[1]); // actual result not required, just the carry
+ // The actual result is required in the negative case, but ignored for
+ // unsigned. For some reason the compiler ends up dropping the whole
+ // instruction if the destination is unused, even though the flags are used.
+ if (isSignedType(mul->sType))
+ i[4]->setFlagsDef(1, c[1]);
+ else
+ i[4]->setFlagsDef(0, c[1]);
i[6]->setPredicate(CC_C, c[0]);
i[5]->setFlagsSrc(3, c[1]);
+
+ if (isSignedType(mul->sType)) {
+ Value *cc[2];
+ Value *rr[7];
+ Value *one = bld->getSSA(fullSize);
+ bld->loadImm(one, 1);
+ for (int j = 0; j < 7; j++)
+ rr[j] = bld->getSSA(fullSize);
+
+ // NOTE: this logic uses predicates because splitting basic blocks is
+ // ~impossible during the SSA phase. The RA relies on a correlation
+ // between edge order and phi node sources.
+
+ // Set the sign of the result based on the inputs
+ bld->mkOp2(OP_XOR, fTy, NULL, mul->getSrc(0), mul->getSrc(1))
+ ->setFlagsDef(0, (cc[0] = bld->getSSA(1, FILE_FLAGS)));
+
+ // 1s complement of 64-bit value
+ bld->mkOp1(OP_NOT, fTy, rr[0], r[4])
+ ->setPredicate(CC_S, cc[0]);
+ bld->mkOp1(OP_NOT, fTy, rr[1], t[3])
+ ->setPredicate(CC_S, cc[0]);
+
+ // add to low 32-bits, keep track of the carry
+ Instruction *n = bld->mkOp2(OP_ADD, fTy, NULL, rr[1], one);
+ n->setPredicate(CC_S, cc[0]);
+ n->setFlagsDef(0, (cc[1] = bld->getSSA(1, FILE_FLAGS)));
+
+ // If there was a carry, add 1 to the upper 32 bits
+ // XXX: These get executed even if they shouldn't be
+ bld->mkOp2(OP_ADD, fTy, rr[2], rr[0], one)
+ ->setPredicate(CC_C, cc[1]);
+ bld->mkMov(rr[3], rr[0])
+ ->setPredicate(CC_NC, cc[1]);
+ bld->mkOp2(OP_UNION, fTy, rr[4], rr[2], rr[3]);
+
+ // Merge the results from the negative and non-negative paths
+ bld->mkMov(rr[5], rr[4])
+ ->setPredicate(CC_S, cc[0]);
+ bld->mkMov(rr[6], r[4])
+ ->setPredicate(CC_NS, cc[0]);
+ bld->mkOp2(OP_UNION, mul->sType, mul->getDef(0), rr[5], rr[6]);
+ } else {
+ bld->mkMov(mul->getDef(0), r[4]);
+ }
} else {
bld->mkMov(mul->getDef(0), t[3]);
}
Program *prog = fn->getProgram();
r63 = new_LValue(fn, FILE_GPR);
- r63->reg.data.id = 63;
+ // GPR units on nv50 are in half-regs
+ if (prog->maxGPR < 126)
+ r63->reg.data.id = 63;
+ else
+ r63->reg.data.id = 127;
// this is actually per-program, but we can do it all on visiting main()
std::list<Instruction *> *outWrites =
next = hi;
}
- if (i->op != OP_MOV && i->op != OP_PFETCH &&
- i->op != OP_BAR &&
+ if (i->op != OP_PFETCH && i->op != OP_BAR &&
(!i->defExists(0) || i->def(0).getFile() != FILE_ADDRESS))
replaceZero(i);
}
return;
for (int s = 0; di->srcExists(s); ++s)
- if (di->src(s).getFile() == FILE_IMMEDIATE)
+ if (di->src(s).getFile() == FILE_IMMEDIATE ||
+ di->src(s).getFile() == FILE_MEMORY_LOCAL)
return;
if (prog->getType() == Program::TYPE_GEOMETRY) {
bool handleTXB(TexInstruction *); // I really
bool handleTXL(TexInstruction *); // hate
bool handleTXD(TexInstruction *); // these 3
+ bool handleTXLQ(TexInstruction *);
+ bool handleTXQ(TexInstruction *);
bool handleCALL(Instruction *);
bool handlePRECONT(Instruction *);
Value **ms_x, Value **ms_y) {
// This loads the texture-indexed ms setting from the constant buffer
Value *tmp = new_LValue(func, FILE_GPR);
- uint8_t b = prog->driver->io.resInfoCBSlot;
+ uint8_t b = prog->driver->io.auxCBSlot;
off += prog->driver->io.suInfoBase;
+ if (prog->getType() > Program::TYPE_VERTEX)
+ off += 16 * 2 * 4;
+ if (prog->getType() > Program::TYPE_GEOMETRY)
+ off += 16 * 2 * 4;
*ms_x = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
FILE_MEMORY_CONST, b, TYPE_U32, off + 0), NULL);
*ms_y = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
const int dref = arg;
const int lod = i->tex.target.isShadow() ? (arg + 1) : arg;
+ /* Only normalize in the non-explicit derivatives case.
+ */
+ if (i->tex.target.isCube() && i->op != OP_TXD) {
+ Value *src[3], *val;
+ int c;
+ for (c = 0; c < 3; ++c)
+ src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), i->getSrc(c));
+ val = bld.getScratch();
+ bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
+ bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
+ bld.mkOp1(OP_RCP, TYPE_F32, val, val);
+ for (c = 0; c < 3; ++c) {
+ i->setSrc(c, bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(),
+ i->getSrc(c), val));
+ }
+ }
+
// handle MS, which means looking up the MS params for this texture, and
// adjusting the input coordinates to point at the right sample.
if (i->tex.target.isMS()) {
// texel offsets are 3 immediate fields in the instruction,
// nv50 cannot do textureGatherOffsets
assert(i->tex.useOffsets <= 1);
+ if (i->tex.useOffsets) {
+ for (int c = 0; c < 3; ++c) {
+ ImmediateValue val;
+ if (!i->offset[0][c].getImmediate(val))
+ assert(!"non-immediate offset");
+ i->tex.offset[c] = val.reg.data.u32;
+ i->offset[0][c].set(NULL);
+ }
+ }
return true;
}
const CondCode cc[4] = { CC_EQU, CC_S, CC_C, CC_O };
int l, d;
+ // We can't actually apply bias *and* do a compare for a cube
+ // texture. Since the compare has to be done before the filtering, just
+ // drop the bias on the floor.
+ if (i->tex.target == TEX_TARGET_CUBE_SHADOW) {
+ i->op = OP_TEX;
+ i->setSrc(3, i->getSrc(4));
+ i->setSrc(4, NULL);
+ return handleTEX(i);
+ }
+
handleTEX(i);
Value *bias = i->getSrc(i->tex.target.getArgCount());
if (bias->isUniform())
}
Value *flags = bld.getScratch(1, FILE_FLAGS);
bld.setPosition(cond, true);
- bld.mkCvt(OP_CVT, TYPE_U8, flags, TYPE_U32, cond->getDef(0));
+ bld.mkCvt(OP_CVT, TYPE_U8, flags, TYPE_U32, cond->getDef(0))->flagsDef = 0;
Instruction *tex[4];
for (l = 0; l < 4; ++l) {
BasicBlock *joinBB = i->bb->splitAfter(i);
bld.setPosition(currBB, true);
+ assert(!currBB->joinAt);
currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);
for (int l = 0; l <= 3; ++l) {
}
}
bld.setPosition(joinBB, false);
- bld.mkOp(OP_JOIN, TYPE_NONE, NULL);
+ bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1;
return true;
}
Instruction *tex;
Value *zero = bld.loadImm(bld.getSSA(), 0);
int l, c;
- const int dim = i->tex.target.getDim();
+ const int dim = i->tex.target.getDim() + i->tex.target.isCube();
handleTEX(i);
i->op = OP_TEX; // no need to clone dPdx/dPdy later
+ i->tex.derivAll = true;
for (c = 0; c < dim; ++c)
crd[c] = bld.getScratch();
bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
for (l = 0; l < 4; ++l) {
+ Value *src[3], *val;
// mov coordinates from lane l to all lanes
for (c = 0; c < dim; ++c)
bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero);
// add dPdy from lane l to lanes dy
for (c = 0; c < dim; ++c)
bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
+ // normalize cube coordinates if necessary
+ if (i->tex.target.isCube()) {
+ for (c = 0; c < 3; ++c)
+ src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), crd[c]);
+ val = bld.getScratch();
+ bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
+ bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
+ bld.mkOp1(OP_RCP, TYPE_F32, val, val);
+ for (c = 0; c < 3; ++c)
+ src[c] = bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), crd[c], val);
+ } else {
+ for (c = 0; c < dim; ++c)
+ src[c] = crd[c];
+ }
// texture
bld.insert(tex = cloneForward(func, i));
for (c = 0; c < dim; ++c)
- tex->setSrc(c, crd[c]);
+ tex->setSrc(c, src[c]);
// save results
for (c = 0; i->defExists(c); ++c) {
Instruction *mov;
return true;
}
+bool
+NV50LoweringPreSSA::handleTXLQ(TexInstruction *i)
+{
+   // Apply handleTEX() to the instruction first, then move the builder to
+   // (i, true) so the fix-up instructions below are emitted relative to i.
+   handleTEX(i);
+   bld.setPosition(i, true);
+
+   /* The values returned by the instruction are not in the form we want.
+    * For each of the first two defs that exists:
+    *  (a) convert it from s32 to f32, then
+    *  (b) scale it by 1/256.
+    * Both steps read and write the same def.
+    */
+   for (int d = 0; d != 2; d++) {
+      if (i->defExists(d)) {
+         bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(d), TYPE_S32, i->getDef(d));
+         bld.mkOp2(OP_MUL, TYPE_F32, i->getDef(d), i->getDef(d),
+                   bld.loadImm(NULL, 1.0f / 256));
+      }
+   }
+   return true;
+}
+
+bool
+NV50LoweringPreSSA::handleTXQ(TexInstruction *i)
+{
+   // TXQ_DIMS queries are left untouched; only TXQ_TYPE (with mask == 4)
+   // is rewritten here.
+   if (i->tex.query != TXQ_DIMS) {
+      assert(i->tex.query == TXQ_TYPE);
+      assert(i->tex.mask == 4);
+
+      // Fetch the MS info for this texture (offset i->tex.r * 4 * 2); only
+      // the 'ms' value is used below.
+      Value *ms, *ms_x, *ms_y;
+      loadTexMsInfo(i->tex.r * 4 * 2, &ms, &ms_x, &ms_y);
+
+      // The query result is 1 << ms, written straight into def 0, after
+      // which the original query instruction is removed from its block.
+      Value *one = bld.loadImm(NULL, 1);
+      bld.mkOp2(OP_SHL, TYPE_U32, i->getDef(0), one, ms);
+      i->bb->remove(i);
+   }
+
+   return true;
+}
+
+
bool
NV50LoweringPreSSA::handleSET(Instruction *i)
{
case SV_FACE:
bld.mkInterp(NV50_IR_INTERP_FLAT, def, addr, NULL);
if (i->dType == TYPE_F32) {
- bld.mkOp2(OP_AND, TYPE_U32, def, def, bld.mkImm(0x80000000));
- bld.mkOp2(OP_XOR, TYPE_U32, def, def, bld.mkImm(0xbf800000));
+ bld.mkOp2(OP_OR, TYPE_U32, def, def, bld.mkImm(0x00000001));
+ bld.mkOp1(OP_NEG, TYPE_S32, def, def);
+ bld.mkCvt(OP_CVT, TYPE_F32, def, TYPE_S32, def);
}
break;
case SV_NCTAID:
bld.mkMov(def, bld.mkImm(0));
}
break;
+ case SV_SAMPLE_POS: {
+ Value *off = new_LValue(func, FILE_ADDRESS);
+ bld.mkOp1(OP_RDSV, TYPE_U32, def, bld.mkSysVal(SV_SAMPLE_INDEX, 0));
+ bld.mkOp2(OP_SHL, TYPE_U32, off, def, bld.mkImm(3));
+ bld.mkLoad(TYPE_F32,
+ def,
+ bld.mkSymbol(
+ FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
+ TYPE_U32, prog->driver->io.sampleInfoBase + 4 * idx),
+ off);
+ break;
+ }
default:
bld.mkFetch(i->getDef(0), i->dType,
FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), NULL);
bool
NV50LoweringPreSSA::handleSQRT(Instruction *i) // new form: i becomes RSQ, followed by an RCP on the same def
{
- Instruction *rsq = bld.mkOp1(OP_RSQ, TYPE_F32,
- bld.getSSA(), i->getSrc(0));
- i->op = OP_MUL;
- i->setSrc(1, rsq->getDef(0));
+ bld.setPosition(i, true); // presumably places new instructions after i (the RCP below reads i's def) -- confirm
+ i->op = OP_RSQ; // i itself is turned into the RSQ of its existing source
+ bld.mkOp1(OP_RCP, i->dType, i->getDef(0), i->getDef(0)); // def 0 = RCP(def 0), i.e. RCP of the RSQ result, in place
return true;
}
i->setDef(0, new_LValue(func, FILE_GPR));
i->getDef(0)->reg.data.id = id;
- prog->maxGPR = MAX2(prog->maxGPR, id);
+ prog->maxGPR = MAX2(prog->maxGPR, id * 2);
}
}
return true;
Value *pred = insn->getPredicate();
Value *cdst;
- if (!pred || pred->reg.file == FILE_FLAGS)
+ // FILE_PREDICATE will simply be changed to FLAGS on conversion to SSA
+ if (!pred ||
+ pred->reg.file == FILE_FLAGS || pred->reg.file == FILE_PREDICATE)
return;
+
cdst = bld.getSSA(1, FILE_FLAGS);
bld.mkCmp(OP_SET, CC_NEU, insn->dType, cdst, insn->dType, bld.loadImm(NULL, 0), pred);
return handleTXL(i->asTex());
case OP_TXD:
return handleTXD(i->asTex());
+ case OP_TXLQ:
+ return handleTXLQ(i->asTex());
+ case OP_TXQ:
+ return handleTXQ(i->asTex());
case OP_EX2:
bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
i->setSrc(0, i->getDef(0));