LoadPropagation::checkSwapSrc01(Instruction *insn)
{
const Target *targ = prog->getTarget();
- if (!targ->getOpInfo(insn).commutative)
- if (insn->op != OP_SET && insn->op != OP_SLCT)
+ if (!targ->getOpInfo(insn).commutative) {
+ if (insn->op != OP_SET && insn->op != OP_SLCT &&
+ insn->op != OP_SUB && insn->op != OP_XMAD)
return;
+ // XMAD is only commutative if both the CBCC and MRG flags are not set.
+ if (insn->op == OP_XMAD &&
+ (insn->subOp & NV50_IR_SUBOP_XMAD_CMODE_MASK) == NV50_IR_SUBOP_XMAD_CBCC)
+ return;
+ if (insn->op == OP_XMAD && (insn->subOp & NV50_IR_SUBOP_XMAD_MRG))
+ return;
+ }
if (insn->src(1).getFile() != FILE_GPR)
return;
+ // This is the special OP_SET used for alphatesting, we can't reverse its
+ // arguments as that will confuse the fixup code.
+ if (insn->op == OP_SET && insn->subOp)
+ return;
Instruction *i0 = insn->getSrc(0)->getInsn();
Instruction *i1 = insn->getSrc(1)->getInsn();
else
if (insn->op == OP_SLCT)
insn->asCmp()->setCond = inverseCondCode(insn->asCmp()->setCond);
+ else
+ if (insn->op == OP_SUB) {
+ insn->src(0).mod = insn->src(0).mod ^ Modifier(NV50_IR_MOD_NEG);
+ insn->src(1).mod = insn->src(1).mod ^ Modifier(NV50_IR_MOD_NEG);
+ } else
+ if (insn->op == OP_XMAD) {
+ // swap h1 flags
+ uint16_t h1 = (insn->subOp >> 1 & NV50_IR_SUBOP_XMAD_H1(0)) |
+ (insn->subOp << 1 & NV50_IR_SUBOP_XMAD_H1(1));
+ insn->subOp = (insn->subOp & ~NV50_IR_SUBOP_XMAD_H1_MASK) | h1;
+ }
}
bool
{
private:
virtual bool visit(BasicBlock *);
+
+ BuildUtil bld;
};
bool
for (Instruction *i = bb->getEntry(); i; i = next) {
next = i->next;
+ bld.setPosition(i, false);
+
for (int s = 0; i->srcExists(s); ++s) {
Instruction *insn;
ImmediateValue imm;
i->setIndirect(s, 0, NULL);
i->setSrc(s, cloneShallow(func, i->getSrc(s)));
i->src(s).get()->reg.data.offset += imm.reg.data.u32;
+ } else if (insn->op == OP_SHLADD) {
+ if (!insn->src(2).getImmediate(imm) ||
+ !targ->insnCanLoadOffset(i, s, imm.reg.data.s32))
+ continue;
+ i->setIndirect(s, 0, bld.mkOp2v(
+ OP_SHL, TYPE_U32, bld.getSSA(), insn->getSrc(0), insn->getSrc(1)));
+ i->setSrc(s, cloneShallow(func, i->getSrc(s)));
+ i->src(s).get()->reg.data.offset += imm.reg.data.u32;
}
}
}
void expr(Instruction *, ImmediateValue&, ImmediateValue&);
void expr(Instruction *, ImmediateValue&, ImmediateValue&, ImmediateValue&);
void opnd(Instruction *, ImmediateValue&, int s);
+ void opnd3(Instruction *, ImmediateValue&);
void unary(Instruction *, const ImmediateValue&);
else
if (i->srcExists(1) && i->src(1).getImmediate(src1))
opnd(i, src1, 1);
+ if (i->srcExists(2) && i->src(2).getImmediate(src2))
+ opnd3(i, src2);
}
return true;
}
if (!value)
return NULL;
Instruction *insn = value->getInsn();
+ if (!insn)
+ return NULL;
if (insn->asCmp() && insn->op != OP_SLCT)
return insn->asCmp();
return;
}
break;
+ case OP_SUB:
+ switch (i->dType) {
+ case TYPE_F32: res.data.f32 = a->data.f32 - b->data.f32; break;
+ case TYPE_F64: res.data.f64 = a->data.f64 - b->data.f64; break;
+ case TYPE_S32:
+ case TYPE_U32: res.data.u32 = a->data.u32 - b->data.u32; break;
+ default:
+ return;
+ }
+ break;
case OP_POW:
switch (i->dType) {
case TYPE_F32: res.data.f32 = pow(a->data.f32, b->data.f32); break;
// Leave PFETCH alone... we just folded its 2 args into 1.
break;
default:
- i->op = i->saturate ? OP_SAT : OP_MOV; /* SAT handled by unary() */
+ i->op = i->saturate ? OP_SAT : OP_MOV;
+ if (i->saturate)
+ unary(i, *i->getSrc(0)->asImm());
break;
}
i->subOp = 0;
}
break;
}
+ case OP_SHLADD:
+ res.data.u32 = (a->data.u32 << b->data.u32) + c->data.u32;
+ break;
default:
return;
}
}
}
+// Simplify a three-source instruction whose third source is an immediate
+// for which isInteger(0) holds:
+//   MAD/FMA(a, b, 0) -> MUL(a, b)
+//   SHLADD(a, b, 0)  -> SHL(a, b)
+// Other opcodes, or any other immediate, leave the instruction untouched.
+// Each successful rewrite bumps foldCount.
+void
+ConstantFolding::opnd3(Instruction *i, ImmediateValue &imm2)
+{
+   operation reduced;
+
+   // Pick the two-source opcode that remains once the third source is gone.
+   switch (i->op) {
+   case OP_MAD:
+   case OP_FMA:
+      reduced = OP_MUL;
+      break;
+   case OP_SHLADD:
+      reduced = OP_SHL;
+      break;
+   default:
+      return;
+   }
+
+   if (!imm2.isInteger(0))
+      return;
+
+   i->op = reduced;
+   i->setSrc(2, NULL);
+   foldCount++;
+}
+
void
ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
{
+ const Target *target = prog->getTarget();
const int t = !s;
const operation op = i->op;
Instruction *newi = i;
switch (i->op) {
+ case OP_SPLIT: {
+ bld.setPosition(i, false);
+
+ uint8_t size = i->getDef(0)->reg.size;
+ uint8_t bitsize = size * 8;
+ uint32_t mask = (1ULL << bitsize) - 1;
+ assert(bitsize <= 32);
+
+ uint64_t val = imm0.reg.data.u64;
+ for (int8_t d = 0; i->defExists(d); ++d) {
+ Value *def = i->getDef(d);
+ assert(def->reg.size == size);
+
+ newi = bld.mkMov(def, bld.mkImm((uint32_t)(val & mask)), TYPE_U32);
+ val >>= bitsize;
+ }
+ delete_Instruction(prog, i);
+ break;
+ }
case OP_MUL:
if (i->dType == TYPE_F32)
tryCollapseChainedMULs(i, s, imm0);
i->postFactor = 0;
}
break;
+ case OP_FMA:
case OP_MAD:
if (imm0.isInteger(0)) {
i->setSrc(0, i->getSrc(2));
i->src(1).mod = i->src(2).mod;
i->setSrc(2, NULL);
i->op = OP_ADD;
+ } else
+ if (s == 1 && !imm0.isNegative() && imm0.isPow2() &&
+ !isFloatType(i->dType) &&
+ target->isOpSupported(OP_SHLADD, i->dType) &&
+ !i->subOp) {
+ i->op = OP_SHLADD;
+ imm0.applyLog2();
+ i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32));
}
break;
+ case OP_SUB:
+ if (imm0.isInteger(0) && s == 0 && typeSizeof(i->dType) == 8 &&
+ !isFloatType(i->dType))
+ break;
+ /* fallthrough */
case OP_ADD:
if (i->usesFlags())
break;
if (s == 0) {
i->setSrc(0, i->getSrc(1));
i->src(0).mod = i->src(1).mod;
+ if (i->op == OP_SUB)
+ i->src(0).mod = i->src(0).mod ^ Modifier(NV50_IR_MOD_NEG);
}
i->setSrc(1, NULL);
i->op = i->src(0).mod.getOp();
break;
case OP_MOD:
- if (i->sType == TYPE_U32 && imm0.isPow2()) {
+ if (s == 1 && imm0.isPow2()) {
bld.setPosition(i, false);
- i->op = OP_AND;
- i->setSrc(1, bld.loadImm(NULL, imm0.reg.data.u32 - 1));
+ if (i->sType == TYPE_U32) {
+ i->op = OP_AND;
+ i->setSrc(1, bld.loadImm(NULL, imm0.reg.data.u32 - 1));
+ } else if (i->sType == TYPE_S32) {
+ // Do it on the absolute value of the input, and then restore the
+ // sign. The only odd case is MIN_INT, but that should work out
+ // as well, since MIN_INT mod any power of 2 is 0.
+ //
+ // Technically we don't have to do any of this since MOD is
+ // undefined with negative arguments in GLSL, but this seems like
+ // the nice thing to do.
+ Value *abs = bld.mkOp1v(OP_ABS, TYPE_S32, bld.getSSA(), i->getSrc(0));
+ Value *neg, *v1, *v2;
+ bld.mkCmp(OP_SET, CC_LT, TYPE_S32,
+ (neg = bld.getSSA(1, prog->getTarget()->nativeFile(FILE_PREDICATE))),
+ TYPE_S32, i->getSrc(0), bld.loadImm(NULL, 0));
+ Value *mod = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), abs,
+ bld.loadImm(NULL, imm0.reg.data.u32 - 1));
+ bld.mkOp1(OP_NEG, TYPE_S32, (v1 = bld.getSSA()), mod)
+ ->setPredicate(CC_P, neg);
+ bld.mkOp1(OP_MOV, TYPE_S32, (v2 = bld.getSSA()), mod)
+ ->setPredicate(CC_NOT_P, neg);
+ newi = bld.mkOp2(OP_UNION, TYPE_S32, i->getDef(0), v1, v2);
+
+ delete_Instruction(prog, i);
+ }
+ } else if (s == 1) {
+ // In this case, we still want the optimized lowering that we get
+ // from having division by an immediate.
+ //
+ // a % b == a - (a/b) * b
+ bld.setPosition(i, false);
+ Value *div = bld.mkOp2v(OP_DIV, i->sType, bld.getSSA(),
+ i->getSrc(0), i->getSrc(1));
+ newi = bld.mkOp2(OP_ADD, i->sType, i->getDef(0), i->getSrc(0),
+ bld.mkOp2v(OP_MUL, i->sType, bld.getSSA(), div, i->getSrc(1)));
+ // TODO: Check that target supports this. In this case, we know that
+ // all backends do.
+ newi->src(1).mod = Modifier(NV50_IR_MOD_NEG);
+
+ delete_Instruction(prog, i);
}
break;
src->op == OP_SHR &&
src->src(1).getImmediate(imm1) &&
i->src(t).mod == Modifier(0) &&
- util_is_power_of_two(imm0.reg.data.u32 + 1)) {
+ util_is_power_of_two_or_zero(imm0.reg.data.u32 + 1)) {
// low byte = offset, high byte = width
uint32_t ext = (util_last_bit(imm0.reg.data.u32) << 8) | imm1.reg.data.u32;
i->op = OP_EXTBF;
i->setSrc(0, src->getSrc(0));
i->setSrc(1, new_ImmediateValue(prog, ext));
+ } else if (src->op == OP_SHL &&
+ src->src(1).getImmediate(imm1) &&
+ i->src(t).mod == Modifier(0) &&
+ util_is_power_of_two_or_zero(~imm0.reg.data.u32 + 1) &&
+ util_last_bit(~imm0.reg.data.u32) <= imm1.reg.data.u32) {
+ i->op = OP_MOV;
+ i->setSrc(s, NULL);
+ if (t) {
+ i->setSrc(0, i->getSrc(t));
+ i->setSrc(t, NULL);
+ }
}
}
break;
default:
return;
}
+
+ // This can get left behind some of the optimizations which simplify
+ // saturatable values.
+ if (newi->op == OP_MOV && newi->saturate) {
+ ImmediateValue tmp;
+ newi->saturate = 0;
+ newi->op = OP_SAT;
+ if (newi->src(0).getImmediate(tmp))
+ unary(newi, tmp);
+ }
+
if (newi->op != op)
foldCount++;
}
// SLCT(a, b, const) -> cc(const) ? a : b
// RCP(RCP(a)) -> a
// MUL(MUL(a, b), const) -> MUL_Xconst(a, b)
+// EXTBF(RDSV(COMBINED_TID)) -> RDSV(TID)
class AlgebraicOpt : public Pass
{
private:
void handleCVT_CVT(Instruction *);
void handleCVT_EXTBF(Instruction *);
void handleSUCLAMP(Instruction *);
+ void handleNEG(Instruction *);
+ void handleEXTBF_RDSV(Instruction *);
BuildUtil bld;
};
return false;
bool changed = false;
- if (!changed && prog->getTarget()->isOpSupported(OP_MAD, add->dType))
+ // we can't optimize to MAD if the add is precise
+ if (!add->precise && prog->getTarget()->isOpSupported(OP_MAD, add->dType))
changed = tryADDToMADOrSAD(add, OP_MAD);
if (!changed && prog->getTarget()->isOpSupported(OP_SAD, add->dType))
changed = tryADDToMADOrSAD(add, OP_SAD);
if (src->getUniqueInsn() && src->getUniqueInsn()->bb != add->bb)
return false;
- if (src->getInsn()->postFactor)
+ if (src->getInsn()->saturate || src->getInsn()->postFactor ||
+ src->getInsn()->dnz || src->getInsn()->precise)
return false;
+
if (toOp == OP_SAD) {
ImmediateValue imm;
if (!src->getInsn()->src(2).getImmediate(imm))
add->op = toOp;
add->subOp = src->getInsn()->subOp; // potentially mul-high
+ add->dnz = src->getInsn()->dnz;
add->dType = src->getInsn()->dType; // sign matters for imad hi
add->sType = src->getInsn()->sType;
}
}
+// rcp(rcp(a)) = a
+// rcp(sqrt(a)) = rsq(a)
void
AlgebraicOpt::handleRCP(Instruction *rcp)
{
Instruction *si = rcp->getSrc(0)->getUniqueInsn();
- if (si && si->op == OP_RCP) {
+ if (!si) // the operand has no unique defining instruction: nothing to fold
+ return;
+
+ if (si->op == OP_RCP) {
Modifier mod = rcp->src(0).mod * si->src(0).mod;
rcp->op = mod.getOp();
rcp->setSrc(0, si->getSrc(0));
+ } else if (si->op == OP_SQRT) {
+ rcp->op = OP_RSQ; // replace the RCP-of-SQRT pair by a single RSQ on the SQRT's operand
+ rcp->setSrc(0, si->getSrc(0));
+ rcp->src(0).mod = rcp->src(0).mod * si->src(0).mod; // combine the RCP and SQRT source modifiers
}
}
insn->setSrc(0, add->getSrc(s));
}
+// NEG(AND(SET, 1)) -> SET
+//
+// For a non-float NEG whose operand is an AND with the immediate 1, and
+// whose other AND operand is produced by a non-float SET / SET_AND / SET_OR /
+// SET_XOR, all uses of the NEG result are redirected to the SET result.
+// NOTE(review): presumably valid because an integer SET already yields 0 or
+// all-ones -- confirm against the SET definition.
+void
+AlgebraicOpt::handleNEG(Instruction *i) {
+   Instruction *src = i->getSrc(0)->getInsn();
+   ImmediateValue imm;
+   int b;
+
+   if (isFloatType(i->sType) || !src || src->op != OP_AND)
+      return;
+
+   // b is the index of the AND operand that is *not* the immediate.
+   if (src->src(0).getImmediate(imm))
+      b = 1;
+   else if (src->src(1).getImmediate(imm))
+      b = 0;
+   else
+      return;
+
+   if (!imm.isInteger(1))
+      return;
+
+   // The remaining operand may have no defining instruction (getInsn() can
+   // return NULL, as already handled for src above), so check it before use.
+   Instruction *set = src->getSrc(b)->getInsn();
+   if (!set)
+      return;
+
+   if ((set->op == OP_SET || set->op == OP_SET_AND ||
+        set->op == OP_SET_OR || set->op == OP_SET_XOR) &&
+       !isFloatType(set->dType)) {
+      i->def(0).replace(set->getDef(0), false);
+   }
+}
+
+// EXTBF(RDSV(COMBINED_TID)) -> RDSV(TID)
+//
+// When an EXTBF extracts one of three specific bitfields from a
+// SV_COMBINED_TID read, the EXTBF is replaced by a direct read of SV_TID
+// with the matching index. Elsewhere in this file the EXTBF immediate is
+// built as "low byte = offset, high byte = width", so the accepted values
+// are: 0x1000 (16 bits at 0) -> index 0, 0x0a10 (10 bits at 16) -> index 1,
+// 0x061a (6 bits at 26) -> index 2.
+void
+AlgebraicOpt::handleEXTBF_RDSV(Instruction *i)
+{
+   // getUniqueInsn() yields NULL when the source has no single defining
+   // instruction; this handler runs for every EXTBF, so guard against it.
+   Instruction *rdsv = i->getSrc(0)->getUniqueInsn();
+   if (!rdsv || rdsv->op != OP_RDSV ||
+       rdsv->getSrc(0)->asSym()->reg.data.sv.sv != SV_COMBINED_TID)
+      return;
+   // Avoid creating more RDSV instructions
+   if (rdsv->getDef(0)->refCount() > 1)
+      return;
+
+   ImmediateValue imm;
+   if (!i->src(1).getImmediate(imm))
+      return;
+
+   int index;
+   if (imm.isInteger(0x1000))
+      index = 0;
+   else
+   if (imm.isInteger(0x0a10))
+      index = 1;
+   else
+   if (imm.isInteger(0x061a))
+      index = 2;
+   else
+      return;
+
+   bld.setPosition(i, false);
+
+   // Turn the EXTBF itself into the RDSV so its definition is kept as is.
+   i->op = OP_RDSV;
+   i->setSrc(0, bld.mkSysVal(SV_TID, index));
+   i->setSrc(1, NULL);
+}
+
bool
AlgebraicOpt::visit(BasicBlock *bb)
{
case OP_SUCLAMP:
handleSUCLAMP(i);
break;
+ case OP_NEG:
+ handleNEG(i);
+ break;
+ case OP_EXTBF:
+ handleEXTBF_RDSV(i);
+ break;
default:
break;
}
// =============================================================================
+// Rewrites performed by this pass:
+//   ADD(SHL(a, b), c) -> SHLADD(a, b, c)
+//   MUL(a, b)         -> a few XMADs
+//   MAD/FMA(a, b, c)  -> a few XMADs
+class LateAlgebraicOpt : public Pass
+{
+private:
+   // Builder used by handleMULMAD() to create the replacement instructions.
+   BuildUtil bld;
+
+   // Per-opcode handlers called from visit().
+   void handleADD(Instruction *);
+   void handleMULMAD(Instruction *);
+
+   // Helper of handleADD(); returns true when the ADD was rewritten.
+   bool tryADDToSHLADD(Instruction *);
+
+   virtual bool visit(Instruction *);
+};
+
+// Late handling of ADD: when both sources live in GPRs and the target
+// supports SHLADD for the ADD's type, try to fuse a feeding SHL into it.
+void
+LateAlgebraicOpt::handleADD(Instruction *add)
+{
+   const bool bothGPR = add->getSrc(0)->reg.file == FILE_GPR &&
+                        add->getSrc(1)->reg.file == FILE_GPR;
+
+   if (bothGPR && prog->getTarget()->isOpSupported(OP_SHLADD, add->dType))
+      tryADDToSHLADD(add);
+}
+
+// ADD(SHL(a, b), c) -> SHLADD(a, b, c)
+//
+// The rewrite is refused for saturating, flag-using, 64-bit or float ADDs,
+// and requires that the SHL is in the same basic block, uses no flags, has
+// no subOp, has no modifier on its first source and shifts by an immediate.
+// Returns true when the ADD was turned into a SHLADD.
+bool
+LateAlgebraicOpt::tryADDToSHLADD(Instruction *add)
+{
+   ImmediateValue shift;
+
+   if (add->saturate || add->usesFlags())
+      return false;
+   if (typeSizeof(add->dType) == 8 || isFloatType(add->dType))
+      return false;
+
+   // Pick the first ADD source that is uniquely defined by a SHL.
+   Instruction *shl = NULL;
+   int shlIdx;
+   for (shlIdx = 0; shlIdx < 2; ++shlIdx) {
+      Instruction *def = add->getSrc(shlIdx)->getUniqueInsn();
+      if (def && def->op == OP_SHL) {
+         shl = def;
+         break;
+      }
+   }
+   if (!shl)
+      return false;
+
+   if (shl->bb != add->bb || shl->usesFlags() || shl->subOp || shl->src(0).mod)
+      return false;
+
+   if (!shl->src(1).getImmediate(shift))
+      return false;
+
+   add->op = OP_SHLADD;
+   // The operand that is not the SHL becomes the addend (source 2).
+   add->setSrc(2, add->src(shlIdx ^ 1));
+   // SHL can't have any modifiers, but the ADD source may have had
+   // one. Preserve it.
+   add->setSrc(0, shl->getSrc(0));
+   if (shlIdx == 1)
+      add->src(0).mod = add->src(1).mod;
+   add->setSrc(1, new_ImmediateValue(shl->bb->getProgram(), shift.reg.data.u32));
+   add->src(1).mod = Modifier(0);
+
+   return true;
+}
+
+// MUL(a, b) -> a few XMADs
+// MAD/FMA(a, b, c) -> a few XMADs
+//
+// Applies only when the target supports XMAD and the instruction is a
+// 32-bit, non-float operation with no subOp and no flag use/definition.
+// Two new XMADs are created through bld and the original instruction is
+// itself turned into the third one, consuming their results.
+void
+LateAlgebraicOpt::handleMULMAD(Instruction *i)
+{
+   // TODO: handle NV50_IR_SUBOP_MUL_HIGH
+   if (!prog->getTarget()->isOpSupported(OP_XMAD, TYPE_U32))
+      return;
+   if (isFloatType(i->dType) || typeSizeof(i->dType) != 4)
+      return;
+   if (i->subOp || i->usesFlags() || i->flagsDef >= 0)
+      return;
+
+   assert(!i->src(0).mod);
+   assert(!i->src(1).mod);
+   assert(i->op == OP_MUL ? 1 : !i->src(2).mod);
+
+   bld.setPosition(i, false);
+
+   Value *lhs = i->getSrc(0);
+   Value *rhs = i->getSrc(1);
+   // A plain MUL is treated as a MAD with an immediate 0 addend.
+   Value *addend = i->op == OP_MUL ? bld.mkImm(0) : i->getSrc(2);
+
+   Value *partial = bld.getSSA();
+   Value *merged = bld.getSSA();
+
+   // First XMAD: default subOp, carries the addend.
+   Instruction *xmad = bld.mkOp3(OP_XMAD, TYPE_U32, partial, rhs, lhs, addend);
+   xmad->setPredicate(i->cc, i->getPredicate());
+
+   // Second XMAD: MRG | H1(1), with an immediate 0 addend.
+   xmad = bld.mkOp3(OP_XMAD, TYPE_U32, merged, rhs, lhs, bld.mkImm(0));
+   xmad->setPredicate(i->cc, i->getPredicate());
+   xmad->subOp = NV50_IR_SUBOP_XMAD_MRG | NV50_IR_SUBOP_XMAD_H1(1);
+
+   // Detach the predicate while the sources are rewritten, then restore it.
+   // NOTE(review): presumably needed because the predicate occupies a
+   // source slot -- confirm.
+   Value *savedPred = i->getPredicate();
+   i->setPredicate(i->cc, NULL);
+
+   i->op = OP_XMAD;
+   i->setSrc(0, rhs);
+   i->setSrc(1, merged);
+   i->setSrc(2, partial);
+   i->subOp = NV50_IR_SUBOP_XMAD_PSL | NV50_IR_SUBOP_XMAD_CBCC |
+              NV50_IR_SUBOP_XMAD_H1(0) | NV50_IR_SUBOP_XMAD_H1(1);
+
+   i->setPredicate(i->cc, savedPred);
+}
+
+// Dispatch one instruction to the matching late algebraic handler.
+// Always returns true.
+bool
+LateAlgebraicOpt::visit(Instruction *i)
+{
+   const operation op = i->op;
+
+   if (op == OP_ADD)
+      handleADD(i);
+   else if (op == OP_MUL || op == OP_MAD || op == OP_FMA)
+      handleMULMAD(i);
+
+   return true;
+}
+
+// =============================================================================
+
+// Pass that splits 64-bit integer MUL and MAD into 32-bit operations.
+class Split64BitOpPreRA : public Pass
+{
+private:
+   // Builder used to emit the 32-bit replacement sequence.
+   BuildUtil bld;
+
+   // Replaces one 64-bit MUL/MAD; the DataType is the 32-bit half type.
+   void split64MulMad(Function *, Instruction *, DataType);
+
+   virtual bool visit(BasicBlock *);
+};
+
+// Walk every instruction of the block and split each MUL/MAD whose
+// destination type is U64 or S64 into 32-bit operations.
+// Always returns true.
+bool
+Split64BitOpPreRA::visit(BasicBlock *bb)
+{
+   Instruction *next;
+
+   for (Instruction *i = bb->getEntry(); i; i = next) {
+      // split64MulMad() deletes i, so remember its successor up front.
+      next = i->next;
+
+      // 32-bit type with the same signedness as the 64-bit destination.
+      DataType hTy;
+      switch (i->dType) {
+      case TYPE_U64: hTy = TYPE_U32; break;
+      case TYPE_S64: hTy = TYPE_S32; break;
+      default:
+         continue;
+      }
+
+      if (i->op == OP_MAD || i->op == OP_MUL)
+         split64MulMad(func, i, hTy);
+   }
+
+   return true;
+}
+
+// Replace the 64-bit integer MUL/MAD `i` by a sequence of 32-bit operations
+// of type hTy whose two results are merged into i's original definition;
+// `i` is deleted afterwards.
+void
+Split64BitOpPreRA::split64MulMad(Function *fn, Instruction *i, DataType hTy)
+{
+   assert(i->op == OP_MAD || i->op == OP_MUL);
+   assert(!isFloatType(i->dType) && !isFloatType(i->sType));
+   assert(typeSizeof(hTy) == 4);
+
+   const bool isMad = i->op == OP_MAD;
+
+   bld.setPosition(i, true);
+
+   Value *zero = bld.mkImm(0u);
+   Value *carry = bld.getSSA(1, FILE_FLAGS);
+
+   // We want to compute `d = a * b (+ c)?`, where a, b, c and d are 64-bit
+   // values (a, b and c might be 32-bit values), using 32-bit operations. This
+   // gives the following operations:
+   // * `d.low = low(a.low * b.low) (+ c.low)?`
+   // * `d.high = low(a.high * b.low) + low(a.low * b.high)
+   //           + high(a.low * b.low) (+ c.high)?`
+   //
+   // To compute the high bits, we can split in the following operations:
+   // * `tmp1   = low(a.high * b.low) (+ c.high)?`
+   // * `tmp2   = low(a.low * b.high) + tmp1`
+   // * `d.high = high(a.low * b.low) + tmp2`
+
+   // half[s] holds source s as { low, high }: mkSplit puts the lower bits at
+   // index 0 and the higher bits at index 1. A source that is not 8 bytes
+   // wide is used as the low half with the immediate zero as its high half.
+   // Source 2 only exists for MAD and stays NULL for MUL.
+   Value *half[3][2] = { { NULL, NULL }, { NULL, NULL }, { NULL, NULL } };
+   const int srcCount = isMad ? 3 : 2;
+   for (int s = 0; s < srcCount; ++s) {
+      Value *operand = i->getSrc(s);
+      if (operand->reg.size == 8) {
+         bld.mkSplit(half[s], 4, operand);
+      } else {
+         half[s][0] = operand;
+         half[s][1] = zero;
+      }
+   }
+   Value **a = half[0];
+   Value **b = half[1];
+   Value **c = half[2];
+
+   // tmp1 = low(a.high * b.low) (+ c.high)?
+   Value *hiStep1 = bld.getSSA();
+   if (isMad)
+      bld.mkOp3(OP_MAD, hTy, hiStep1, a[1], b[0], c[1]);
+   else
+      bld.mkOp2(OP_MUL, hTy, hiStep1, a[1], b[0]);
+
+   // tmp2 = low(a.low * b.high) + tmp1
+   Value *hiStep2 = bld.mkOp3v(OP_MAD, hTy, bld.getSSA(), a[0], b[1], hiStep1);
+
+   Value *res[2] = { bld.getSSA(), bld.getSSA() };
+
+   // d.low. For a MAD the low-part addition also defines the carry flag,
+   // which is consumed by the high part below. It is not needed for a MUL,
+   // since high(a.low * b.low) is added to d.high anyway.
+   if (isMad)
+      bld.mkOp3(OP_MAD, hTy, res[0], a[0], b[0], c[0])->setFlagsDef(1, carry);
+   else
+      bld.mkOp2(OP_MUL, hTy, res[0], a[0], b[0]);
+
+   // d.high = high(a.low * b.low) + tmp2 (+ carry for a MAD)
+   Instruction *hiFinal = bld.mkOp3(OP_MAD, hTy, res[1], a[0], b[0], hiStep2);
+   hiFinal->subOp = NV50_IR_SUBOP_MUL_HIGH;
+   if (isMad)
+      hiFinal->setFlagsSrc(3, carry);
+
+   bld.mkOp2(OP_MERGE, i->dType, i->getDef(0), res[0], res[1]);
+
+   delete_Instruction(fn->getProgram(), i);
+}
+
+// =============================================================================
+
static inline void
updateLdStOffset(Instruction *ldst, int32_t offset, Function *fn)
{
if (((size == 0x8) && (MIN2(offLd, offRc) & 0x7)) ||
((size == 0xc) && (MIN2(offLd, offRc) & 0xf)))
return false;
+ // for compute indirect loads are not guaranteed to be aligned
+ if (prog->getType() == Program::TYPE_COMPUTE && rec->rel[0])
+ return false;
assert(sizeRc + sizeLd <= 16 && offRc != offLd);
+ // lock any stores that overlap with the load being merged into the
+ // existing record.
+ lockStores(ld);
+
for (j = 0; sizeRc; sizeRc -= rec->insn->getDef(j)->reg.size, ++j);
if (offLd < offRc) {
if (!prog->getTarget()->
isAccessSupported(st->getSrc(0)->reg.file, typeOfSize(size)))
return false;
+ // no unaligned stores
if (size == 8 && MIN2(offRc, offSt) & 0x7)
return false;
+ // for compute indirect stores are not guaranteed to be aligned
+ if (prog->getType() == Program::TYPE_COMPUTE && rec->rel[0])
+ return false;
+
+ // remove any existing load/store records for the store being merged into
+ // the existing record.
+ purgeRecords(st, DATA_FILE_COUNT);
st->takeExtraSources(0, extra); // save predicate and indirect address
Record *it = load ? loads[sym->reg.file] : stores[sym->reg.file];
for (; it; it = it->next) {
- if (it->locked && insn->op != OP_LOAD)
+ if (it->locked && insn->op != OP_LOAD && insn->op != OP_VFETCH)
continue;
if ((it->offset >> 4) != (sym->reg.data.offset >> 4) ||
it->rel[0] != insn->getIndirect(0, 0) ||
// get non-replaced sources after values covered by st
for (; offR < endR; offR += ri->getSrc(s)->reg.size, ++s)
vals[k++] = ri->getSrc(s);
- assert((unsigned int)k <= Elements(vals));
+ assert((unsigned int)k <= ARRAY_SIZE(vals));
for (s = 0; s < k; ++s)
st->setSrc(s + 1, vals[s]);
st->setSrc(0, ri->getSrc(0));
Record that;
that.set(ldst);
- if (this->fileIndex != that.fileIndex)
+ // This assumes that images/buffers can't overlap. They can.
+ // TODO: Plumb the restrict logic through, and only skip when it's a
+ // restrict situation, or there can implicitly be no writes.
+ if (this->fileIndex != that.fileIndex && this->rel[1] == that.rel[1])
return false;
if (this->rel[0] || that.rel[0])
return this->base == that.base;
+
return
(this->offset < that.offset + that.size) &&
(this->offset + this->size > that.offset);
}
} else
if (ldst->op == OP_STORE || ldst->op == OP_EXPORT) {
+ if (typeSizeof(ldst->dType) == 4 &&
+ ldst->src(1).getFile() == FILE_GPR &&
+ ldst->getSrc(1)->getInsn()->op == OP_NOP) {
+ delete_Instruction(prog, ldst);
+ continue;
+ }
isLoad = false;
} else {
// TODO: maybe have all fixed ops act as barrier ?
insn = insn->prev;
if (insn && !insn->getPredicate() &&
!insn->asFlow() &&
+ insn->op != OP_DISCARD &&
insn->op != OP_TEXBAR &&
!isTextureOp(insn->op) && // probably just nve4
!isSurfaceOp(insn->op) && // not confirmed
insn->op != OP_LINTERP && // probably just nve4
insn->op != OP_PINTERP && // probably just nve4
- ((insn->op != OP_LOAD && insn->op != OP_STORE) ||
+ ((insn->op != OP_LOAD && insn->op != OP_STORE && insn->op != OP_ATOM) ||
(typeSizeof(insn->dType) <= 4 && !insn->src(0).isIndirect(0))) &&
!insn->isNop()) {
insn->join = 1;
// constraint SDST == SSRC2
// TODO:
// Does NVC0+ have other situations where this pass makes sense?
-class NV50PostRaConstantFolding : public Pass
+class PostRaLoadPropagation : public Pass // dispatches MAD/FMA to a per-chipset immediate-folding handler
{
private:
- virtual bool visit(BasicBlock *);
+ virtual bool visit(Instruction *);
+
+ void handleMADforNV50(Instruction *); // used when the chipset is below 0xc0
+ void handleMADforNVC0(Instruction *); // used for chipset 0xc0 and above
};
static bool
return true;
}
-bool
-NV50PostRaConstantFolding::visit(BasicBlock *bb)
+// Fold Immediate into MAD; must be done after register allocation due to
+// constraint SDST == SSRC2
+void
+PostRaLoadPropagation::handleMADforNV50(Instruction *i)
{
+ if (i->def(0).getFile() != FILE_GPR ||
+ i->src(0).getFile() != FILE_GPR ||
+ i->src(1).getFile() != FILE_GPR ||
+ i->src(2).getFile() != FILE_GPR ||
+ i->getDef(0)->reg.data.id != i->getSrc(2)->reg.data.id)
+ return; // everything must be in GPRs and the destination register must equal source 2's
+
+ if (i->getDef(0)->reg.data.id >= 64 ||
+ i->getSrc(0)->reg.data.id >= 64)
+ return; // destination and source 0 must both have a register id below 64
+
+ if (i->flagsSrc >= 0 && i->getSrc(i->flagsSrc)->reg.data.id != 0)
+ return; // a flags source is only accepted when it has id 0
+
+ if (i->getPredicate())
+ return; // predicated instructions are left alone
+
Value *vtmp;
- Instruction *def;
+ Instruction *def = i->getSrc(1)->getInsn();
+
+ if (def && def->op == OP_SPLIT && typeSizeof(def->sType) == 4) // look through a split of a 4-byte value
+ def = def->getSrc(0)->getInsn();
+ if (def && def->op == OP_MOV && def->src(0).getFile() == FILE_IMMEDIATE) {
+ vtmp = i->getSrc(1); // remember the replaced value so its now-unused producer can be removed below
+ if (isFloatType(i->sType)) {
+ i->setSrc(1, def->getSrc(0));
+ } else {
+ ImmediateValue val;
+ // getImmediate() has side-effects on the argument so this *shouldn't*
+ // be folded into the assert()
+ MAYBE_UNUSED bool ret = def->src(0).getImmediate(val);
+ assert(ret);
+ if (i->getSrc(1)->reg.data.id & 1) // odd register id: take the upper 16 bits of the immediate
+ val.reg.data.u32 >>= 16;
+ val.reg.data.u32 &= 0xffff;
+ i->setSrc(1, new_ImmediateValue(prog, val.reg.data.u32));
+ }
- for (Instruction *i = bb->getFirst(); i; i = i->next) {
- switch (i->op) {
- case OP_MAD:
- if (i->def(0).getFile() != FILE_GPR ||
- i->src(0).getFile() != FILE_GPR ||
- i->src(1).getFile() != FILE_GPR ||
- i->src(2).getFile() != FILE_GPR ||
- i->getDef(0)->reg.data.id != i->getSrc(2)->reg.data.id)
- break;
+ /* There's no post-RA dead code elimination, so do it here
+ * XXX: if we add more code-removing post-RA passes, we might
+ * want to create a post-RA dead-code elim pass */
+ if (post_ra_dead(vtmp->getInsn())) {
+ Value *src = vtmp->getInsn()->getSrc(0); // fetched before the producer may be deleted
+ // Careful -- splits will have already been removed from the
+ // functions. Don't double-delete.
+ if (vtmp->getInsn()->bb)
+ delete_Instruction(prog, vtmp->getInsn());
+ if (src->getInsn() && post_ra_dead(src->getInsn()))
+ delete_Instruction(prog, src->getInsn());
+ }
+ }
+}
- if (i->getDef(0)->reg.data.id >= 64 ||
- i->getSrc(0)->reg.data.id >= 64)
- break;
+void
+PostRaLoadPropagation::handleMADforNVC0(Instruction *i)
+{
+ if (i->def(0).getFile() != FILE_GPR ||
+ i->src(0).getFile() != FILE_GPR ||
+ i->src(1).getFile() != FILE_GPR ||
+ i->src(2).getFile() != FILE_GPR ||
+ i->getDef(0)->reg.data.id != i->getSrc(2)->reg.data.id)
+ return; // everything must be in GPRs and the destination register must equal source 2's
- if (i->flagsSrc >= 0 && i->getSrc(i->flagsSrc)->reg.data.id != 0)
- break;
+ // TODO: gm107 can also do this for S32, maybe other chipsets as well
+ if (i->dType != TYPE_F32)
+ return;
- if (i->getPredicate())
- break;
+ if ((i->src(2).mod | Modifier(NV50_IR_MOD_NEG)) != Modifier(NV50_IR_MOD_NEG))
+ return; // source 2 may carry no modifier other than NEG
- def = i->getSrc(1)->getInsn();
- if (def && def->op == OP_SPLIT && typeSizeof(def->sType) == 4)
- def = def->getSrc(0)->getInsn();
- if (def && def->op == OP_MOV && def->src(0).getFile() == FILE_IMMEDIATE) {
- vtmp = i->getSrc(1);
- if (isFloatType(i->sType)) {
- i->setSrc(1, def->getSrc(0));
- } else {
- ImmediateValue val;
- bool ret = def->src(0).getImmediate(val);
- assert(ret);
- if (i->getSrc(1)->reg.data.id & 1)
- val.reg.data.u32 >>= 16;
- val.reg.data.u32 &= 0xffff;
- i->setSrc(1, new_ImmediateValue(bb->getProgram(), val.reg.data.u32));
- }
+ ImmediateValue val;
+ int s; // index of the multiplicand that is NOT the immediate
- /* There's no post-RA dead code elimination, so do it here
- * XXX: if we add more code-removing post-RA passes, we might
- * want to create a post-RA dead-code elim pass */
- if (post_ra_dead(vtmp->getInsn())) {
- Value *src = vtmp->getInsn()->getSrc(0);
- // Careful -- splits will have already been removed from the
- // functions. Don't double-delete.
- if (vtmp->getInsn()->bb)
- delete_Instruction(prog, vtmp->getInsn());
- if (src->getInsn() && post_ra_dead(src->getInsn()))
- delete_Instruction(prog, src->getInsn());
- }
+ if (i->src(0).getImmediate(val))
+ s = 1;
+ else if (i->src(1).getImmediate(val))
+ s = 0;
+ else
+ return;
- break;
- }
- break;
- default:
- break;
- }
+ if ((i->src(s).mod | Modifier(NV50_IR_MOD_NEG)) != Modifier(NV50_IR_MOD_NEG))
+ return; // the non-immediate multiplicand may carry no modifier other than NEG
+
+ if (s == 1) // the immediate is in source 0: swap so that it ends up in source 1
+ i->swapSources(0, 1);
+
+ Instruction *imm = i->getSrc(1)->getInsn();
+ i->setSrc(1, imm->getSrc(0)); // use the defining instruction's own source directly
+ if (post_ra_dead(imm))
+ delete_Instruction(prog, imm);
+}
+
+bool
+PostRaLoadPropagation::visit(Instruction *i)
+{
+ switch (i->op) {
+ case OP_FMA:
+ case OP_MAD:
+ if (prog->getTarget()->getChipset() < 0xc0)
+ handleMADforNV50(i);
+ else
+ handleMADforNVC0(i);
+ break;
+ default:
+ break;
}
return true;
} else
if (this->asFlow()) {
return false;
+ } else
+ if (this->op == OP_PHI && this->bb != that->bb) {
+ /* TODO: we could probably be a bit smarter here by following the
+ * control flow, but honestly, it is quite painful to check */
+ return false;
} else {
if (this->ipa != that->ipa ||
this->lanes != that->lanes ||
if (that->srcExists(s))
return false;
- if (op == OP_LOAD || op == OP_VFETCH) {
+ if (op == OP_LOAD || op == OP_VFETCH || op == OP_ATOM) {
switch (src(0).getFile()) {
case FILE_MEMORY_CONST:
case FILE_SHADER_INPUT:
ik = phi->getSrc(0)->getInsn();
if (!ik)
continue; // probably a function input
+ if (ik->defCount(0xff) > 1)
+ continue; // too painful to check if we can really push this forward
for (s = 1; phi->srcExists(s); ++s) {
if (phi->getSrc(s)->refCount() > 1)
break;
break;
}
if (!phi->srcExists(s)) {
+ assert(ik->op != OP_PHI);
Instruction *entry = bb->getEntry();
ik->bb->remove(ik);
if (!entry || entry->op != OP_JOIN)
for (ir = bb->getFirst(); ir; ir = ir->next)
ir->serial = serial++;
- for (ir = bb->getEntry(); ir; ir = next) {
+ for (ir = bb->getFirst(); ir; ir = next) {
int s;
Value *src = NULL;
++deadCount;
delete_Instruction(prog, i);
} else
- if (i->defExists(1) && (i->op == OP_VFETCH || i->op == OP_LOAD)) {
+ if (i->defExists(1) &&
+ i->subOp == 0 &&
+ (i->op == OP_VFETCH || i->op == OP_LOAD)) {
checkSplitLoad(i);
} else
if (i->defExists(0) && !i->getDef(0)->refCount()) {
if (i->op == OP_ATOM ||
i->op == OP_SUREDP ||
- i->op == OP_SUREDB)
+ i->op == OP_SUREDB) {
i->setDef(0, NULL);
+ if (i->op == OP_ATOM && i->subOp == NV50_IR_SUBOP_ATOM_EXCH) {
+ i->cache = CACHE_CV;
+ i->op = OP_STORE;
+ i->subOp = 0;
+ }
+ } else if (i->op == OP_LOAD && i->subOp == NV50_IR_SUBOP_LOAD_LOCKED) {
+ i->setDef(0, i->getDef(1));
+ i->setDef(1, NULL);
+ }
}
}
return true;
RUN_PASS(2, AlgebraicOpt, run);
RUN_PASS(2, ModifierFolding, run); // before load propagation -> less checks
RUN_PASS(1, ConstantFolding, foldAll);
+ RUN_PASS(1, Split64BitOpPreRA, run);
+ RUN_PASS(2, LateAlgebraicOpt, run);
RUN_PASS(1, LoadPropagation, run);
RUN_PASS(1, IndirectPropagation, run);
RUN_PASS(2, MemoryOpt, run);
Program::optimizePostRA(int level)
{
RUN_PASS(2, FlatteningPass, run);
- if (getTarget()->getChipset() < 0xc0)
- RUN_PASS(2, NV50PostRaConstantFolding, run);
+ RUN_PASS(2, PostRaLoadPropagation, run);
return true;
}