void
LoadPropagation::checkSwapSrc01(Instruction *insn)
{
const Target *targ = prog->getTarget();
- if (!targ->getOpInfo(insn).commutative)
- if (insn->op != OP_SET && insn->op != OP_SLCT && insn->op != OP_SUB)
+ if (!targ->getOpInfo(insn).commutative) {
+ if (insn->op != OP_SET && insn->op != OP_SLCT &&
+ insn->op != OP_SUB && insn->op != OP_XMAD)
return;
+ // XMAD is only commutative if neither the CBCC nor the MRG flag is set.
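+ // Both modes read extra data from src(1): CBCC adds src(1)'s low half
+ // shifted left by 16, and MRG writes src(1)'s low half into the high half
+ // of the result, so the sources cannot simply be exchanged.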
+ if (insn->op == OP_XMAD &&
+ (insn->subOp & NV50_IR_SUBOP_XMAD_CMODE_MASK) == NV50_IR_SUBOP_XMAD_CBCC)
+ return;
+ if (insn->op == OP_XMAD && (insn->subOp & NV50_IR_SUBOP_XMAD_MRG))
+ return;
+ }
if (insn->src(1).getFile() != FILE_GPR)
return;
// This is the special OP_SET used for alphatesting, we can't reverse its
// arguments as that will confuse the fixup code.
if (insn->op == OP_SUB) {
insn->src(0).mod = insn->src(0).mod ^ Modifier(NV50_IR_MOD_NEG);
insn->src(1).mod = insn->src(1).mod ^ Modifier(NV50_IR_MOD_NEG);
+ } else
+ if (insn->op == OP_XMAD) {
+ // swap h1 flags
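+ // (H1(i) selects the high 16 bits of src(i), so the flags must follow
+ // their sources)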
+ uint16_t h1 = (insn->subOp >> 1 & NV50_IR_SUBOP_XMAD_H1(0)) |
+ (insn->subOp << 1 & NV50_IR_SUBOP_XMAD_H1(1));
+ insn->subOp = (insn->subOp & ~NV50_IR_SUBOP_XMAD_H1_MASK) | h1;
}
}
{
private:
virtual bool visit(BasicBlock *);
+
+ BuildUtil bld;
};
bool
for (Instruction *i = bb->getEntry(); i; i = next) {
next = i->next;
+ bld.setPosition(i, false);
+
for (int s = 0; i->srcExists(s); ++s) {
Instruction *insn;
ImmediateValue imm;
i->setIndirect(s, 0, NULL);
i->setSrc(s, cloneShallow(func, i->getSrc(s)));
i->src(s).get()->reg.data.offset += imm.reg.data.u32;
+ } else if (insn->op == OP_SHLADD) {
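+ // the address is (a << b) + imm: fold imm into the load's offset and
+ // use the shift result as the indirect source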
+ if (!insn->src(2).getImmediate(imm) ||
+ !targ->insnCanLoadOffset(i, s, imm.reg.data.s32))
+ continue;
+ i->setIndirect(s, 0, bld.mkOp2v(
+ OP_SHL, TYPE_U32, bld.getSSA(), insn->getSrc(0), insn->getSrc(1)));
+ i->setSrc(s, cloneShallow(func, i->getSrc(s)));
+ i->src(s).get()->reg.data.offset += imm.reg.data.u32;
}
}
}
void expr(Instruction *, ImmediateValue&, ImmediateValue&);
void expr(Instruction *, ImmediateValue&, ImmediateValue&, ImmediateValue&);
- void opnd(Instruction *, ImmediateValue&, int s);
+ /* true if i was deleted */
+ bool opnd(Instruction *i, ImmediateValue&, int s);
void opnd3(Instruction *, ImmediateValue&);
void unary(Instruction *, const ImmediateValue&);
CmpInstruction *findOriginForTestWithZero(Value *);
+ bool createMul(DataType ty, Value *def, Value *a, int64_t b, Value *c);
+
unsigned int foldCount;
BuildUtil bld;
if (i->srcExists(2) &&
i->src(0).getImmediate(src0) &&
i->src(1).getImmediate(src1) &&
- i->src(2).getImmediate(src2))
+ i->src(2).getImmediate(src2)) {
expr(i, src0, src1, src2);
- else
+ } else
if (i->srcExists(1) &&
- i->src(0).getImmediate(src0) && i->src(1).getImmediate(src1))
+ i->src(0).getImmediate(src0) && i->src(1).getImmediate(src1)) {
expr(i, src0, src1);
- else
- if (i->srcExists(0) && i->src(0).getImmediate(src0))
- opnd(i, src0, 0);
- else
- if (i->srcExists(1) && i->src(1).getImmediate(src1))
- opnd(i, src1, 1);
+ } else
+ if (i->srcExists(0) && i->src(0).getImmediate(src0)) {
+ if (opnd(i, src0, 0))
+ continue;
+ } else
+ if (i->srcExists(1) && i->src(1).getImmediate(src1)) {
+ if (opnd(i, src1, 1))
+ continue;
+ }
if (i->srcExists(2) && i->src(2).getImmediate(src2))
opnd3(i, src2);
}
// restrictions, so move it into a separate LValue.
bld.setPosition(i, false);
i->op = OP_ADD;
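+ // dnz (flush denormals to zero) only applies to multiplications; clear
+ // it now that the instruction became an ADD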
+ i->dnz = 0;
i->setSrc(1, bld.mkMov(bld.getSSA(type), i->getSrc(0), type)->getDef(0));
i->setSrc(0, i->getSrc(2));
i->src(0).mod = i->src(2).mod;
}
}
-void
-ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
+bool
+ConstantFolding::createMul(DataType ty, Value *def, Value *a, int64_t b, Value *c)
{
const Target *target = prog->getTarget();
+ int64_t absB = llabs(b);
+
+ //a * (2^shl) -> a << shl
+ if (b >= 0 && util_is_power_of_two_or_zero64(b)) {
+ int shl = util_logbase2_64(b);
+
+ Value *res = c ? bld.getSSA(typeSizeof(ty)) : def;
+ bld.mkOp2(OP_SHL, ty, res, a, bld.mkImm(shl));
+ if (c)
+ bld.mkOp2(OP_ADD, ty, def, res, c);
+
+ return true;
+ }
+
+ //a * (2^shl + 1) -> a << shl + a
+ //a * -(2^shl + 1) -> -a << shl - a
+ //a * (2^shl - 1) -> a << shl - a
+ //a * -(2^shl - 1) -> -a << shl + a
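+ //e.g. a * 7 = (a << 3) - a, a * -7 = (-a << 3) + a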
+ if (typeSizeof(ty) == 4 &&
+ (util_is_power_of_two_or_zero64(absB - 1) ||
+ util_is_power_of_two_or_zero64(absB + 1)) &&
+ target->isOpSupported(OP_SHLADD, TYPE_U32)) {
+ bool subA = util_is_power_of_two_or_zero64(absB + 1);
+ int shl = subA ? util_logbase2_64(absB + 1) : util_logbase2_64(absB - 1);
+
+ Value *res = c ? bld.getSSA() : def;
+ Instruction *insn = bld.mkOp3(OP_SHLADD, TYPE_U32, res, a, bld.mkImm(shl), a);
+ if (b < 0)
+ insn->src(0).mod = Modifier(NV50_IR_MOD_NEG);
+ if (subA ^ (b < 0))
+ insn->src(2).mod = Modifier(NV50_IR_MOD_NEG);
+
+ if (c)
+ bld.mkOp2(OP_ADD, TYPE_U32, def, res, c);
+
+ return true;
+ }
+
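+ // XMAD multiplies 16-bit halves: with b <= 0xffff,
+ // a * b + c = lo16(a) * b + ((hi16(a) * b) << 16) + c.
+ // The first XMAD computes lo16(a) * b + c; the second selects hi16(a) via
+ // H1(0) and shifts its product left by 16 via PSL before adding the rest.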
+ if (typeSizeof(ty) == 4 && b >= 0 && b <= 0xffff &&
+ target->isOpSupported(OP_XMAD, TYPE_U32)) {
+ Value *tmp = bld.mkOp3v(OP_XMAD, TYPE_U32, bld.getSSA(),
+ a, bld.mkImm((uint32_t)b), c ? c : bld.mkImm(0));
+ bld.mkOp3(OP_XMAD, TYPE_U32, def, a, bld.mkImm((uint32_t)b), tmp)->subOp =
+ NV50_IR_SUBOP_XMAD_PSL | NV50_IR_SUBOP_XMAD_H1(0);
+
+ return true;
+ }
+
+ return false;
+}
+
+bool
+ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
+{
const int t = !s;
const operation op = i->op;
Instruction *newi = i;
+ bool deleted = false;
switch (i->op) {
case OP_SPLIT: {
val >>= bitsize;
}
delete_Instruction(prog, i);
+ deleted = true;
break;
}
case OP_MUL:
- if (i->dType == TYPE_F32)
+ if (i->dType == TYPE_F32 && !i->precise)
tryCollapseChainedMULs(i, s, imm0);
if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
newi = bld.mkCmp(OP_SET, CC_LT, TYPE_S32, i->getDef(0),
TYPE_S32, i->getSrc(t), bld.mkImm(0));
delete_Instruction(prog, i);
+ deleted = true;
} else if (imm0.isInteger(0) || imm0.isInteger(1)) {
// The high bits can't be set in this case (either mul by 0 or
// unsigned by 1)
if (imm0.isNegative())
i->src(t).mod = i->src(t).mod ^ Modifier(NV50_IR_MOD_NEG);
i->op = OP_ADD;
+ i->dnz = 0;
i->setSrc(s, i->getSrc(t));
i->src(s).mod = i->src(t).mod;
} else
- if (!isFloatType(i->sType) && !imm0.isNegative() && imm0.isPow2()) {
- i->op = OP_SHL;
- imm0.applyLog2();
- i->setSrc(0, i->getSrc(t));
- i->src(0).mod = i->src(t).mod;
- i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32));
- i->src(1).mod = 0;
+ if (!isFloatType(i->dType) && !i->src(t).mod) {
+ bld.setPosition(i, false);
+ int64_t b = typeSizeof(i->dType) == 8 ? imm0.reg.data.s64 : imm0.reg.data.s32;
+ if (createMul(i->dType, i->getDef(0), i->getSrc(t), b, NULL)) {
+ delete_Instruction(prog, i);
+ deleted = true;
+ }
} else
if (i->postFactor && i->sType == TYPE_F32) {
/* Can't emit a postfactor with an immediate, have to fold it in */
i->setSrc(1, i->getSrc(2));
i->src(1).mod = i->src(2).mod;
i->setSrc(2, NULL);
+ i->dnz = 0;
i->op = OP_ADD;
} else
- if (s == 1 && !imm0.isNegative() && imm0.isPow2() &&
- !isFloatType(i->dType) &&
- target->isOpSupported(OP_SHLADD, i->dType)) {
- i->op = OP_SHLADD;
- imm0.applyLog2();
- i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32));
+ if (!isFloatType(i->dType) && !i->subOp && !i->src(t).mod && !i->src(2).mod) {
+ bld.setPosition(i, false);
+ int64_t b = typeSizeof(i->dType) == 8 ? imm0.reg.data.s64 : imm0.reg.data.s32;
+ if (createMul(i->dType, i->getDef(0), i->getSrc(t), b, i->getSrc(2))) {
+ delete_Instruction(prog, i);
+ deleted = true;
+ }
}
break;
case OP_SUB:
bld.mkOp2(OP_SHR, TYPE_U32, i->getDef(0), tB, bld.mkImm(s));
delete_Instruction(prog, i);
+ deleted = true;
} else
if (imm0.reg.data.s32 == -1) {
i->op = OP_NEG;
bld.mkOp1(OP_NEG, TYPE_S32, i->getDef(0), tB);
delete_Instruction(prog, i);
+ deleted = true;
}
break;
newi = bld.mkOp2(OP_UNION, TYPE_S32, i->getDef(0), v1, v2);
delete_Instruction(prog, i);
+ deleted = true;
}
} else if (s == 1) {
// In this case, we still want the optimized lowering that we get
newi->src(1).mod = Modifier(NV50_IR_MOD_NEG);
delete_Instruction(prog, i);
+ deleted = true;
}
break;
CmpInstruction *si = findOriginForTestWithZero(i->getSrc(t));
CondCode cc, ccZ;
if (imm0.reg.data.u32 != 0 || !si)
- return;
+ return false;
cc = si->setCond;
ccZ = (CondCode)((unsigned int)i->asCmp()->setCond & ~CC_U);
// We do everything assuming var (cmp) 0, reverse the condition if 0 is
// the first arg.
case CC_GT: break; // bool > 0 -- bool
case CC_NE: break; // bool != 0 -- bool
default:
- return;
+ return false;
}
// Update the condition of this SET to be identical to the origin set,
} else if (src->asCmp()) {
CmpInstruction *cmp = src->asCmp();
if (!cmp || cmp->op == OP_SLCT || cmp->getDef(0)->refCount() > 1)
- return;
+ return false;
if (!prog->getTarget()->isOpSupported(cmp->op, TYPE_F32))
- return;
+ return false;
if (imm0.reg.data.f32 != 1.0)
- return;
+ return false;
if (cmp->dType != TYPE_U32)
- return;
+ return false;
cmp->dType = TYPE_F32;
if (i->src(t).mod != Modifier(0)) {
src->op == OP_SHR &&
src->src(1).getImmediate(imm1) &&
i->src(t).mod == Modifier(0) &&
- util_is_power_of_two(imm0.reg.data.u32 + 1)) {
+ util_is_power_of_two_or_zero(imm0.reg.data.u32 + 1)) {
// low byte = offset, high byte = width
uint32_t ext = (util_last_bit(imm0.reg.data.u32) << 8) | imm1.reg.data.u32;
i->op = OP_EXTBF;
} else if (src->op == OP_SHL &&
src->src(1).getImmediate(imm1) &&
i->src(t).mod == Modifier(0) &&
- util_is_power_of_two(~imm0.reg.data.u32 + 1) &&
+ util_is_power_of_two_or_zero(~imm0.reg.data.u32 + 1) &&
util_last_bit(~imm0.reg.data.u32) <= imm1.reg.data.u32) {
i->op = OP_MOV;
i->setSrc(s, NULL);
case OP_MUL:
int muls;
if (isFloatType(si->dType))
- return;
+ return false;
if (si->src(1).getImmediate(imm1))
muls = 1;
else if (si->src(0).getImmediate(imm1))
muls = 0;
else
- return;
+ return false;
bld.setPosition(i, false);
i->op = OP_MUL;
case OP_ADD:
int adds;
if (isFloatType(si->dType))
- return;
+ return false;
if (si->op != OP_SUB && si->src(0).getImmediate(imm1))
adds = 0;
else if (si->src(1).getImmediate(imm1))
adds = 1;
else
- return;
+ return false;
if (si->src(!adds).mod != Modifier(0))
- return;
+ return false;
// SHL(ADD(x, y), z) = ADD(SHL(x, z), SHL(y, z))
// This is more operations, but if one of x, y is an immediate, then
bld.mkImm(imm0.reg.data.u32)));
break;
default:
- return;
+ return false;
}
}
break;
case TYPE_S32: res = util_last_bit_signed(imm0.reg.data.s32) - 1; break;
case TYPE_U32: res = util_last_bit(imm0.reg.data.u32) - 1; break;
default:
- return;
+ return false;
}
if (i->subOp == NV50_IR_SUBOP_BFIND_SAMT && res >= 0)
res = 31 - res;
// TODO: handle 64-bit values properly
if (typeSizeof(i->dType) == 8 || typeSizeof(i->sType) == 8)
- return;
+ return false;
// TODO: handle single byte/word extractions
if (i->subOp)
- return;
+ return false;
bld.setPosition(i, true); /* make sure bld is init'ed */
CLAMP(imm0.reg.data.u16, umin, umax) : \
imm0.reg.data.u16; \
break; \
- default: return; \
+ default: return false; \
} \
i->setSrc(0, bld.mkImm(res.data.dst)); \
break
case TYPE_S16: res.data.f32 = (float) imm0.reg.data.s16; break;
case TYPE_S32: res.data.f32 = (float) imm0.reg.data.s32; break;
default:
- return;
+ return false;
}
i->setSrc(0, bld.mkImm(res.data.f32));
break;
case TYPE_S16: res.data.f64 = (double) imm0.reg.data.s16; break;
case TYPE_S32: res.data.f64 = (double) imm0.reg.data.s32; break;
default:
- return;
+ return false;
}
i->setSrc(0, bld.mkImm(res.data.f64));
break;
default:
- return;
+ return false;
}
#undef CASE
break;
}
default:
- return;
+ return false;
}
// This can get left behind some of the optimizations which simplify
if (newi->op != op)
foldCount++;
+ return deleted;
}
// =============================================================================
// SLCT(a, b, const) -> cc(const) ? a : b
// RCP(RCP(a)) -> a
// MUL(MUL(a, b), const) -> MUL_Xconst(a, b)
+// EXTBF(RDSV(COMBINED_TID)) -> RDSV(TID)
class AlgebraicOpt : public Pass
{
private:
void handleCVT_EXTBF(Instruction *);
void handleSUCLAMP(Instruction *);
void handleNEG(Instruction *);
+ void handleEXTBF_RDSV(Instruction *);
BuildUtil bld;
};
if (minmax->src(0).mod == minmax->src(1).mod) {
if (minmax->def(0).mayReplace(minmax->src(0))) {
minmax->def(0).replace(minmax->src(0), false);
- minmax->bb->remove(minmax);
+ delete_Instruction(prog, minmax);
} else {
minmax->op = OP_CVT;
minmax->setSrc(1, NULL);
}
}
+// rcp(rcp(a)) = a
+// rcp(sqrt(a)) = rsq(a)
void
AlgebraicOpt::handleRCP(Instruction *rcp)
{
Instruction *si = rcp->getSrc(0)->getUniqueInsn();
- if (si && si->op == OP_RCP) {
+ if (!si)
+ return;
+
+ if (si->op == OP_RCP) {
Modifier mod = rcp->src(0).mod * si->src(0).mod;
rcp->op = mod.getOp();
rcp->setSrc(0, si->getSrc(0));
+ } else if (si->op == OP_SQRT) {
+ rcp->op = OP_RSQ;
+ rcp->setSrc(0, si->getSrc(0));
+ rcp->src(0).mod = rcp->src(0).mod * si->src(0).mod;
}
}
}
}
+// EXTBF(RDSV(COMBINED_TID)) -> RDSV(TID)
+void
+AlgebraicOpt::handleEXTBF_RDSV(Instruction *i)
+{
+ Instruction *rdsv = i->getSrc(0)->getUniqueInsn();
+ if (!rdsv || rdsv->op != OP_RDSV ||
+ rdsv->getSrc(0)->asSym()->reg.data.sv.sv != SV_COMBINED_TID)
+ return;
+ // Avoid creating more RDSV instructions
+ if (rdsv->getDef(0)->refCount() > 1)
+ return;
+
+ ImmediateValue imm;
+ if (!i->src(1).getImmediate(imm))
+ return;
+
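+ // src(1) of EXTBF encodes (width << 8) | offset; SV_COMBINED_TID packs
+ // TID.X in bits 0..15, TID.Y in bits 16..25 and TID.Z in bits 26..31,
+ // so each of these masks extracts exactly one component.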
+ int index;
+ if (imm.isInteger(0x1000))
+ index = 0;
+ else
+ if (imm.isInteger(0x0a10))
+ index = 1;
+ else
+ if (imm.isInteger(0x061a))
+ index = 2;
+ else
+ return;
+
+ bld.setPosition(i, false);
+
+ i->op = OP_RDSV;
+ i->setSrc(0, bld.mkSysVal(SV_TID, index));
+ i->setSrc(1, NULL);
+}
+
bool
AlgebraicOpt::visit(BasicBlock *bb)
{
case OP_NEG:
handleNEG(i);
break;
+ case OP_EXTBF:
+ handleEXTBF_RDSV(i);
+ break;
default:
break;
}
// =============================================================================
// ADD(SHL(a, b), c) -> SHLADD(a, b, c)
+// MUL(a, b) -> a few XMADs
+// MAD/FMA(a, b, c) -> a few XMADs
class LateAlgebraicOpt : public Pass
{
private:
virtual bool visit(Instruction *);
void handleADD(Instruction *);
+ void handleMULMAD(Instruction *);
bool tryADDToSHLADD(Instruction *);
+
+ BuildUtil bld;
};
void
return true;
}
+// MUL(a, b) -> a few XMADs
+// MAD/FMA(a, b, c) -> a few XMADs
+void
+LateAlgebraicOpt::handleMULMAD(Instruction *i)
+{
+ // TODO: handle NV50_IR_SUBOP_MUL_HIGH
+ if (!prog->getTarget()->isOpSupported(OP_XMAD, TYPE_U32))
+ return;
+ if (isFloatType(i->dType) || typeSizeof(i->dType) != 4)
+ return;
+ if (i->subOp || i->usesFlags() || i->flagsDef >= 0)
+ return;
+
+ assert(!i->src(0).mod);
+ assert(!i->src(1).mod);
+ assert(i->op == OP_MUL || !i->src(2).mod);
+
+ bld.setPosition(i, false);
+
+ Value *a = i->getSrc(0);
+ Value *b = i->getSrc(1);
+ Value *c = i->op == OP_MUL ? bld.mkImm(0) : i->getSrc(2);
+
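+ // 32x32-bit multiply from 16x16-bit XMADs (bits above 2^32 are discarded):
+ //   a * b + c = lo(b)*lo(a) + c + ((lo(b)*hi(a) + hi(b)*lo(a)) << 16)
+ // tmp0 = lo(b)*lo(a) + c
+ // tmp1 = XMAD.MRG(b, a.H1, 0) = (lo(a) << 16) | lo16(lo(b)*hi(a))
+ // res  = XMAD.PSL.CBCC(b.H1, tmp1.H1, tmp0)
+ //      = ((hi(b)*lo(a) + lo16(tmp1)) << 16) + tmp0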
+ Value *tmp0 = bld.getSSA();
+ Value *tmp1 = bld.getSSA();
+
+ Instruction *insn = bld.mkOp3(OP_XMAD, TYPE_U32, tmp0, b, a, c);
+ insn->setPredicate(i->cc, i->getPredicate());
+
+ insn = bld.mkOp3(OP_XMAD, TYPE_U32, tmp1, b, a, bld.mkImm(0));
+ insn->setPredicate(i->cc, i->getPredicate());
+ insn->subOp = NV50_IR_SUBOP_XMAD_MRG | NV50_IR_SUBOP_XMAD_H1(1);
+
+ Value *pred = i->getPredicate();
+ i->setPredicate(i->cc, NULL);
+
+ i->op = OP_XMAD;
+ i->setSrc(0, b);
+ i->setSrc(1, tmp1);
+ i->setSrc(2, tmp0);
+ i->subOp = NV50_IR_SUBOP_XMAD_PSL | NV50_IR_SUBOP_XMAD_CBCC;
+ i->subOp |= NV50_IR_SUBOP_XMAD_H1(0) | NV50_IR_SUBOP_XMAD_H1(1);
+
+ i->setPredicate(i->cc, pred);
+}
+
bool
LateAlgebraicOpt::visit(Instruction *i)
{
case OP_ADD:
handleADD(i);
break;
+ case OP_MUL:
+ case OP_MAD:
+ case OP_FMA:
+ handleMULMAD(i);
+ break;
default:
break;
}
} else
if (this->asFlow()) {
return false;
+ } else
+ if (this->op == OP_PHI && this->bb != that->bb) {
+ /* TODO: we could probably be a bit smarter here by following the
+ * control flow, but honestly, it is quite painful to check */
+ return false;
} else {
if (this->ipa != that->ipa ||
this->lanes != that->lanes ||
break;
}
if (!phi->srcExists(s)) {
+ assert(ik->op != OP_PHI);
Instruction *entry = bb->getEntry();
ik->bb->remove(ik);
if (!entry || entry->op != OP_JOIN)
RUN_PASS(2, AlgebraicOpt, run);
RUN_PASS(2, ModifierFolding, run); // before load propagation -> less checks
RUN_PASS(1, ConstantFolding, foldAll);
- RUN_PASS(1, Split64BitOpPreRA, run);
+ RUN_PASS(0, Split64BitOpPreRA, run);
+ RUN_PASS(2, LateAlgebraicOpt, run);
RUN_PASS(1, LoadPropagation, run);
RUN_PASS(1, IndirectPropagation, run);
RUN_PASS(2, MemoryOpt, run);
- RUN_PASS(2, LateAlgebraicOpt, run);
RUN_PASS(2, LocalCSE, run);
RUN_PASS(0, DeadCodeElim, buryAll);