void checkSwapSrc01(Instruction *);
bool isCSpaceLoad(Instruction *);
- bool isImmd32Load(Instruction *);
+ bool isImmdLoad(Instruction *);
bool isAttribOrSharedLoad(Instruction *);
};
}
bool
-LoadPropagation::isImmd32Load(Instruction *ld)
+LoadPropagation::isImmdLoad(Instruction *ld)
{
- if (!ld || (ld->op != OP_MOV) || (typeSizeof(ld->dType) != 4))
+ if (!ld || (ld->op != OP_MOV) ||
+ ((typeSizeof(ld->dType) != 4) && (typeSizeof(ld->dType) != 8)))
return false;
- return ld->src(0).getFile() == FILE_IMMEDIATE;
+
+ // A 0 can be replaced with a register, so it doesn't count as an immediate.
+ ImmediateValue val;
+ return ld->src(0).getImmediate(val) && !val.isInteger(0);
}
bool
void
LoadPropagation::checkSwapSrc01(Instruction *insn)
{
- if (!prog->getTarget()->getOpInfo(insn).commutative)
- if (insn->op != OP_SET && insn->op != OP_SLCT)
+ const Target *targ = prog->getTarget();
+ if (!targ->getOpInfo(insn).commutative)
+ if (insn->op != OP_SET && insn->op != OP_SLCT && insn->op != OP_SUB)
return;
if (insn->src(1).getFile() != FILE_GPR)
return;
+ // This is the special OP_SET used for alphatesting, we can't reverse its
+ // arguments as that will confuse the fixup code.
+ if (insn->op == OP_SET && insn->subOp)
+ return;
Instruction *i0 = insn->getSrc(0)->getInsn();
Instruction *i1 = insn->getSrc(1)->getInsn();
- if (isCSpaceLoad(i0)) {
- if (!isCSpaceLoad(i1))
- insn->swapSources(0, 1);
- else
- return;
- } else
- if (isImmd32Load(i0)) {
- if (!isCSpaceLoad(i1) && !isImmd32Load(i1))
+ // Swap sources to inline the less frequently used source. That way,
+ // optimistically, it will eventually be able to remove the instruction.
+ int i0refs = insn->getSrc(0)->refCount();
+ int i1refs = insn->getSrc(1)->refCount();
+
+ if ((isCSpaceLoad(i0) || isImmdLoad(i0)) && targ->insnCanLoad(insn, 1, i0)) {
+ if ((!isImmdLoad(i1) && !isCSpaceLoad(i1)) ||
+ !targ->insnCanLoad(insn, 1, i1) ||
+ i0refs < i1refs)
insn->swapSources(0, 1);
else
return;
else
if (insn->op == OP_SLCT)
insn->asCmp()->setCond = inverseCondCode(insn->asCmp()->setCond);
+ else
+ if (insn->op == OP_SUB) {
+ insn->src(0).mod = insn->src(0).mod ^ Modifier(NV50_IR_MOD_NEG);
+ insn->src(1).mod = insn->src(1).mod ^ Modifier(NV50_IR_MOD_NEG);
+ }
}
bool
if (i->op == OP_CALL) // calls have args as sources, they must be in regs
continue;
+ if (i->op == OP_PFETCH) // pfetch expects arg1 to be a reg
+ continue;
+
if (i->srcExists(1))
checkSwapSrc01(i);
// =============================================================================
+class IndirectPropagation : public Pass
+{
+private:
+ virtual bool visit(BasicBlock *);
+};
+
+bool
+IndirectPropagation::visit(BasicBlock *bb)
+{
+ const Target *targ = prog->getTarget();
+ Instruction *next;
+
+ for (Instruction *i = bb->getEntry(); i; i = next) {
+ next = i->next;
+
+ for (int s = 0; i->srcExists(s); ++s) {
+ Instruction *insn;
+ ImmediateValue imm;
+ if (!i->src(s).isIndirect(0))
+ continue;
+ insn = i->getIndirect(s, 0)->getInsn();
+ if (!insn)
+ continue;
+ if (insn->op == OP_ADD && !isFloatType(insn->dType)) {
+ if (insn->src(0).getFile() != targ->nativeFile(FILE_ADDRESS) ||
+ !insn->src(1).getImmediate(imm) ||
+ !targ->insnCanLoadOffset(i, s, imm.reg.data.s32))
+ continue;
+ i->setIndirect(s, 0, insn->getSrc(0));
+ i->setSrc(s, cloneShallow(func, i->getSrc(s)));
+ i->src(s).get()->reg.data.offset += imm.reg.data.u32;
+ } else if (insn->op == OP_SUB && !isFloatType(insn->dType)) {
+ if (insn->src(0).getFile() != targ->nativeFile(FILE_ADDRESS) ||
+ !insn->src(1).getImmediate(imm) ||
+ !targ->insnCanLoadOffset(i, s, -imm.reg.data.s32))
+ continue;
+ i->setIndirect(s, 0, insn->getSrc(0));
+ i->setSrc(s, cloneShallow(func, i->getSrc(s)));
+ i->src(s).get()->reg.data.offset -= imm.reg.data.u32;
+ } else if (insn->op == OP_MOV) {
+ if (!insn->src(0).getImmediate(imm) ||
+ !targ->insnCanLoadOffset(i, s, imm.reg.data.s32))
+ continue;
+ i->setIndirect(s, 0, NULL);
+ i->setSrc(s, cloneShallow(func, i->getSrc(s)));
+ i->src(s).get()->reg.data.offset += imm.reg.data.u32;
+ }
+ }
+ }
+ return true;
+}
+
+// =============================================================================
+
// Evaluate constant expressions.
class ConstantFolding : public Pass
{
void expr(Instruction *, ImmediateValue&, ImmediateValue&);
void expr(Instruction *, ImmediateValue&, ImmediateValue&, ImmediateValue&);
void opnd(Instruction *, ImmediateValue&, int s);
+ void opnd3(Instruction *, ImmediateValue&);
void unary(Instruction *, const ImmediateValue&);
void tryCollapseChainedMULs(Instruction *, const int s, ImmediateValue&);
- // TGSI 'true' is converted to -1 by F2I(NEG(SET)), track back to SET
CmpInstruction *findOriginForTestWithZero(Value *);
unsigned int foldCount;
else
if (i->srcExists(1) && i->src(1).getImmediate(src1))
opnd(i, src1, 1);
+ if (i->srcExists(2) && i->src(2).getImmediate(src2))
+ opnd3(i, src2);
}
return true;
}
return NULL;
Instruction *insn = value->getInsn();
- while (insn && insn->op != OP_SET) {
- Instruction *next = NULL;
- switch (insn->op) {
- case OP_NEG:
- case OP_ABS:
- case OP_CVT:
- next = insn->getSrc(0)->getInsn();
- if (insn->sType != next->dType)
+ if (insn->asCmp() && insn->op != OP_SLCT)
+ return insn->asCmp();
+
+ /* Sometimes mov's will sneak in as a result of other folding. This gets
+ * cleaned up later.
+ */
+ if (insn->op == OP_MOV)
+ return findOriginForTestWithZero(insn->getSrc(0));
+
+ /* Deal with AND 1.0 here since nv50 can't fold into boolean float */
+ if (insn->op == OP_AND) {
+ int s = 0;
+ ImmediateValue imm;
+ if (!insn->src(s).getImmediate(imm)) {
+ s = 1;
+ if (!insn->src(s).getImmediate(imm))
return NULL;
- break;
- case OP_MOV:
- next = insn->getSrc(0)->getInsn();
- break;
- default:
- return NULL;
}
- insn = next;
+ if (imm.reg.data.f32 != 1.0f)
+ return NULL;
+ /* TODO: Come up with a way to handle the condition being inverted */
+ if (insn->src(!s).mod != Modifier(0))
+ return NULL;
+ return findOriginForTestWithZero(insn->getSrc(!s));
}
- return insn ? insn->asCmp() : NULL;
+
+ return NULL;
}
void
{
struct Storage *const a = &imm0.reg, *const b = &imm1.reg;
struct Storage res;
+ DataType type = i->dType;
memset(&res.data, 0, sizeof(res.data));
return;
}
break;
+ case OP_SUB:
+ switch (i->dType) {
+ case TYPE_F32: res.data.f32 = a->data.f32 - b->data.f32; break;
+ case TYPE_F64: res.data.f64 = a->data.f64 - b->data.f64; break;
+ case TYPE_S32:
+ case TYPE_U32: res.data.u32 = a->data.u32 - b->data.u32; break;
+ default:
+ return;
+ }
+ break;
case OP_POW:
switch (i->dType) {
case TYPE_F32: res.data.f32 = pow(a->data.f32, b->data.f32); break;
case OP_POPCNT:
res.data.u32 = util_bitcount(a->data.u32 & b->data.u32);
break;
+ case OP_PFETCH:
+ // The two arguments to pfetch are logically added together. Normally
+ // the second argument will not be constant, but that can happen.
+ res.data.u32 = a->data.u32 + b->data.u32;
+ type = TYPE_U32;
+ break;
+ case OP_MERGE:
+ switch (i->dType) {
+ case TYPE_U64:
+ case TYPE_S64:
+ case TYPE_F64:
+ res.data.u64 = (((uint64_t)b->data.u32) << 32) | a->data.u32;
+ break;
+ default:
+ return;
+ }
+ break;
default:
return;
}
i->setSrc(1, NULL);
i->getSrc(0)->reg.data = res.data;
+ i->getSrc(0)->reg.type = type;
+ i->getSrc(0)->reg.size = typeSizeof(type);
- if (i->op == OP_MAD || i->op == OP_FMA) {
- i->op = OP_ADD;
+ switch (i->op) {
+ case OP_MAD:
+ case OP_FMA: {
+ ImmediateValue src0, src1 = *i->getSrc(0)->asImm();
- i->setSrc(1, i->getSrc(0));
- i->src(1).mod = i->src(2).mod;
+ // Move the immediate into position 1, where we know it might be
+ // emittable. However it might not be anyways, as there may be other
+ // restrictions, so move it into a separate LValue.
+ bld.setPosition(i, false);
+ i->op = OP_ADD;
+ i->setSrc(1, bld.mkMov(bld.getSSA(type), i->getSrc(0), type)->getDef(0));
i->setSrc(0, i->getSrc(2));
+ i->src(0).mod = i->src(2).mod;
i->setSrc(2, NULL);
- ImmediateValue src0;
if (i->src(0).getImmediate(src0))
- expr(i, src0, *i->getSrc(1)->asImm());
- if (i->saturate && !prog->getTarget()->isSatSupported(i)) {
- bld.setPosition(i, false);
- i->setSrc(1, bld.loadImm(NULL, res.data.u32));
- }
- } else {
+ expr(i, src0, src1);
+ else
+ opnd(i, src1, 1);
+ break;
+ }
+ case OP_PFETCH:
+ // Leave PFETCH alone... we just folded its 2 args into 1.
+ break;
+ default:
i->op = i->saturate ? OP_SAT : OP_MOV; /* SAT handled by unary() */
+ break;
}
i->subOp = 0;
}
res.data.u32 = ((a->data.u32 << offset) & bitmask) | (c->data.u32 & ~bitmask);
break;
}
+ case OP_MAD:
+ case OP_FMA: {
+ switch (i->dType) {
+ case TYPE_F32:
+ res.data.f32 = a->data.f32 * b->data.f32 * exp2f(i->postFactor) +
+ c->data.f32;
+ break;
+ case TYPE_F64:
+ res.data.f64 = a->data.f64 * b->data.f64 + c->data.f64;
+ break;
+ case TYPE_S32:
+ if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
+ res.data.s32 = ((int64_t)a->data.s32 * b->data.s32 >> 32) + c->data.s32;
+ break;
+ }
+ /* fallthrough */
+ case TYPE_U32:
+ if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) {
+ res.data.u32 = ((uint64_t)a->data.u32 * b->data.u32 >> 32) + c->data.u32;
+ break;
+ }
+ res.data.u32 = a->data.u32 * b->data.u32 + c->data.u32;
+ break;
+ default:
+ return;
+ }
+ break;
+ }
+ case OP_SHLADD:
+ res.data.u32 = (a->data.u32 << b->data.u32) + c->data.u32;
+ break;
default:
return;
}
i->setSrc(2, NULL);
i->getSrc(0)->reg.data = res.data;
+ i->getSrc(0)->reg.type = i->dType;
+ i->getSrc(0)->reg.size = typeSizeof(i->dType);
i->op = OP_MOV;
}
}
}
+void
+ConstantFolding::opnd3(Instruction *i, ImmediateValue &imm2)
+{
+ switch (i->op) {
+ case OP_MAD:
+ case OP_FMA:
+ if (imm2.isInteger(0)) {
+ i->op = OP_MUL;
+ i->setSrc(2, NULL);
+ foldCount++;
+ return;
+ }
+ break;
+ case OP_SHLADD:
+ if (imm2.isInteger(0)) {
+ i->op = OP_SHL;
+ i->setSrc(2, NULL);
+ foldCount++;
+ return;
+ }
+ break;
+ default:
+ return;
+ }
+}
+
void
ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
{
+ const Target *target = prog->getTarget();
const int t = !s;
const operation op = i->op;
Instruction *newi = i;
i->src(0).mod = i->src(t).mod;
i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32));
i->src(1).mod = 0;
+ } else
+ if (i->postFactor && i->sType == TYPE_F32) {
+ /* Can't emit a postfactor with an immediate, have to fold it in */
+ i->setSrc(s, new_ImmediateValue(
+ prog, imm0.reg.data.f32 * exp2f(i->postFactor)));
+ i->postFactor = 0;
}
break;
case OP_MAD:
if (i->op != OP_CVT)
i->src(0).mod = 0;
} else
- if (imm0.isInteger(1) || imm0.isInteger(-1)) {
+ if (i->subOp != NV50_IR_SUBOP_MUL_HIGH &&
+ (imm0.isInteger(1) || imm0.isInteger(-1))) {
if (imm0.isNegative())
i->src(t).mod = i->src(t).mod ^ Modifier(NV50_IR_MOD_NEG);
if (s == 0) {
i->src(1).mod = i->src(2).mod;
i->setSrc(2, NULL);
i->op = OP_ADD;
+ } else
+ if (s == 1 && !imm0.isNegative() && imm0.isPow2() &&
+ target->isOpSupported(i->op, i->dType)) {
+ i->op = OP_SHLADD;
+ imm0.applyLog2();
+ i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32));
}
break;
case OP_ADD:
case OP_SET: // TODO: SET_AND,OR,XOR
{
+ /* This optimizes the case where the output of a set is being compared
+ * to zero. Since the set can only produce 0/-1 (int) or 0/1 (float), we
+ * can be a lot cleverer in our comparison.
+ */
CmpInstruction *si = findOriginForTestWithZero(i->getSrc(t));
CondCode cc, ccZ;
- if (i->src(t).mod != Modifier(0))
- return;
- if (imm0.reg.data.u32 != 0 || !si || si->op != OP_SET)
+ if (imm0.reg.data.u32 != 0 || !si)
return;
cc = si->setCond;
ccZ = (CondCode)((unsigned int)i->asCmp()->setCond & ~CC_U);
+ // We do everything assuming var (cmp) 0, reverse the condition if 0 is
+ // first.
if (s == 0)
ccZ = reverseCondCode(ccZ);
+ // If there is a negative modifier, we need to undo that, by flipping
+ // the comparison to zero.
+ if (i->src(t).mod.neg())
+ ccZ = reverseCondCode(ccZ);
+ // If this is a signed comparison, we expect the input to be a regular
+ // boolean, i.e. 0/-1. However the rest of the logic assumes that true
+ // is positive, so just flip the sign.
+ if (i->sType == TYPE_S32) {
+ assert(!isFloatType(si->dType));
+ ccZ = reverseCondCode(ccZ);
+ }
switch (ccZ) {
- case CC_LT: cc = CC_FL; break;
- case CC_GE: cc = CC_TR; break;
- case CC_EQ: cc = inverseCondCode(cc); break;
- case CC_LE: cc = inverseCondCode(cc); break;
- case CC_GT: break;
- case CC_NE: break;
+ case CC_LT: cc = CC_FL; break; // bool < 0 -- this is never true
+ case CC_GE: cc = CC_TR; break; // bool >= 0 -- this is always true
+ case CC_EQ: cc = inverseCondCode(cc); break; // bool == 0 -- !bool
+ case CC_LE: cc = inverseCondCode(cc); break; // bool <= 0 -- !bool
+ case CC_GT: break; // bool > 0 -- bool
+ case CC_NE: break; // bool != 0 -- bool
default:
return;
}
+
+ // Update the condition of this SET to be identical to the origin set,
+ // but with the updated condition code. The original SET should get
+ // DCE'd, ideally.
+ i->op = si->op;
i->asCmp()->setCond = cc;
i->setSrc(0, si->src(0));
i->setSrc(1, si->src(1));
+ if (si->srcExists(2))
+ i->setSrc(2, si->src(2));
i->sType = si->sType;
}
break;
case OP_AND:
{
- CmpInstruction *cmp = i->getSrc(t)->getInsn()->asCmp();
- if (!cmp || cmp->op == OP_SLCT || cmp->getDef(0)->refCount() > 1)
- return;
- if (!prog->getTarget()->isOpSupported(cmp->op, TYPE_F32))
- return;
- if (imm0.reg.data.f32 != 1.0)
- return;
- if (i->getSrc(t)->getInsn()->dType != TYPE_U32)
- return;
+ Instruction *src = i->getSrc(t)->getInsn();
+ ImmediateValue imm1;
+ if (imm0.reg.data.u32 == 0) {
+ i->op = OP_MOV;
+ i->setSrc(0, new_ImmediateValue(prog, 0u));
+ i->src(0).mod = Modifier(0);
+ i->setSrc(1, NULL);
+ } else if (imm0.reg.data.u32 == ~0U) {
+ i->op = i->src(t).mod.getOp();
+ if (t) {
+ i->setSrc(0, i->getSrc(t));
+ i->src(0).mod = i->src(t).mod;
+ }
+ i->setSrc(1, NULL);
+ } else if (src->asCmp()) {
+ CmpInstruction *cmp = src->asCmp();
+ if (!cmp || cmp->op == OP_SLCT || cmp->getDef(0)->refCount() > 1)
+ return;
+ if (!prog->getTarget()->isOpSupported(cmp->op, TYPE_F32))
+ return;
+ if (imm0.reg.data.f32 != 1.0)
+ return;
+ if (cmp->dType != TYPE_U32)
+ return;
- i->getSrc(t)->getInsn()->dType = TYPE_F32;
- if (i->src(t).mod != Modifier(0)) {
- assert(i->src(t).mod == Modifier(NV50_IR_MOD_NOT));
- i->src(t).mod = Modifier(0);
- cmp->setCond = inverseCondCode(cmp->setCond);
- }
- i->op = OP_MOV;
- i->setSrc(s, NULL);
- if (t) {
- i->setSrc(0, i->getSrc(t));
- i->setSrc(t, NULL);
+ cmp->dType = TYPE_F32;
+ if (i->src(t).mod != Modifier(0)) {
+ assert(i->src(t).mod == Modifier(NV50_IR_MOD_NOT));
+ i->src(t).mod = Modifier(0);
+ cmp->setCond = inverseCondCode(cmp->setCond);
+ }
+ i->op = OP_MOV;
+ i->setSrc(s, NULL);
+ if (t) {
+ i->setSrc(0, i->getSrc(t));
+ i->setSrc(t, NULL);
+ }
+ } else if (prog->getTarget()->isOpSupported(OP_EXTBF, TYPE_U32) &&
+ src->op == OP_SHR &&
+ src->src(1).getImmediate(imm1) &&
+ i->src(t).mod == Modifier(0) &&
+ util_is_power_of_two(imm0.reg.data.u32 + 1)) {
+ // low byte = offset, high byte = width
+ uint32_t ext = (util_last_bit(imm0.reg.data.u32) << 8) | imm1.reg.data.u32;
+ i->op = OP_EXTBF;
+ i->setSrc(0, src->getSrc(0));
+ i->setSrc(1, new_ImmediateValue(prog, ext));
}
}
break;
break;
// try to concatenate shifts
Instruction *si = i->getSrc(0)->getInsn();
- if (!si || si->op != OP_SHL)
+ if (!si)
break;
ImmediateValue imm1;
- if (si->src(1).getImmediate(imm1)) {
+ switch (si->op) {
+ case OP_SHL:
+ if (si->src(1).getImmediate(imm1)) {
+ bld.setPosition(i, false);
+ i->setSrc(0, si->getSrc(0));
+ i->setSrc(1, bld.loadImm(NULL, imm0.reg.data.u32 + imm1.reg.data.u32));
+ }
+ break;
+ case OP_SHR:
+ if (si->src(1).getImmediate(imm1) && imm0.reg.data.u32 == imm1.reg.data.u32) {
+ bld.setPosition(i, false);
+ i->op = OP_AND;
+ i->setSrc(0, si->getSrc(0));
+ i->setSrc(1, bld.loadImm(NULL, ~((1 << imm0.reg.data.u32) - 1)));
+ }
+ break;
+ case OP_MUL:
+ int muls;
+ if (isFloatType(si->dType))
+ return;
+ if (si->src(1).getImmediate(imm1))
+ muls = 1;
+ else if (si->src(0).getImmediate(imm1))
+ muls = 0;
+ else
+ return;
+
+ bld.setPosition(i, false);
+ i->op = OP_MUL;
+ i->setSrc(0, si->getSrc(!muls));
+ i->setSrc(1, bld.loadImm(NULL, imm1.reg.data.u32 << imm0.reg.data.u32));
+ break;
+ case OP_SUB:
+ case OP_ADD:
+ int adds;
+ if (isFloatType(si->dType))
+ return;
+ if (si->op != OP_SUB && si->src(0).getImmediate(imm1))
+ adds = 0;
+ else if (si->src(1).getImmediate(imm1))
+ adds = 1;
+ else
+ return;
+ if (si->src(!adds).mod != Modifier(0))
+ return;
+ // SHL(ADD(x, y), z) = ADD(SHL(x, z), SHL(y, z))
+
+ // This is more operations, but if one of x, y is an immediate, then
+ // we can get a situation where (a) we can use ISCADD, or (b)
+ // propagate the add bit into an indirect load.
bld.setPosition(i, false);
- i->setSrc(0, si->getSrc(0));
- i->setSrc(1, bld.loadImm(NULL, imm0.reg.data.u32 + imm1.reg.data.u32));
+ i->op = si->op;
+ i->setSrc(adds, bld.loadImm(NULL, imm1.reg.data.u32 << imm0.reg.data.u32));
+ i->setSrc(!adds, bld.mkOp2v(OP_SHL, i->dType,
+ bld.getSSA(i->def(0).getSize(), i->def(0).getFile()),
+ si->getSrc(!adds),
+ bld.mkImm(imm0.reg.data.u32)));
+ break;
+ default:
+ return;
}
}
break;
i->op = OP_MOV;
break;
}
+ case OP_CVT: {
+ Storage res;
+
+ // TODO: handle 64-bit values properly
+ if (typeSizeof(i->dType) == 8 || typeSizeof(i->sType) == 8)
+ return;
+
+ // TODO: handle single byte/word extractions
+ if (i->subOp)
+ return;
+
+ bld.setPosition(i, true); /* make sure bld is init'ed */
+
+#define CASE(type, dst, fmin, fmax, imin, imax, umin, umax) \
+ case type: \
+ switch (i->sType) { \
+ case TYPE_F64: \
+ res.data.dst = util_iround(i->saturate ? \
+ CLAMP(imm0.reg.data.f64, fmin, fmax) : \
+ imm0.reg.data.f64); \
+ break; \
+ case TYPE_F32: \
+ res.data.dst = util_iround(i->saturate ? \
+ CLAMP(imm0.reg.data.f32, fmin, fmax) : \
+ imm0.reg.data.f32); \
+ break; \
+ case TYPE_S32: \
+ res.data.dst = i->saturate ? \
+ CLAMP(imm0.reg.data.s32, imin, imax) : \
+ imm0.reg.data.s32; \
+ break; \
+ case TYPE_U32: \
+ res.data.dst = i->saturate ? \
+ CLAMP(imm0.reg.data.u32, umin, umax) : \
+ imm0.reg.data.u32; \
+ break; \
+ case TYPE_S16: \
+ res.data.dst = i->saturate ? \
+ CLAMP(imm0.reg.data.s16, imin, imax) : \
+ imm0.reg.data.s16; \
+ break; \
+ case TYPE_U16: \
+ res.data.dst = i->saturate ? \
+ CLAMP(imm0.reg.data.u16, umin, umax) : \
+ imm0.reg.data.u16; \
+ break; \
+ default: return; \
+ } \
+ i->setSrc(0, bld.mkImm(res.data.dst)); \
+ break
+
+ switch(i->dType) {
+ CASE(TYPE_U16, u16, 0, UINT16_MAX, 0, UINT16_MAX, 0, UINT16_MAX);
+ CASE(TYPE_S16, s16, INT16_MIN, INT16_MAX, INT16_MIN, INT16_MAX, 0, INT16_MAX);
+ CASE(TYPE_U32, u32, 0, UINT32_MAX, 0, INT32_MAX, 0, UINT32_MAX);
+ CASE(TYPE_S32, s32, INT32_MIN, INT32_MAX, INT32_MIN, INT32_MAX, 0, INT32_MAX);
+ case TYPE_F32:
+ switch (i->sType) {
+ case TYPE_F64:
+ res.data.f32 = i->saturate ?
+ CLAMP(imm0.reg.data.f64, 0.0f, 1.0f) :
+ imm0.reg.data.f64;
+ break;
+ case TYPE_F32:
+ res.data.f32 = i->saturate ?
+ CLAMP(imm0.reg.data.f32, 0.0f, 1.0f) :
+ imm0.reg.data.f32;
+ break;
+ case TYPE_U16: res.data.f32 = (float) imm0.reg.data.u16; break;
+ case TYPE_U32: res.data.f32 = (float) imm0.reg.data.u32; break;
+ case TYPE_S16: res.data.f32 = (float) imm0.reg.data.s16; break;
+ case TYPE_S32: res.data.f32 = (float) imm0.reg.data.s32; break;
+ default:
+ return;
+ }
+ i->setSrc(0, bld.mkImm(res.data.f32));
+ break;
+ case TYPE_F64:
+ switch (i->sType) {
+ case TYPE_F64:
+ res.data.f64 = i->saturate ?
+ CLAMP(imm0.reg.data.f64, 0.0f, 1.0f) :
+ imm0.reg.data.f64;
+ break;
+ case TYPE_F32:
+ res.data.f64 = i->saturate ?
+ CLAMP(imm0.reg.data.f32, 0.0f, 1.0f) :
+ imm0.reg.data.f32;
+ break;
+ case TYPE_U16: res.data.f64 = (double) imm0.reg.data.u16; break;
+ case TYPE_U32: res.data.f64 = (double) imm0.reg.data.u32; break;
+ case TYPE_S16: res.data.f64 = (double) imm0.reg.data.s16; break;
+ case TYPE_S32: res.data.f64 = (double) imm0.reg.data.s32; break;
+ default:
+ return;
+ }
+ i->setSrc(0, bld.mkImm(res.data.f64));
+ break;
+ default:
+ return;
+ }
+#undef CASE
+
+ i->setType(i->dType); /* Remove i->sType, which we don't need anymore */
+ i->op = OP_MOV;
+ i->saturate = 0;
+ i->src(0).mod = Modifier(0); /* Clear the already applied modifier */
+ break;
+ }
default:
return;
}
void handleRCP(Instruction *);
void handleSLCT(Instruction *);
void handleLOGOP(Instruction *);
- void handleCVT(Instruction *);
+ void handleCVT_NEG(Instruction *);
+ void handleCVT_CVT(Instruction *);
+ void handleCVT_EXTBF(Instruction *);
void handleSUCLAMP(Instruction *);
+ void handleNEG(Instruction *);
BuildUtil bld;
};
else
return false;
- if ((src0->getUniqueInsn() && src0->getUniqueInsn()->bb != add->bb) ||
- (src1->getUniqueInsn() && src1->getUniqueInsn()->bb != add->bb))
- return false;
-
src = add->getSrc(s);
- if (src->getInsn()->postFactor)
+ if (src->getUniqueInsn() && src->getUniqueInsn()->bb != add->bb)
return false;
+
+ if (src->getInsn()->saturate || src->getInsn()->postFactor ||
+ src->getInsn()->dnz)
+ return false;
+
if (toOp == OP_SAD) {
ImmediateValue imm;
if (!src->getInsn()->src(2).getImmediate(imm))
return false;
}
+ if (typeSizeof(add->dType) != typeSizeof(src->getInsn()->dType) ||
+ isFloatType(add->dType) != isFloatType(src->getInsn()->dType))
+ return false;
+
mod[0] = add->src(0).mod;
mod[1] = add->src(1).mod;
mod[2] = src->getUniqueInsn()->src(0).mod;
add->op = toOp;
add->subOp = src->getInsn()->subOp; // potentially mul-high
+ add->dType = src->getInsn()->dType; // sign matters for imad hi
+ add->sType = src->getInsn()->sType;
add->setSrc(2, add->src(s ? 0 : 1));
// nv50:
// F2I(NEG(I2F(ABS(SET))))
void
-AlgebraicOpt::handleCVT(Instruction *cvt)
+AlgebraicOpt::handleCVT_NEG(Instruction *cvt)
{
+ Instruction *insn = cvt->getSrc(0)->getInsn();
if (cvt->sType != TYPE_F32 ||
cvt->dType != TYPE_S32 || cvt->src(0).mod != Modifier(0))
return;
- Instruction *insn = cvt->getSrc(0)->getInsn();
if (!insn || insn->op != OP_NEG || insn->dType != TYPE_F32)
return;
if (insn->src(0).mod != Modifier(0))
delete_Instruction(prog, cvt);
}
+// F2I(TRUNC()) and so on can be expressed as a single CVT. If the earlier CVT
+// does a type conversion, this becomes trickier as there might be range
+// changes/etc. We could handle those in theory as long as the range was being
+// reduced or kept the same.
+void
+AlgebraicOpt::handleCVT_CVT(Instruction *cvt)
+{
+ Instruction *insn = cvt->getSrc(0)->getInsn();
+ RoundMode rnd = insn->rnd;
+
+ if (insn->saturate ||
+ insn->subOp ||
+ insn->dType != insn->sType ||
+ insn->dType != cvt->sType)
+ return;
+
+ switch (insn->op) {
+ case OP_CEIL:
+ rnd = ROUND_PI;
+ break;
+ case OP_FLOOR:
+ rnd = ROUND_MI;
+ break;
+ case OP_TRUNC:
+ rnd = ROUND_ZI;
+ break;
+ case OP_CVT:
+ break;
+ default:
+ return;
+ }
+
+ if (!isFloatType(cvt->dType) || !isFloatType(insn->sType))
+ rnd = (RoundMode)(rnd & 3);
+
+ cvt->rnd = rnd;
+ cvt->setSrc(0, insn->getSrc(0));
+ cvt->src(0).mod *= insn->src(0).mod;
+ cvt->sType = insn->sType;
+}
+
+// Some shaders extract packed bytes out of words and convert them to
+// e.g. float. The Fermi+ CVT instruction can extract those directly, as can
+// nv50 for word sizes.
+//
+// CVT(EXTBF(x, byte/word))
+// CVT(AND(bytemask, x))
+// CVT(AND(bytemask, SHR(x, 8/16/24)))
+// CVT(SHR(x, 16/24))
+void
+AlgebraicOpt::handleCVT_EXTBF(Instruction *cvt)
+{
+ Instruction *insn = cvt->getSrc(0)->getInsn();
+ ImmediateValue imm;
+ Value *arg = NULL;
+ unsigned width, offset;
+ if ((cvt->sType != TYPE_U32 && cvt->sType != TYPE_S32) || !insn)
+ return;
+ if (insn->op == OP_EXTBF && insn->src(1).getImmediate(imm)) {
+ width = (imm.reg.data.u32 >> 8) & 0xff;
+ offset = imm.reg.data.u32 & 0xff;
+ arg = insn->getSrc(0);
+
+ if (width != 8 && width != 16)
+ return;
+ if (width == 8 && offset & 0x7)
+ return;
+ if (width == 16 && offset & 0xf)
+ return;
+ } else if (insn->op == OP_AND) {
+ int s;
+ if (insn->src(0).getImmediate(imm))
+ s = 0;
+ else if (insn->src(1).getImmediate(imm))
+ s = 1;
+ else
+ return;
+
+ if (imm.reg.data.u32 == 0xff)
+ width = 8;
+ else if (imm.reg.data.u32 == 0xffff)
+ width = 16;
+ else
+ return;
+
+ arg = insn->getSrc(!s);
+ Instruction *shift = arg->getInsn();
+ offset = 0;
+ if (shift && shift->op == OP_SHR &&
+ shift->sType == cvt->sType &&
+ shift->src(1).getImmediate(imm) &&
+ ((width == 8 && (imm.reg.data.u32 & 0x7) == 0) ||
+ (width == 16 && (imm.reg.data.u32 & 0xf) == 0))) {
+ arg = shift->getSrc(0);
+ offset = imm.reg.data.u32;
+ }
+ // We just AND'd the high bits away, which means this is effectively an
+ // unsigned value.
+ cvt->sType = TYPE_U32;
+ } else if (insn->op == OP_SHR &&
+ insn->sType == cvt->sType &&
+ insn->src(1).getImmediate(imm)) {
+ arg = insn->getSrc(0);
+ if (imm.reg.data.u32 == 24) {
+ width = 8;
+ offset = 24;
+ } else if (imm.reg.data.u32 == 16) {
+ width = 16;
+ offset = 16;
+ } else {
+ return;
+ }
+ }
+
+ if (!arg)
+ return;
+
+ // Irrespective of what came earlier, we can undo a shift on the argument
+ // by adjusting the offset.
+ Instruction *shift = arg->getInsn();
+ if (shift && shift->op == OP_SHL &&
+ shift->src(1).getImmediate(imm) &&
+ ((width == 8 && (imm.reg.data.u32 & 0x7) == 0) ||
+ (width == 16 && (imm.reg.data.u32 & 0xf) == 0)) &&
+ imm.reg.data.u32 <= offset) {
+ arg = shift->getSrc(0);
+ offset -= imm.reg.data.u32;
+ }
+
+ // The unpackSnorm lowering still leaves a few shifts behind, but it's too
+ // annoying to detect them.
+
+ if (width == 8) {
+ cvt->sType = cvt->sType == TYPE_U32 ? TYPE_U8 : TYPE_S8;
+ } else {
+ assert(width == 16);
+ cvt->sType = cvt->sType == TYPE_U32 ? TYPE_U16 : TYPE_S16;
+ }
+ cvt->setSrc(0, arg);
+ cvt->subOp = offset >> 3;
+}
+
// SUCLAMP dst, (ADD b imm), k, 0 -> SUCLAMP dst, b, k, imm (if imm fits s6)
void
AlgebraicOpt::handleSUCLAMP(Instruction *insn)
insn->setSrc(0, add->getSrc(s));
}
+// NEG(AND(SET, 1)) -> SET
+void
+AlgebraicOpt::handleNEG(Instruction *i) {
+ Instruction *src = i->getSrc(0)->getInsn();
+ ImmediateValue imm;
+ int b;
+
+ if (isFloatType(i->sType) || !src || src->op != OP_AND)
+ return;
+
+ if (src->src(0).getImmediate(imm))
+ b = 1;
+ else if (src->src(1).getImmediate(imm))
+ b = 0;
+ else
+ return;
+
+ if (!imm.isInteger(1))
+ return;
+
+ Instruction *set = src->getSrc(b)->getInsn();
+ if ((set->op == OP_SET || set->op == OP_SET_AND ||
+ set->op == OP_SET_OR || set->op == OP_SET_XOR) &&
+ !isFloatType(set->dType)) {
+ i->def(0).replace(set->getDef(0), false);
+ }
+}
+
bool
AlgebraicOpt::visit(BasicBlock *bb)
{
handleLOGOP(i);
break;
case OP_CVT:
- handleCVT(i);
+ handleCVT_NEG(i);
+ handleCVT_CVT(i);
+ if (prog->getTarget()->isOpSupported(OP_EXTBF, TYPE_U32))
+ handleCVT_EXTBF(i);
break;
case OP_SUCLAMP:
handleSUCLAMP(i);
break;
+ case OP_NEG:
+ handleNEG(i);
+ break;
default:
break;
}
if (((size == 0x8) && (MIN2(offLd, offRc) & 0x7)) ||
((size == 0xc) && (MIN2(offLd, offRc) & 0xf)))
return false;
+ // for compute indirect loads are not guaranteed to be aligned
+ if (prog->getType() == Program::TYPE_COMPUTE && rec->rel[0])
+ return false;
assert(sizeRc + sizeLd <= 16 && offRc != offLd);
if (!prog->getTarget()->
isAccessSupported(st->getSrc(0)->reg.file, typeOfSize(size)))
return false;
+ // no unaligned stores
if (size == 8 && MIN2(offRc, offSt) & 0x7)
return false;
+ // for compute indirect stores are not guaranteed to be aligned
+ if (prog->getType() == Program::TYPE_COMPUTE && rec->rel[0])
+ return false;
st->takeExtraSources(0, extra); // save predicate and indirect address
// get non-replaced sources after values covered by st
for (; offR < endR; offR += ri->getSrc(s)->reg.size, ++s)
vals[k++] = ri->getSrc(s);
- assert((unsigned int)k <= Elements(vals));
+ assert((unsigned int)k <= ARRAY_SIZE(vals));
for (s = 0; s < k; ++s)
st->setSrc(s + 1, vals[s]);
st->setSrc(0, ri->getSrc(0));
}
} else
if (ldst->op == OP_STORE || ldst->op == OP_EXPORT) {
+ if (typeSizeof(ldst->dType) == 4 &&
+ ldst->src(1).getFile() == FILE_GPR &&
+ ldst->getSrc(1)->getInsn()->op == OP_NOP) {
+ delete_Instruction(prog, ldst);
+ continue;
+ }
isLoad = false;
} else {
// TODO: maybe have all fixed ops act as barrier ?
}
if (ldst->getPredicate()) // TODO: handle predicated ld/st
continue;
+ if (ldst->perPatch) // TODO: create separate per-patch lists
+ continue;
if (isLoad) {
DataFile file = ldst->src(0).getFile();
class FlatteningPass : public Pass
{
private:
+ virtual bool visit(Function *);
virtual bool visit(BasicBlock *);
bool tryPredicateConditional(BasicBlock *);
inline bool isConstantCondition(Value *pred);
inline bool mayPredicate(const Instruction *, const Value *pred) const;
inline void removeFlow(Instruction *);
+
+ uint8_t gpr_unit;
};
bool
file = ld->src(0).getFile();
} else {
file = insn->src(s).getFile();
- // catch $r63 on NVC0
- if (file == FILE_GPR && insn->getSrc(s)->reg.data.id > prog->maxGPR)
- file = FILE_IMMEDIATE;
+ // catch $r63 on NVC0 and $r63/$r127 on NV50. Unfortunately maxGPR is
+ // in register "units", which can vary between targets.
+ if (file == FILE_GPR) {
+ Value *v = insn->getSrc(s);
+ int bytes = v->reg.data.id * MIN2(v->reg.size, 4);
+ int units = bytes >> gpr_unit;
+ if (units > prog->maxGPR)
+ file = FILE_IMMEDIATE;
+ }
}
if (file != FILE_IMMEDIATE && file != FILE_MEMORY_CONST)
return false;
}
}
+bool
+FlatteningPass::visit(Function *fn)
+{
+ gpr_unit = prog->getTarget()->getFileUnit(FILE_GPR);
+
+ return true;
+}
+
bool
FlatteningPass::visit(BasicBlock *bb)
{
!isSurfaceOp(insn->op) && // not confirmed
insn->op != OP_LINTERP && // probably just nve4
insn->op != OP_PINTERP && // probably just nve4
- ((insn->op != OP_LOAD && insn->op != OP_STORE) ||
- typeSizeof(insn->dType) <= 4) &&
+ ((insn->op != OP_LOAD && insn->op != OP_STORE && insn->op != OP_ATOM) ||
+ (typeSizeof(insn->dType) <= 4 && !insn->src(0).isIndirect(0))) &&
!insn->isNop()) {
insn->join = 1;
bb->remove(bb->getExit());
virtual bool visit(BasicBlock *);
};
+static bool
+post_ra_dead(Instruction *i)
+{
+ for (int d = 0; i->defExists(d); ++d)
+ if (i->getDef(d)->refCount())
+ return false;
+ return true;
+}
+
bool
NV50PostRaConstantFolding::visit(BasicBlock *bb)
{
i->src(0).getFile() != FILE_GPR ||
i->src(1).getFile() != FILE_GPR ||
i->src(2).getFile() != FILE_GPR ||
- i->getDef(0)->reg.data.id != i->getSrc(2)->reg.data.id ||
- !isFloatType(i->dType))
+ i->getDef(0)->reg.data.id != i->getSrc(2)->reg.data.id)
+ break;
+
+ if (i->getDef(0)->reg.data.id >= 64 ||
+ i->getSrc(0)->reg.data.id >= 64)
+ break;
+
+ if (i->flagsSrc >= 0 && i->getSrc(i->flagsSrc)->reg.data.id != 0)
+ break;
+
+ if (i->getPredicate())
break;
def = i->getSrc(1)->getInsn();
- if (def->op == OP_MOV && def->src(0).getFile() == FILE_IMMEDIATE) {
+ if (def && def->op == OP_SPLIT && typeSizeof(def->sType) == 4)
+ def = def->getSrc(0)->getInsn();
+ if (def && def->op == OP_MOV && def->src(0).getFile() == FILE_IMMEDIATE) {
vtmp = i->getSrc(1);
- i->setSrc(1, def->getSrc(0));
+ if (isFloatType(i->sType)) {
+ i->setSrc(1, def->getSrc(0));
+ } else {
+ ImmediateValue val;
+ bool ret = def->src(0).getImmediate(val);
+ assert(ret);
+ if (i->getSrc(1)->reg.data.id & 1)
+ val.reg.data.u32 >>= 16;
+ val.reg.data.u32 &= 0xffff;
+ i->setSrc(1, new_ImmediateValue(bb->getProgram(), val.reg.data.u32));
+ }
/* There's no post-RA dead code elimination, so do it here
* XXX: if we add more code-removing post-RA passes, we might
* want to create a post-RA dead-code elim pass */
- if (vtmp->refCount() == 0)
- delete_Instruction(bb->getProgram(), def);
+ if (post_ra_dead(vtmp->getInsn())) {
+ Value *src = vtmp->getInsn()->getSrc(0);
+ // Careful -- splits will have already been removed from the
+ // functions. Don't double-delete.
+ if (vtmp->getInsn()->bb)
+ delete_Instruction(prog, vtmp->getInsn());
+ if (src->getInsn() && post_ra_dead(src->getInsn()))
+ delete_Instruction(prog, src->getInsn());
+ }
break;
}
if (that->srcExists(s))
return false;
- if (op == OP_LOAD || op == OP_VFETCH) {
+ if (op == OP_LOAD || op == OP_VFETCH || op == OP_ATOM) {
switch (src(0).getFile()) {
case FILE_MEMORY_CONST:
case FILE_SHADER_INPUT:
return true;
+ case FILE_SHADER_OUTPUT:
+ return bb->getProgram()->getType() == Program::TYPE_TESSELLATION_EVAL;
default:
return false;
}
ik = phi->getSrc(0)->getInsn();
if (!ik)
continue; // probably a function input
+ if (ik->defCount(0xff) > 1)
+ continue; // too painful to check if we can really push this forward
for (s = 1; phi->srcExists(s); ++s) {
if (phi->getSrc(s)->refCount() > 1)
break;
bool
DeadCodeElim::visit(BasicBlock *bb)
{
- Instruction *next;
+ Instruction *prev;
- for (Instruction *i = bb->getFirst(); i; i = next) {
- next = i->next;
+ for (Instruction *i = bb->getExit(); i; i = prev) {
+ prev = i->prev;
if (i->isDead()) {
++deadCount;
delete_Instruction(prog, i);
} else
- if (i->defExists(1) && (i->op == OP_VFETCH || i->op == OP_LOAD)) {
+ if (i->defExists(1) &&
+ i->subOp == 0 &&
+ (i->op == OP_VFETCH || i->op == OP_LOAD)) {
checkSplitLoad(i);
} else
if (i->defExists(0) && !i->getDef(0)->refCount()) {
if (i->op == OP_ATOM ||
i->op == OP_SUREDP ||
- i->op == OP_SUREDB)
+ i->op == OP_SUREDB) {
i->setDef(0, NULL);
+ } else if (i->op == OP_LOAD && i->subOp == NV50_IR_SUBOP_LOAD_LOCKED) {
+ i->setDef(0, i->getDef(1));
+ i->setDef(1, NULL);
+ }
}
}
return true;
}
+// Each load can go into up to 4 destinations, any of which might potentially
+// be dead (i.e. a hole). These can always be split into 2 loads, independent
+// of where the holes are. We find the first contiguous region, put it into
+// the first load, and then put the second contiguous region into the second
+// load. There can be at most 2 contiguous regions.
+//
+// Note that there are some restrictions, for example it's not possible to do
+// a 64-bit load that's not 64-bit aligned, so such a load has to be split
+// up. Also hardware doesn't support 96-bit loads, so those also have to be
+// split into a 64-bit and 32-bit load.
void
DeadCodeElim::checkSplitLoad(Instruction *ld1)
{
addr1 = ld1->getSrc(0)->reg.data.offset;
n1 = n2 = 0;
size1 = size2 = 0;
+
+ // Compute address/width for first load
for (d = 0; ld1->defExists(d); ++d) {
if (mask & (1 << d)) {
if (size1 && (addr1 & 0x7))
break;
}
}
+
+ // Scale back the size of the first load until it can be loaded. This
+ // typically happens for TYPE_B96 loads.
+ while (n1 &&
+ !prog->getTarget()->isAccessSupported(ld1->getSrc(0)->reg.file,
+ typeOfSize(size1))) {
+ size1 -= def1[--n1]->reg.size;
+ d--;
+ }
+
+ // Compute address/width for second load
for (addr2 = addr1 + size1; ld1->defExists(d); ++d) {
if (mask & (1 << d)) {
+ assert(!size2 || !(addr2 & 0x7));
def2[n2] = ld1->getDef(d);
size2 += def2[n2++]->reg.size;
- } else {
+ } else if (!n2) {
assert(!n2);
addr2 += ld1->getDef(d)->reg.size;
+ } else {
+ break;
}
}
+ // Make sure that we've processed all the values
+ for (; ld1->defExists(d); ++d)
+ assert(!(mask & (1 << d)));
+
updateLdStOffset(ld1, addr1, func);
ld1->setType(typeOfSize(size1));
for (d = 0; d < 4; ++d)
RUN_PASS(2, ModifierFolding, run); // before load propagation -> less checks
RUN_PASS(1, ConstantFolding, foldAll);
RUN_PASS(1, LoadPropagation, run);
+ RUN_PASS(1, IndirectPropagation, run);
RUN_PASS(2, MemoryOpt, run);
RUN_PASS(2, LocalCSE, run);
RUN_PASS(0, DeadCodeElim, buryAll);