void emitCCTL(const Instruction *);
void emitINTERP(const Instruction *);
+ void emitAFETCH(const Instruction *);
void emitPFETCH(const Instruction *);
void emitVFETCH(const Instruction *);
void emitEXPORT(const Instruction *);
void emitUADD(const Instruction *);
void emitFADD(const Instruction *);
+ void emitDADD(const Instruction *);
void emitUMUL(const Instruction *);
void emitFMUL(const Instruction *);
+ void emitDMUL(const Instruction *);
void emitIMAD(const Instruction *);
void emitISAD(const Instruction *);
void emitFMAD(const Instruction *);
+ void emitDMAD(const Instruction *);
void emitMADSP(const Instruction *);
void emitNOT(Instruction *);
void emitPIXLD(const Instruction *);
+ void emitVOTE(const Instruction *);
+
inline void defId(const ValueDef&, const int pos);
inline void defId(const Instruction *, int d, const int pos);
inline void srcId(const ValueRef&, const int pos);
assert(imm);
u32 = imm->reg.data.u32;
+ if ((code[0] & 0xf) == 0x1) {
+ // double immediate
+ uint64_t u64 = imm->reg.data.u64;
+ assert(!(u64 & 0x00000fffffffffffULL));
+ assert(!(code[1] & 0xc000));
+ code[0] |= ((u64 >> 44) & 0x3f) << 26;
+ code[1] |= 0xc000 | (u64 >> 50);
+ } else
if ((code[0] & 0xf) == 0x2) {
// LIMM
code[0] |= (u32 & 0x3f) << 26;
srcId(i->src(s), s ? ((s == 2) ? 49 : s1) : 20);
break;
default:
+ if (i->op == OP_SELP) {
+ // OP_SELP is used to implement shared+atomics on Fermi.
+ assert(s == 2 && i->src(s).getFile() == FILE_PREDICATE);
+ srcId(i->src(s), 49);
+ }
// ignore here, can be predicate or flags, but must not be address
break;
}
}
}
+void
+CodeEmitterNVC0::emitDMAD(const Instruction *i)
+{
+ bool neg1 = (i->src(0).mod ^ i->src(1).mod).neg();
+
+ emitForm_A(i, HEX64(20000000, 00000001));
+
+ if (i->src(2).mod.neg())
+ code[0] |= 1 << 8;
+
+ roundMode_A(i);
+
+ if (neg1)
+ code[0] |= 1 << 9;
+
+ assert(!i->saturate);
+ assert(!i->ftz);
+}
+
void
CodeEmitterNVC0::emitFMUL(const Instruction *i)
{
}
}
+void
+CodeEmitterNVC0::emitDMUL(const Instruction *i)
+{
+ bool neg = (i->src(0).mod ^ i->src(1).mod).neg();
+
+ emitForm_A(i, HEX64(50000000, 00000001));
+ roundMode_A(i);
+
+ if (neg)
+ code[0] |= 1 << 9;
+
+ assert(!i->saturate);
+ assert(!i->ftz);
+ assert(!i->dnz);
+ assert(!i->postFactor);
+}
+
void
CodeEmitterNVC0::emitUMUL(const Instruction *i)
{
}
}
+void
+CodeEmitterNVC0::emitDADD(const Instruction *i)
+{
+ assert(i->encSize == 8);
+ emitForm_A(i, HEX64(48000000, 00000001));
+ roundMode_A(i);
+ assert(!i->saturate);
+ assert(!i->ftz);
+ emitNegAbs12(i);
+ if (i->op == OP_SUB)
+ code[0] ^= 1 << 8;
+}
+
void
CodeEmitterNVC0::emitUADD(const Instruction *i)
{
// (a OP b) OP c
if (i->predSrc != 2 && i->srcExists(2)) {
code[1] |= subOp << 21;
- srcId(i->src(2), 17);
- if (i->src(2).mod == Modifier(NV50_IR_MOD_NOT)) code[0] |= 1 << 20;
+ srcId(i->src(2), 49);
+ if (i->src(2).mod == Modifier(NV50_IR_MOD_NOT)) code[1] |= 1 << 20;
} else {
code[1] |= 0x000e0000;
}
else
if (!isFloatType(i->dType))
op |= isSignedType(i->dType) ? 0x23 : 0x03;
+ if (i->dType == TYPE_F64)
+ op |= 0x01;
emitForm_A(i, op);
emitNegAbs12(i);
code[0] |= util_logbase2(typeSizeof(dType)) << 20;
code[0] |= util_logbase2(typeSizeof(i->sType)) << 23;
+ // for 8/16 source types, the byte/word is in subOp. word 1 is
+ // represented as 2.
+ if (!isFloatType(i->sType))
+ code[1] |= i->subOp << 0x17;
+ else
+ code[1] |= i->subOp << 0x18;
+
if (sat)
code[0] |= 0x20;
if (abs)
if (!isFloatType(i->sType))
lo = 0x3;
- if (isFloatType(i->dType) || isSignedIntType(i->sType))
+ if (isSignedIntType(i->sType))
lo |= 0x20;
+ if (isFloatType(i->dType)) {
+ if (isFloatType(i->sType))
+ lo |= 0x20;
+ else
+ lo |= 0x80;
+ }
switch (i->op) {
case OP_SET_AND: hi = 0x10000000; break;
{
emitForm_A(i, HEX64(20000000, 00000004));
- if (i->cc == CC_NOT_P || i->src(2).mod & Modifier(NV50_IR_MOD_NOT))
+ if (i->src(2).mod & Modifier(NV50_IR_MOD_NOT))
code[1] |= 1 << 20;
}
defId(i->def(0), 14);
srcId(i->src(0), 20);
- srcId(i->srcExists(1) ? i->src(1) : i->src(0), 26);
+ srcId((i->srcExists(1) && i->predSrc != 1) ? i->src(1) : i->src(0), 26);
if (i->op == OP_QUADOP && progType != Program::TYPE_FRAGMENT)
code[0] |= 1 << 9; // dall
} else
if (mask & 2) {
int32_t pcRel = f->target.bb->binPos - (codeSize + 8);
+ if (writeIssueDelays && !(f->target.bb->binPos & 0x3f))
+ pcRel += 8;
// currently we don't want absolute branches
assert(!f->absolute);
code[0] |= (pcRel & 0x3f) << 26;
ImmediateValue *imm = i->getSrc(0)->asImm();
assert(imm);
code[0] |= imm->reg.data.u32 << 20;
+ code[1] |= 0x8000;
}
// thread count
} else {
ImmediateValue *imm = i->getSrc(1)->asImm();
assert(imm);
+ assert(imm->reg.data.u32 <= 0xfff);
code[0] |= imm->reg.data.u32 << 26;
code[1] |= imm->reg.data.u32 >> 6;
+ code[1] |= 0x4000;
}
if (i->srcExists(2) && (i->predSrc != 2)) {
}
}
+void
+CodeEmitterNVC0::emitAFETCH(const Instruction *i)
+{
+ code[0] = 0x00000006;
+ code[1] = 0x0c000000 | (i->src(0).get()->reg.data.offset & 0x7ff);
+
+ if (i->getSrc(0)->reg.file == FILE_SHADER_OUTPUT)
+ code[0] |= 0x200;
+
+ emitPredicate(i);
+
+ defId(i->def(0), 14);
+ srcId(i->src(0).getIndirect(0), 20);
+}
+
void
CodeEmitterNVC0::emitPFETCH(const Instruction *i)
{
emitPredicate(i);
+ const int src1 = (i->predSrc == 1) ? 2 : 1; // if predSrc == 1, !srcExists(2)
+
defId(i->def(0), 14);
- srcId(i->src(1), 20);
+ srcId(i, src1, 20);
}
void
}
}
+static void
+interpApply(const InterpEntry *entry, uint32_t *code,
+ bool force_persample_interp, bool flatshade)
+{
+ int ipa = entry->ipa;
+ int reg = entry->reg;
+ int loc = entry->loc;
+
+ if (flatshade &&
+ (ipa & NV50_IR_INTERP_MODE_MASK) == NV50_IR_INTERP_SC) {
+ ipa = NV50_IR_INTERP_FLAT;
+ reg = 0x3f;
+ } else if (force_persample_interp &&
+ (ipa & NV50_IR_INTERP_SAMPLE_MASK) == NV50_IR_INTERP_DEFAULT &&
+ (ipa & NV50_IR_INTERP_MODE_MASK) != NV50_IR_INTERP_FLAT) {
+ ipa |= NV50_IR_INTERP_CENTROID;
+ }
+ code[loc + 0] &= ~(0xf << 6);
+ code[loc + 0] |= ipa << 6;
+ code[loc + 0] &= ~(0x3f << 26);
+ code[loc + 0] |= reg << 26;
+}
+
void
CodeEmitterNVC0::emitINTERP(const Instruction *i)
{
if (i->saturate)
code[0] |= 1 << 5;
- if (i->op == OP_PINTERP)
+ if (i->op == OP_PINTERP) {
srcId(i->src(1), 26);
- else
+ addInterp(i->ipa, SDATA(i->src(1)).id, interpApply);
+ } else {
code[0] |= 0x3f << 26;
+ addInterp(i->ipa, 0x3f, interpApply);
+ }
srcId(i->src(0).getIndirect(0), 20);
} else {
switch (i->src(0).getFile()) {
case FILE_MEMORY_GLOBAL: opc = 0x90000000; break;
case FILE_MEMORY_LOCAL: opc = 0xc8000000; break;
- case FILE_MEMORY_SHARED: opc = 0xc9000000; break;
+ case FILE_MEMORY_SHARED:
+ if (i->subOp == NV50_IR_SUBOP_STORE_UNLOCKED) {
+ if (targ->getChipset() >= NVISA_GK104_CHIPSET)
+ opc = 0xb8000000;
+ else
+ opc = 0xcc000000;
+ } else {
+ opc = 0xc9000000;
+ }
+ break;
default:
assert(!"invalid memory file");
opc = 0;
code[0] = 0x00000005;
code[1] = opc;
+ if (targ->getChipset() >= NVISA_GK104_CHIPSET) {
+ // Unlocked store on shared memory can fail.
+ if (i->src(0).getFile() == FILE_MEMORY_SHARED &&
+ i->subOp == NV50_IR_SUBOP_STORE_UNLOCKED) {
+ assert(i->defExists(0));
+ defId(i->def(0), 8);
+ }
+ }
+
setAddressByFile(i->src(0));
srcId(i->src(1), 14);
srcId(i->src(0).getIndirect(0), 20);
switch (i->src(0).getFile()) {
case FILE_MEMORY_GLOBAL: opc = 0x80000000; break;
case FILE_MEMORY_LOCAL: opc = 0xc0000000; break;
- case FILE_MEMORY_SHARED: opc = 0xc1000000; break;
+ case FILE_MEMORY_SHARED:
+ if (i->subOp == NV50_IR_SUBOP_LOAD_LOCKED) {
+ if (targ->getChipset() >= NVISA_GK104_CHIPSET)
+ opc = 0xa8000000;
+ else
+ opc = 0xc4000000;
+ } else {
+ opc = 0xc1000000;
+ }
+ break;
case FILE_MEMORY_CONST:
if (!i->src(0).isIndirect(0) && typeSizeof(i->dType) == 4) {
emitMOV(i); // not sure if this is any better
}
code[1] = opc;
+ if (i->src(0).getFile() == FILE_MEMORY_SHARED) {
+ if (i->subOp == NV50_IR_SUBOP_LOAD_LOCKED) {
+ assert(i->defExists(1));
+ if (targ->getChipset() >= NVISA_GK104_CHIPSET)
+ defId(i->def(1), 8);
+ else
+ defId(i->def(1), 32 + 18);
+ }
+ }
+
defId(i->def(0), 14);
setAddressByFile(i->src(0));
case SV_VERTEX_COUNT: return 0x10;
case SV_INVOCATION_ID: return 0x11;
case SV_YDIR: return 0x12;
+ case SV_THREAD_KILL: return 0x13;
case SV_TID: return 0x21 + SDATA(ref).sv.index;
case SV_CTAID: return 0x25 + SDATA(ref).sv.index;
case SV_NTID: return 0x29 + SDATA(ref).sv.index;
code[0] |= 63 << 20;
}
- if (i->subOp == NV50_IR_SUBOP_ATOM_CAS)
- srcId(i->src(2), 32 + 17);
+ if (i->subOp == NV50_IR_SUBOP_ATOM_CAS) {
+ assert(i->src(1).getSize() == 2 * typeSizeof(i->sType));
+ code[1] |= (SDATA(i->src(1)).id + 1) << 17;
+ }
}
void
code[1] |= 0x00e00000;
}
+void
+CodeEmitterNVC0::emitVOTE(const Instruction *i)
+{
+ assert(i->src(0).getFile() == FILE_PREDICATE &&
+ i->def(1).getFile() == FILE_PREDICATE);
+
+ code[0] = 0x00000004 | (i->subOp << 5);
+ code[1] = 0x48000000;
+
+ emitPredicate(i);
+
+ defId(i->def(0), 14);
+ defId(i->def(1), 32 + 22);
+ if (i->src(0).mod == Modifier(NV50_IR_MOD_NOT))
+ code[0] |= 1 << 23;
+ srcId(i->src(0), 20);
+}
+
bool
CodeEmitterNVC0::emitInstruction(Instruction *insn)
{
case OP_PFETCH:
emitPFETCH(insn);
break;
+ case OP_AFETCH:
+ emitAFETCH(insn);
+ break;
case OP_EMIT:
case OP_RESTART:
emitOUT(insn);
break;
case OP_ADD:
case OP_SUB:
- if (isFloatType(insn->dType))
+ if (insn->dType == TYPE_F64)
+ emitDADD(insn);
+ else if (isFloatType(insn->dType))
emitFADD(insn);
else
emitUADD(insn);
break;
case OP_MUL:
- if (isFloatType(insn->dType))
+ if (insn->dType == TYPE_F64)
+ emitDMUL(insn);
+ else if (isFloatType(insn->dType))
emitFMUL(insn);
else
emitUMUL(insn);
break;
case OP_MAD:
case OP_FMA:
- if (isFloatType(insn->dType))
+ if (insn->dType == TYPE_F64)
+ emitDMAD(insn);
+ else if (isFloatType(insn->dType))
emitFMAD(insn);
else
emitIMAD(insn);
case OP_CEIL:
case OP_FLOOR:
case OP_TRUNC:
- case OP_CVT:
case OP_SAT:
emitCVT(insn);
break;
+ case OP_CVT:
+ if (insn->def(0).getFile() == FILE_PREDICATE ||
+ insn->src(0).getFile() == FILE_PREDICATE)
+ emitMOV(insn);
+ else
+ emitCVT(insn);
+ break;
case OP_RSQ:
- emitSFnOp(insn, 5);
+ emitSFnOp(insn, 5 + 2 * insn->subOp);
break;
case OP_RCP:
- emitSFnOp(insn, 4);
+ emitSFnOp(insn, 4 + 2 * insn->subOp);
break;
case OP_LG2:
emitSFnOp(insn, 3);
case OP_PIXLD:
emitPIXLD(insn);
break;
+ case OP_VOTE:
+ emitVOTE(insn);
+ break;
case OP_PHI:
case OP_UNION:
case OP_CONSTRAINT:
ERROR("operation should have been lowered\n");
return false;
default:
- ERROR("unknow op\n");
+ ERROR("unknown op: %u\n", insn->op);
return false;
}
int imul; // integer MUL to MUL delay 3
} res;
struct ScoreData {
- int r[64];
+ int r[256];
int p[8];
int c;
} rd, wr;
int base;
+ int regs;
void rebase(const int base)
{
return;
this->base = 0;
- for (int i = 0; i < 64; ++i) {
+ for (int i = 0; i < regs; ++i) {
rd.r[i] += delta;
wr.r[i] += delta;
}
res.imul += delta;
res.tex += delta;
}
- void wipe()
+ void wipe(int regs)
{
memset(&rd, 0, sizeof(rd));
memset(&wr, 0, sizeof(wr));
memset(&res, 0, sizeof(res));
+ this->regs = regs;
}
int getLatest(const ScoreData& d) const
{
int max = 0;
- for (int i = 0; i < 64; ++i)
+ for (int i = 0; i < regs; ++i)
if (d.r[i] > max)
max = d.r[i];
for (int i = 0; i < 8; ++i)
}
void setMax(const RegScores *that)
{
- for (int i = 0; i < 64; ++i) {
+ for (int i = 0; i < regs; ++i) {
rd.r[i] = MAX2(rd.r[i], that->rd.r[i]);
wr.r[i] = MAX2(wr.r[i], that->wr.r[i]);
}
}
void print(int cycle)
{
- for (int i = 0; i < 64; ++i) {
+ for (int i = 0; i < regs; ++i) {
if (rd.r[i] > cycle)
INFO("rd $r%i @ %i\n", i, rd.r[i]);
if (wr.r[i] > cycle)
RegScores *score; // for current BB
std::vector<RegScores> scoreBoards;
- int cycle;
int prevData;
operation prevOp;
bool
SchedDataCalculator::visit(Function *func)
{
+ int regs = targ->getFileSize(FILE_GPR) + 1;
scoreBoards.resize(func->cfg.getSize());
for (size_t i = 0; i < scoreBoards.size(); ++i)
- scoreBoards[i].wipe();
+ scoreBoards[i].wipe(regs);
return true;
}