+// OR the stall count `cnt` (which must fit in four bits) into the lowest
+// four bits of the instruction's scheduling word.
+inline void
+SchedDataCalculatorGM107::emitStall(Instruction *insn, uint8_t cnt)
+{
+   assert(cnt <= 0xf);
+   insn->sched = insn->sched | cnt;
+}
+
+// Raise bit 4 of the scheduling word, the bit printSchedInfo() reports as
+// the yield flag ("yl").
+inline void
+SchedDataCalculatorGM107::emitYield(Instruction *insn)
+{
+   insn->sched |= 0x10;
+}
+
+// Program the write dependency barrier slot (bits 5..7 of the scheduling
+// word) with the barrier index `id`, which must be in the range 0..5.
+//
+// The slot is cleared before the new index is stored, so the result is
+// always exactly `id`: this covers the all-ones pattern (7, which the rest of
+// this calculator treats as "no barrier"), an empty slot, and a slot that
+// already holds a previously emitted index (which a plain OR would corrupt).
+inline void
+SchedDataCalculatorGM107::emitWrDepBar(Instruction *insn, uint8_t id)
+{
+   assert(id < 6);
+   insn->sched &= ~0xe0;
+   insn->sched |= id << 5;
+}
+
+// Program the read dependency barrier slot (bits 8..10 of the scheduling
+// word) with the barrier index `id`, which must be in the range 0..5.
+//
+// The slot is cleared before the new index is stored, so the result is
+// always exactly `id`: this covers the all-ones pattern (7, which the rest of
+// this calculator treats as "no barrier"), an empty slot, and a slot that
+// already holds a previously emitted index (which a plain OR would corrupt).
+inline void
+SchedDataCalculatorGM107::emitRdDepBar(Instruction *insn, uint8_t id)
+{
+   assert(id < 6);
+   insn->sched &= ~0x700;
+   insn->sched |= id << 8;
+}
+
+// Add barrier `id` (0..5) to the wait mask of the instruction. The mask
+// starts at bit 11 of the scheduling word and has one bit per barrier.
+inline void
+SchedDataCalculatorGM107::emitWtDepBar(Instruction *insn, uint8_t id)
+{
+   assert(id < 6);
+   const int bit = 11 + id;
+   insn->sched |= 1 << bit;
+}
+
+// Raise the reuse flag of operand slot `id` (0..3). The reuse flags start at
+// bit 17 of the scheduling word, one bit per operand slot.
+inline void
+SchedDataCalculatorGM107::emitReuse(Instruction *insn, uint8_t id)
+{
+   assert(id < 4);
+   const int bit = 17 + id;
+   insn->sched |= 1 << bit;
+}
+
+// Debug helper: split the scheduling word of `insn` into its fields and log
+// them, together with `cycle`, through INFO().
+//
+// Field layout, from the least significant bit:
+//   st: bits  0..3   stall count
+//   yl: bit   4      yield flag
+//   wr: bits  5..7   write dependency barrier
+//   rd: bits  8..10  read dependency barrier
+//   wt: bits 11..16  wait barrier mask
+//   ru: bits 17..20  reuse flags
+inline void
+SchedDataCalculatorGM107::printSchedInfo(int cycle,
+                                         const Instruction *insn) const
+{
+   const uint8_t stall = (insn->sched >> 0) & 0xf;
+   const uint8_t yield = (insn->sched >> 4) & 0x1;
+   const uint8_t wrBar = (insn->sched >> 5) & 0x7;
+   const uint8_t rdBar = (insn->sched >> 8) & 0x7;
+   const uint8_t waitMask = (insn->sched >> 11) & 0x3f;
+   const uint8_t reuse = (insn->sched >> 17) & 0xf;
+
+   INFO("cycle %i, (st 0x%x, yl 0x%x, wr 0x%x, rd 0x%x, wt 0x%x, ru 0x%x)\n",
+        cycle, stall, yield, wrBar, rdBar, waitMask, reuse);
+}
+
+// Return the stall count held in the low four bits of the scheduling word.
+inline int
+SchedDataCalculatorGM107::getStall(const Instruction *insn) const
+{
+   const int stall = insn->sched & 0xf;
+   return stall;
+}
+
+// Return the write dependency barrier field (bits 5..7 of the scheduling
+// word) as a value in the range 0..7.
+inline int
+SchedDataCalculatorGM107::getWrDepBar(const Instruction *insn) const
+{
+   return (insn->sched >> 5) & 0x7;
+}
+
+// Return the read dependency barrier field (bits 8..10 of the scheduling
+// word) as a value in the range 0..7.
+inline int
+SchedDataCalculatorGM107::getRdDepBar(const Instruction *insn) const
+{
+   return (insn->sched >> 8) & 0x7;
+}
+
+// Return the wait barrier mask (bits 11..16 of the scheduling word); bit N of
+// the result corresponds to barrier N.
+inline int
+SchedDataCalculatorGM107::getWtDepBar(const Instruction *insn) const
+{
+   return (insn->sched >> 11) & 0x3f;
+}
+
+// Emit the reuse flags, which make use of the operand reuse cache, a level of
+// the memory hierarchy introduced with Maxwell.
+//
+// The cache reduces bank conflicts by caching operands. Each time an
+// instruction is issued, its reuse flags tell the hw which operands are going
+// to be re-used by the next instruction. Note that the next instruction has to
+// use the same GPR id in the same operand slot.
+//
+// A source slot is flagged only when it is a 32-bit GPR other than id 255, is
+// not overwritten by the instruction itself, and the next instruction reads
+// the same GPR id in the same slot. Nothing is emitted when the target does
+// not support reuse for this instruction or when there is no next instruction.
+void
+SchedDataCalculatorGM107::setReuseFlag(Instruction *insn)
+{
+   Instruction *next = insn->next;
+   BitSet defs(255, 1);
+
+   if (!targ->isReuseSupported(insn))
+      return;
+
+   // Without a following instruction there is no consumer for a reused
+   // operand (and nothing to compare the source slots against).
+   if (!next)
+      return;
+
+   // Collect the 32-bit GPRs written by this instruction.
+   for (int d = 0; insn->defExists(d); ++d) {
+      const Value *def = insn->def(d).rep();
+      if (insn->def(d).getFile() != FILE_GPR)
+         continue;
+      if (typeSizeof(insn->dType) != 4 || def->reg.data.id == 255)
+         continue;
+      defs.set(def->reg.data.id);
+   }
+
+   for (int s = 0; insn->srcExists(s); s++) {
+      const Value *src = insn->src(s).rep();
+      if (insn->src(s).getFile() != FILE_GPR)
+         continue;
+      if (typeSizeof(insn->sType) != 4 || src->reg.data.id == 255)
+         continue;
+      // A register overwritten by this instruction is never flagged.
+      if (defs.test(src->reg.data.id))
+         continue;
+      if (!next->srcExists(s) || next->src(s).getFile() != FILE_GPR)
+         continue;
+      if (src->reg.data.id != next->getSrc(s)->reg.data.id)
+         continue;
+      assert(s < 4);
+      emitReuse(insn, s);
+   }
+}
+
+// Record in the scoreboard when the value `v`, written at `cycle`, becomes
+// readable again. GPRs (one entry per 4 bytes of the value) and the flags
+// register are marked ready at `ready`; predicates use a fixed distance from
+// `cycle` instead. Other register files are ignored.
+void
+SchedDataCalculatorGM107::recordWr(const Value *v, int cycle, int ready)
+{
+   const int first = v->reg.data.id;
+
+   if (v->reg.file == FILE_GPR) {
+      const int end = first + v->reg.size / 4;
+      for (int reg = first; reg < end; ++reg)
+         score->rd.r[reg] = ready;
+   } else if (v->reg.file == FILE_PREDICATE) {
+      // To immediately use a predicate set by any instruction, the minimum
+      // number of stall counts is 13.
+      score->rd.p[first] = cycle + 13;
+   } else if (v->reg.file == FILE_FLAGS) {
+      score->rd.c = ready;
+   }
+}
+
+// Look up in the scoreboard when the value `v` becomes readable and, if that
+// is later than `cycle`, raise `delay` to at least the number of cycles still
+// missing. `delay` is never lowered. Values outside of the GPR, predicate and
+// flags files do not contribute.
+void
+SchedDataCalculatorGM107::checkRd(const Value *v, int cycle, int &delay) const
+{
+   const int first = v->reg.data.id;
+   int ready = cycle;
+
+   if (v->reg.file == FILE_GPR) {
+      // A value spans one GPR per 4 bytes; wait for the latest of them.
+      const int end = first + v->reg.size / 4;
+      for (int reg = first; reg < end; ++reg)
+         ready = MAX2(ready, score->rd.r[reg]);
+   } else if (v->reg.file == FILE_PREDICATE) {
+      ready = MAX2(ready, score->rd.p[first]);
+   } else if (v->reg.file == FILE_FLAGS) {
+      ready = MAX2(ready, score->rd.c);
+   }
+
+   if (ready > cycle)
+      delay = MAX2(delay, ready - cycle);
+}
+
+// Register every definition of `insn`, issued at `cycle`, in the scoreboard.
+// The ready cycle handed to recordWr() is `cycle` plus the latency the target
+// reports for the instruction.
+void
+SchedDataCalculatorGM107::commitInsn(const Instruction *insn, int cycle)
+{
+   const int readyAt = cycle + targ->getLatency(insn);
+
+   int d = 0;
+   while (insn->defExists(d)) {
+      recordWr(insn->getDef(d), cycle, readyAt);
+      ++d;
+   }
+
+#ifdef GM107_DEBUG_SCHED_DATA
+   score->print(cycle);
+#endif
+}
+
+#define GM107_MIN_ISSUE_DELAY 0x1 // lower clamp bound for the stall count in setDelay()
+#define GM107_MAX_ISSUE_DELAY 0xf // upper clamp bound in setDelay(); largest value emitStall() accepts
+
+// Return the number of cycles `insn` has to wait, counted from `cycle`, until
+// all of its sources are marked ready in the scoreboard. The result is 0 when
+// every source is already readable at `cycle`.
+int
+SchedDataCalculatorGM107::calcDelay(const Instruction *insn, int cycle) const
+{
+   int delay = 0;
+
+   // checkRd() only ever raises `delay`, so it ends up as the maximum wait
+   // over all sources.
+   for (int s = 0; insn->srcExists(s); ++s)
+      checkRd(insn->getSrc(s), cycle, delay);
+
+   // TODO: make use of getReadLatency()!
+
+   return delay;
+}
+
+// Choose the final stall count of `insn` and store it with emitStall().
+//
+// insn:  instruction whose stall count is set
+// delay: stall count computed from the scoreboard; may be overridden below
+// next:  instruction issued after `insn`, or NULL if there is none
+void
+SchedDataCalculatorGM107::setDelay(Instruction *insn, int delay,
+                                   const Instruction *next)
+{
+   const OpClass opClass = targ->getOpClass(insn->op);
+
+   // Some instructions always get a fixed stall count.
+   if (insn->op == OP_EXIT || insn->op == OP_BAR || insn->op == OP_MEMBAR)
+      delay = GM107_MAX_ISSUE_DELAY;
+   else if (insn->op == OP_QUADON || insn->op == OP_QUADPOP ||
+            opClass == OPCLASS_FLOW || insn->join)
+      delay = 0xd;
+
+   if (next && targ->canDualIssue(insn, next))
+      delay = 0x0; // dual-issue
+   else
+      delay = CLAMP(delay, GM107_MIN_ISSUE_DELAY, GM107_MAX_ISSUE_DELAY);
+
+   const int wrBar = getWrDepBar(insn);
+   const int rdBar = getRdDepBar(insn);
+
+   if (delay == GM107_MIN_ISSUE_DELAY && (wrBar & rdBar) != 7) {
+      // Barriers take one additional clock cycle to become active on top of
+      // the clock consumed by the instruction producing it. The extra cycle
+      // is spent when there is no next instruction in the same basic block,
+      // or when that instruction waits on one of the barriers set here.
+      bool needExtraCycle = true;
+
+      if (next && insn->bb == next->bb) {
+         const int waitMask = getWtDepBar(next);
+         needExtraCycle = (waitMask & ((1 << wrBar) | (1 << rdBar))) != 0;
+      }
+      if (needExtraCycle)
+         delay = 0x2;
+   }
+
+   emitStall(insn, delay);
+}
+
+
+// Tell whether the given instruction has to emit a read dependency barrier
+// (against WaR hazards): it does not operate at a fixed latency, so even the
+// maximum number of stall counts is not enough.
+//
+// The answer is false when the target does not require a barrier for the
+// instruction, when it reads no GPR other than id 255, or when every GPR it
+// reads is also one it writes.
+bool
+SchedDataCalculatorGM107::needRdDepBar(const Instruction *insn) const
+{
+   BitSet readRegs(255, 1), writtenRegs(255, 1);
+
+   if (!targ->isBarrierRequired(insn))
+      return false;
+
+   // An instruction which takes no GPR as input (like st s[0x4] 0x0) has
+   // nothing to protect, so a read dependency barrier would be pointless.
+   for (int s = 0; insn->srcExists(s); ++s) {
+      const Value *src = insn->src(s).rep();
+      if (insn->src(s).getFile() != FILE_GPR)
+         continue;
+      if (src->reg.data.id == 255)
+         continue;
+
+      const int first = src->reg.data.id;
+      const int end = first + src->reg.size / 4;
+      for (int reg = first; reg < end; ++reg)
+         readRegs.set(reg);
+   }
+
+   if (!readRegs.popCount())
+      return false;
+
+   // When the output GPRs cover the input GPRs (like rcp $r0 $r0), a write
+   // dependency barrier will be produced anyway and already prevents WaR
+   // hazards, so only inputs which are not outputs count.
+   for (int d = 0; insn->defExists(d); ++d) {
+      const Value *def = insn->def(d).rep();
+      if (insn->def(d).getFile() != FILE_GPR)
+         continue;
+      if (def->reg.data.id == 255)
+         continue;
+
+      const int first = def->reg.data.id;
+      const int end = first + def->reg.size / 4;
+      for (int reg = first; reg < end; ++reg)
+         writtenRegs.set(reg);
+   }
+
+   readRegs.andNot(writtenRegs);
+   return readRegs.popCount() != 0;
+}
+
+// Tell whether the given instruction has to emit a write dependency barrier
+// (against RaW hazards): it does not operate at a fixed latency, so even the
+// maximum number of stall counts is not enough. This is only legal if the
+// instruction outputs something, i.e. has at least one definition in the GPR,
+// flags or predicate file.
+bool
+SchedDataCalculatorGM107::needWrDepBar(const Instruction *insn) const
+{
+   if (!targ->isBarrierRequired(insn))
+      return false;
+
+   for (int d = 0; insn->defExists(d); ++d) {
+      switch (insn->def(d).getFile()) {
+      case FILE_GPR:
+      case FILE_FLAGS:
+      case FILE_PREDICATE:
+         return true;
+      default:
+         break;
+      }
+   }
+   return false;
+}
+
+// Helper function for findFirstUse() and findFirstDef().
+//
+// Return true when one of the definitions of `insn` touches the storage of
+// `val`. Only GPRs, predicates and flags are considered: GPRs match when the
+// register ranges (one register per 4 bytes) overlap, predicates and flags
+// match when the ids are equal.
+bool
+SchedDataCalculatorGM107::doesInsnWriteTo(const Instruction *insn,
+                                          const Value *val) const
+{
+   const bool tracked = val->reg.file == FILE_GPR ||
+                        val->reg.file == FILE_PREDICATE ||
+                        val->reg.file == FILE_FLAGS;
+   if (!tracked)
+      return false;
+
+   for (int d = 0; insn->defExists(d); ++d) {
+      const Value *def = insn->getDef(d);
+      if (def->reg.file != val->reg.file)
+         continue;
+
+      const int defFirst = def->reg.data.id;
+
+      if (def->reg.file == FILE_GPR) {
+         const int defLast = defFirst + def->reg.size / 4 - 1;
+         const bool disjoint =
+            val->reg.data.id + val->reg.size / 4 - 1 < defFirst ||
+            val->reg.data.id > defLast;
+         if (!disjoint)
+            return true;
+      } else {
+         // Predicate or flags register: a plain id comparison is enough.
+         if (val->reg.data.id == defFirst)
+            return true;
+      }
+   }
+
+   return false;
+}
+
+// Walk the instructions following `bari` and return the first one which
+// reads or writes an output of `bari`, in order to avoid RaW and WaW hazards.
+// Return NULL when `bari` has no definition or no such instruction follows.
+Instruction *
+SchedDataCalculatorGM107::findFirstUse(const Instruction *bari) const
+{
+   if (!bari->defExists(0))
+      return NULL;
+
+   for (Instruction *cand = bari->next; cand != NULL; cand = cand->next) {
+      // RaW: the candidate reads something `bari` writes.
+      for (int s = 0; cand->srcExists(s); ++s) {
+         if (doesInsnWriteTo(bari, cand->getSrc(s)))
+            return cand;
+      }
+
+      // WaW: the candidate writes something `bari` writes.
+      for (int d = 0; cand->defExists(d); ++d) {
+         if (doesInsnWriteTo(bari, cand->getDef(d)))
+            return cand;
+      }
+   }
+   return NULL;
+}
+
+// Walk the instructions following `bari` and return the first one which
+// overwrites at least one source of `bari`, in order to avoid WaR hazards.
+// Return NULL when `bari` has no source or no such instruction follows.
+Instruction *
+SchedDataCalculatorGM107::findFirstDef(const Instruction *bari) const
+{
+   if (!bari->srcExists(0))
+      return NULL;
+
+   for (Instruction *cand = bari->next; cand != NULL; cand = cand->next) {
+      for (int s = 0; bari->srcExists(s); ++s) {
+         if (doesInsnWriteTo(cand, bari->getSrc(s)))
+            return cand;
+      }
+   }
+   return NULL;
+}
+
+// Dependency barriers:
+// This pass is a bit ugly and could probably be improved by performing a
+// better allocation.
+//
+// The main idea is to avoid WaR and RaW hazards by emitting read/write
+// dependency barriers using the control codes.
+//
+// The first walk over the basic block hands out one of six barriers to every
+// instruction which needs one and makes the first dependent instruction wait
+// on it. The second walk drops waits on barriers which are not pending at
+// that point. Always returns true.
+bool
+SchedDataCalculatorGM107::insertBarriers(BasicBlock *bb)
+{
+   std::list<LiveBarUse> pendingUses;
+   std::list<LiveBarDef> pendingDefs;
+   BitSet busyBars(6, 1);
+
+   for (Instruction *cur = bb->getEntry(); cur != NULL; cur = cur->next) {
+      Instruction *firstUse = NULL;
+
+      // Expire old barrier uses: once the first user of a write barrier is
+      // reached, make it wait on the barrier and release the barrier.
+      std::list<LiveBarUse>::iterator use = pendingUses.begin();
+      while (use != pendingUses.end()) {
+         if (cur->serial < use->usei->serial) {
+            ++use;
+            continue;
+         }
+         const int id = getWrDepBar(use->insn);
+         emitWtDepBar(cur, id);
+         busyBars.clr(id); // free barrier
+         use = pendingUses.erase(use);
+      }
+
+      // Expire old barrier defs: same as above for read barriers, triggered
+      // by the first instruction overwriting a protected source.
+      std::list<LiveBarDef>::iterator def = pendingDefs.begin();
+      while (def != pendingDefs.end()) {
+         if (cur->serial < def->defi->serial) {
+            ++def;
+            continue;
+         }
+         const int id = getRdDepBar(def->insn);
+         emitWtDepBar(cur, id);
+         busyBars.clr(id); // free barrier
+         def = pendingDefs.erase(def);
+      }
+
+      const bool wantWrBar = needWrDepBar(cur);
+      const bool wantRdBar = needRdDepBar(cur);
+
+      if (wantWrBar) {
+         // The instruction has to emit a write dependency barrier (it writes
+         // something at a variable latency): find the next instruction which
+         // reads its outputs (or writes to them, potentially completing
+         // before this instruction).
+         firstUse = findFirstUse(cur);
+
+         // Allocate and emit a new barrier; fall back to barrier 5 when none
+         // is free.
+         int id = busyBars.findFreeRange(1);
+         if (id == -1)
+            id = 5;
+         busyBars.set(id);
+         emitWrDepBar(cur, id);
+         if (firstUse)
+            pendingUses.push_back(LiveBarUse(cur, firstUse));
+      }
+
+      if (wantRdBar) {
+         // The instruction has to emit a read dependency barrier (it reads
+         // something at a variable latency): find the next instruction which
+         // will write its inputs.
+         Instruction *firstDef = findFirstDef(cur);
+
+         // No read barrier is emitted when the first user of the outputs
+         // does not come after the first instruction overwriting the inputs.
+         const bool coveredByWrBar =
+            firstUse && firstDef && firstUse->serial <= firstDef->serial;
+
+         if (!coveredByWrBar) {
+            // Allocate and emit a new barrier; fall back to barrier 5 when
+            // none is free.
+            int id = busyBars.findFreeRange(1);
+            if (id == -1)
+               id = 5;
+            busyBars.set(id);
+            emitRdDepBar(cur, id);
+            if (firstDef)
+               pendingDefs.push_back(LiveBarDef(cur, firstDef));
+         }
+      }
+   }
+
+   // Remove unnecessary barrier waits: a wait is only kept when the barrier
+   // has been set by an earlier instruction of this block and has not been
+   // waited on since.
+   BitSet armedBars(6, 1);
+   for (Instruction *cur = bb->getEntry(); cur != NULL; cur = cur->next) {
+      const int wrBar = getWrDepBar(cur);
+      const int rdBar = getRdDepBar(cur);
+      const int waitMask = getWtDepBar(cur);
+
+      for (int id = 0; id < 6; ++id) {
+         if (waitMask & (1 << id)) {
+            if (armedBars.test(id))
+               armedBars.clr(id);
+            else
+               cur->sched &= ~(1 << (11 + id));
+         }
+      }
+
+      // Barriers set by this instruction become pending for the following
+      // instructions; values of 6 and above do not name a barrier.
+      if (wrBar < 6)
+         armedBars.set(wrBar);
+      if (rdBar < 6)
+         armedBars.set(rdBar);
+   }
+
+   return true;
+}
+
+// Per-function setup: order the instructions of `func`, then size the list of
+// scoreboards after the function's CFG and wipe every one of them. Always
+// returns true.
+bool
+SchedDataCalculatorGM107::visit(Function *func)
+{
+   ArrayList ordered;
+   func->orderInstructions(ordered);
+
+   scoreBoards.resize(func->cfg.getSize());
+
+   size_t idx = 0;
+   while (idx < scoreBoards.size()) {
+      scoreBoards[idx].wipe();
+      ++idx;
+   }
+   return true;
+}
+