From f519c47f7d47d88ecf3b5e8f28fdffaa12f684d3 Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Fri, 16 Dec 2016 14:28:14 +0100
Subject: [PATCH] nv50/ir: improve instruction pipelining on gm107

This makes use of scheduling control codes which are very useful
for improving the instruction pipelining.

This patch will increase performance on Maxwell GPUs by, at least,
x1.5 up to x3.5 for some benchmarks.

Although this has been fairly well tested, I would not be suprised
if someone hit a corner case somewhere. That way, the scheduler
is enabled by default but it can be deactivated by using
NV50_PROG_SCHED=0.

Thanks to Scott Gray for the reverse engineering work available from
https://github.com/NervanaSystems/maxas/wiki/Control-Codes.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Acked-by: Pierre Moreau <pierre.morrow@free.fr>
Tested-by: Alexandre Courbot <acourbot@nvidia.com>
Tested-by: Jan Vesely <jan.vesely@rutgers.edu>
---
 .../nouveau/codegen/nv50_ir_emit_gm107.cpp    | 771 +++++++++++++++++-
 .../nouveau/codegen/nv50_ir_target_gm107.cpp  | 253 ++++++
 .../nouveau/codegen/nv50_ir_target_gm107.h    |   7 +
 3 files changed, 1027 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
index 5bd0fa0a075..f3ebd2b0be8 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
@@ -24,6 +24,8 @@
 
 #include "codegen/nv50_ir_target_gm107.h"
 
+//#define GM107_DEBUG_SCHED_DATA
+
 namespace nv50_ir {
 
 class CodeEmitterGM107 : public CodeEmitter
@@ -3374,20 +3376,781 @@ CodeEmitterGM107::getMinEncodingSize(const Instruction *i) const
 class SchedDataCalculatorGM107 : public Pass
 {
 public:
-   SchedDataCalculatorGM107(const Target *targ) : targ(targ) {}
+   SchedDataCalculatorGM107(const TargetGM107 *targ) : targ(targ) {}
+
 private:
-   const Target *targ;
-   bool visit(BasicBlock *bb);
+   struct RegScores
+   {
+      struct ScoreData {
+         int r[256];
+         int p[8];
+         int c;
+      } rd, wr;
+      int base;
+
+      void rebase(const int base)
+      {
+         const int delta = this->base - base;
+         if (!delta)
+            return;
+         this->base = 0;
+
+         for (int i = 0; i < 256; ++i) {
+            rd.r[i] += delta;
+            wr.r[i] += delta;
+         }
+         for (int i = 0; i < 8; ++i) {
+            rd.p[i] += delta;
+            wr.p[i] += delta;
+         }
+         rd.c += delta;
+         wr.c += delta;
+      }
+      void wipe()
+      {
+         memset(&rd, 0, sizeof(rd));
+         memset(&wr, 0, sizeof(wr));
+      }
+      int getLatest(const ScoreData& d) const
+      {
+         int max = 0;
+         for (int i = 0; i < 256; ++i)
+            if (d.r[i] > max)
+               max = d.r[i];
+         for (int i = 0; i < 8; ++i)
+            if (d.p[i] > max)
+               max = d.p[i];
+         if (d.c > max)
+            max = d.c;
+         return max;
+      }
+      inline int getLatestRd() const
+      {
+         return getLatest(rd);
+      }
+      inline int getLatestWr() const
+      {
+         return getLatest(wr);
+      }
+      inline int getLatest() const
+      {
+         return MAX2(getLatestRd(), getLatestWr());
+      }
+      void setMax(const RegScores *that)
+      {
+         for (int i = 0; i < 256; ++i) {
+            rd.r[i] = MAX2(rd.r[i], that->rd.r[i]);
+            wr.r[i] = MAX2(wr.r[i], that->wr.r[i]);
+         }
+         for (int i = 0; i < 8; ++i) {
+            rd.p[i] = MAX2(rd.p[i], that->rd.p[i]);
+            wr.p[i] = MAX2(wr.p[i], that->wr.p[i]);
+         }
+         rd.c = MAX2(rd.c, that->rd.c);
+         wr.c = MAX2(wr.c, that->wr.c);
+      }
+      void print(int cycle)
+      {
+         for (int i = 0; i < 256; ++i) {
+            if (rd.r[i] > cycle)
+               INFO("rd $r%i @ %i\n", i, rd.r[i]);
+            if (wr.r[i] > cycle)
+               INFO("wr $r%i @ %i\n", i, wr.r[i]);
+         }
+         for (int i = 0; i < 8; ++i) {
+            if (rd.p[i] > cycle)
+               INFO("rd $p%i @ %i\n", i, rd.p[i]);
+            if (wr.p[i] > cycle)
+               INFO("wr $p%i @ %i\n", i, wr.p[i]);
+         }
+         if (rd.c > cycle)
+            INFO("rd $c @ %i\n", rd.c);
+         if (wr.c > cycle)
+            INFO("wr $c @ %i\n", wr.c);
+      }
+   };
+
+   RegScores *score; // for current BB
+   std::vector<RegScores> scoreBoards;
+
+   const TargetGM107 *targ;
+   bool visit(Function *);
+   bool visit(BasicBlock *);
+
+   void commitInsn(const Instruction *, int);
+   int calcDelay(const Instruction *, int) const;
+   void setDelay(Instruction *, int, const Instruction *);
+   void recordWr(const Value *, int, int);
+   void checkRd(const Value *, int, int&) const;
+
+   inline void emitYield(Instruction *);
+   inline void emitStall(Instruction *, uint8_t);
+   inline void emitReuse(Instruction *, uint8_t);
+   inline void emitWrDepBar(Instruction *, uint8_t);
+   inline void emitRdDepBar(Instruction *, uint8_t);
+   inline void emitWtDepBar(Instruction *, uint8_t);
+
+   inline int getStall(const Instruction *) const;
+   inline int getWrDepBar(const Instruction *) const;
+   inline int getRdDepBar(const Instruction *) const;
+   inline int getWtDepBar(const Instruction *) const;
+
+   void setReuseFlag(Instruction *);
+
+   inline void printSchedInfo(int, const Instruction *) const;
+
+   struct LiveBarUse {
+      LiveBarUse(Instruction *insn, Instruction *usei)
+         : insn(insn), usei(usei) { }
+      Instruction *insn;
+      Instruction *usei;
+   };
+
+   struct LiveBarDef {
+      LiveBarDef(Instruction *insn, Instruction *defi)
+         : insn(insn), defi(defi) { }
+      Instruction *insn;
+      Instruction *defi;
+   };
+
+   bool insertBarriers(BasicBlock *);
+
+   Instruction *findFirstUse(const Instruction *) const;
+   Instruction *findFirstDef(const Instruction *) const;
+
+   bool needRdDepBar(const Instruction *) const;
+   bool needWrDepBar(const Instruction *) const;
 };
 
+inline void
+SchedDataCalculatorGM107::emitStall(Instruction *insn, uint8_t cnt)
+{
+   assert(cnt < 16);
+   insn->sched |= cnt;
+}
+
+inline void
+SchedDataCalculatorGM107::emitYield(Instruction *insn)
+{
+   insn->sched |= 1 << 4;
+}
+
+inline void
+SchedDataCalculatorGM107::emitWrDepBar(Instruction *insn, uint8_t id)
+{
+   assert(id < 6);
+   if ((insn->sched & 0xe0) == 0xe0)
+      insn->sched ^= 0xe0;
+   insn->sched |= id << 5;
+}
+
+inline void
+SchedDataCalculatorGM107::emitRdDepBar(Instruction *insn, uint8_t id)
+{
+   assert(id < 6);
+   if ((insn->sched & 0x700) == 0x700)
+      insn->sched ^= 0x700;
+   insn->sched |= id << 8;
+}
+
+inline void
+SchedDataCalculatorGM107::emitWtDepBar(Instruction *insn, uint8_t id)
+{
+   assert(id < 6);
+   insn->sched |= 1 << (11 + id);
+}
+
+inline void
+SchedDataCalculatorGM107::emitReuse(Instruction *insn, uint8_t id)
+{
+   assert(id < 4);
+   insn->sched |= 1 << (17 + id);
+}
+
+inline void
+SchedDataCalculatorGM107::printSchedInfo(int cycle,
+                                         const Instruction *insn) const
+{
+   uint8_t st, yl, wr, rd, wt, ru;
+
+   st = (insn->sched & 0x00000f) >> 0;
+   yl = (insn->sched & 0x000010) >> 4;
+   wr = (insn->sched & 0x0000e0) >> 5;
+   rd = (insn->sched & 0x000700) >> 8;
+   wt = (insn->sched & 0x01f800) >> 11;
+   ru = (insn->sched & 0x1e0000) >> 17;
+
+   INFO("cycle %i, (st 0x%x, yl 0x%x, wr 0x%x, rd 0x%x, wt 0x%x, ru 0x%x)\n",
+        cycle, st, yl, wr, rd, wt, ru);
+}
+
+inline int
+SchedDataCalculatorGM107::getStall(const Instruction *insn) const
+{
+   return insn->sched & 0xf;
+}
+
+inline int
+SchedDataCalculatorGM107::getWrDepBar(const Instruction *insn) const
+{
+   return (insn->sched & 0x0000e0) >> 5;
+}
+
+inline int
+SchedDataCalculatorGM107::getRdDepBar(const Instruction *insn) const
+{
+   return (insn->sched & 0x000700) >> 8;
+}
+
+inline int
+SchedDataCalculatorGM107::getWtDepBar(const Instruction *insn) const
+{
+   return (insn->sched & 0x01f800) >> 11;
+}
+
+// Emit the reuse flag which allows to make use of the new memory hierarchy
+// introduced since Maxwell, the operand reuse cache.
+//
+// It allows to reduce bank conflicts by caching operands. Each time you issue
+// an instruction, that flag can tell the hw which operands are going to be
+// re-used by the next instruction. Note that the next instruction has to use
+// the same GPR id in the same operand slot.
+void
+SchedDataCalculatorGM107::setReuseFlag(Instruction *insn)
+{
+   Instruction *next = insn->next;
+   BitSet defs(255, 1);
+
+   if (!targ->isReuseSupported(insn))
+      return;
+
+   for (int d = 0; insn->defExists(d); ++d) {
+      const Value *def = insn->def(d).rep();
+      if (insn->def(d).getFile() != FILE_GPR)
+         continue;
+      if (typeSizeof(insn->dType) != 4 || def->reg.data.id == 255)
+         continue;
+      defs.set(def->reg.data.id);
+   }
+
+   for (int s = 0; insn->srcExists(s); s++) {
+      const Value *src = insn->src(s).rep();
+      if (insn->src(s).getFile() != FILE_GPR)
+         continue;
+      if (typeSizeof(insn->sType) != 4 || src->reg.data.id == 255)
+         continue;
+      if (defs.test(src->reg.data.id))
+         continue;
+      if (!next->srcExists(s) || next->src(s).getFile() != FILE_GPR)
+         continue;
+      if (src->reg.data.id != next->getSrc(s)->reg.data.id)
+         continue;
+      assert(s < 4);
+      emitReuse(insn, s);
+   }
+}
+
+void
+SchedDataCalculatorGM107::recordWr(const Value *v, int cycle, int ready)
+{
+   int a = v->reg.data.id, b;
+
+   switch (v->reg.file) {
+   case FILE_GPR:
+      b = a + v->reg.size / 4;
+      for (int r = a; r < b; ++r)
+         score->rd.r[r] = ready;
+      break;
+   case FILE_PREDICATE:
+      // To immediately use a predicate set by any instructions, the minimum
+      // number of stall counts is 13.
+      score->rd.p[a] = cycle + 13;
+      break;
+   case FILE_FLAGS:
+      score->rd.c = ready;
+      break;
+   default:
+      break;
+   }
+}
+
+void
+SchedDataCalculatorGM107::checkRd(const Value *v, int cycle, int &delay) const
+{
+   int a = v->reg.data.id, b;
+   int ready = cycle;
+
+   switch (v->reg.file) {
+   case FILE_GPR:
+      b = a + v->reg.size / 4;
+      for (int r = a; r < b; ++r)
+         ready = MAX2(ready, score->rd.r[r]);
+      break;
+   case FILE_PREDICATE:
+      ready = MAX2(ready, score->rd.p[a]);
+      break;
+   case FILE_FLAGS:
+      ready = MAX2(ready, score->rd.c);
+      break;
+   default:
+      break;
+   }
+   if (cycle < ready)
+      delay = MAX2(delay, ready - cycle);
+}
+
+void
+SchedDataCalculatorGM107::commitInsn(const Instruction *insn, int cycle)
+{
+   const int ready = cycle + targ->getLatency(insn);
+
+   for (int d = 0; insn->defExists(d); ++d)
+      recordWr(insn->getDef(d), cycle, ready);
+
+#ifdef GM107_DEBUG_SCHED_DATA
+   score->print(cycle);
+#endif
+}
+
+#define GM107_MIN_ISSUE_DELAY 0x1
+#define GM107_MAX_ISSUE_DELAY 0xf
+
+int
+SchedDataCalculatorGM107::calcDelay(const Instruction *insn, int cycle) const
+{
+   int delay = 0, ready = cycle;
+
+   for (int s = 0; insn->srcExists(s); ++s)
+      checkRd(insn->getSrc(s), cycle, delay);
+
+   // TODO: make use of getReadLatency()!
+
+   return MAX2(delay, ready - cycle);
+}
+
+void
+SchedDataCalculatorGM107::setDelay(Instruction *insn, int delay,
+                                   const Instruction *next)
+{
+   const OpClass cl = targ->getOpClass(insn->op);
+   int wr, rd;
+
+   if (insn->op == OP_EXIT ||
+       insn->op == OP_BAR ||
+       insn->op == OP_MEMBAR) {
+      delay = GM107_MAX_ISSUE_DELAY;
+   } else
+   if (insn->op == OP_QUADON ||
+       insn->op == OP_QUADPOP) {
+      delay = 0xd;
+   } else
+   if (cl == OPCLASS_FLOW || insn->join) {
+      delay = 0xd;
+   }
+
+   if (!next || !targ->canDualIssue(insn, next)) {
+      delay = CLAMP(delay, GM107_MIN_ISSUE_DELAY, GM107_MAX_ISSUE_DELAY);
+   } else {
+      delay = 0x0; // dual-issue
+   }
+
+   wr = getWrDepBar(insn);
+   rd = getRdDepBar(insn);
+
+   if (delay == GM107_MIN_ISSUE_DELAY && (wr & rd) != 7) {
+      // Barriers take one additional clock cycle to become active on top of
+      // the clock consumed by the instruction producing it.
+      if (!next || insn->bb != next->bb) {
+         delay = 0x2;
+      } else {
+         int wt = getWtDepBar(next);
+         if ((wt & (1 << wr)) | (wt & (1 << rd)))
+            delay = 0x2;
+      }
+   }
+
+   emitStall(insn, delay);
+}
+
+
+// Return true when the given instruction needs to emit a read dependency
+// barrier (for WaR hazards) because it doesn't operate at a fixed latency, and
+// setting the maximum number of stall counts is not enough.
+bool
+SchedDataCalculatorGM107::needRdDepBar(const Instruction *insn) const
+{
+   BitSet srcs(255, 1), defs(255, 1);
+   int a, b;
+
+   if (!targ->isBarrierRequired(insn))
+      return false;
+
+   // Do not emit a read dependency barrier when the instruction doesn't use
+   // any GPR (like st s[0x4] 0x0) as input because it's unnecessary.
+   for (int s = 0; insn->srcExists(s); ++s) {
+      const Value *src = insn->src(s).rep();
+      if (insn->src(s).getFile() != FILE_GPR)
+         continue;
+      if (src->reg.data.id == 255)
+         continue;
+
+      a = src->reg.data.id;
+      b = a + src->reg.size / 4;
+      for (int r = a; r < b; ++r)
+         srcs.set(r);
+   }
+
+   if (!srcs.popCount())
+      return false;
+
+   // Do not emit a read dependency barrier when the output GPRs are equal to
+   // the input GPRs (like rcp $r0 $r0) because a write dependency barrier will
+   // be produced and WaR hazards are prevented.
+   for (int d = 0; insn->defExists(d); ++d) {
+      const Value *def = insn->def(d).rep();
+      if (insn->def(d).getFile() != FILE_GPR)
+         continue;
+      if (def->reg.data.id == 255)
+         continue;
+
+      a = def->reg.data.id;
+      b = a + def->reg.size / 4;
+      for (int r = a; r < b; ++r)
+         defs.set(r);
+   }
+
+   srcs.andNot(defs);
+   if (!srcs.popCount())
+      return false;
+
+   return true;
+}
+
+// Return true when the given instruction needs to emit a write dependency
+// barrier (for RaW hazards) because it doesn't operate at a fixed latency, and
+// setting the maximum number of stall counts is not enough. This is only legal
+// if the instruction output something.
+bool
+SchedDataCalculatorGM107::needWrDepBar(const Instruction *insn) const
+{
+   if (!targ->isBarrierRequired(insn))
+      return false;
+
+   for (int d = 0; insn->defExists(d); ++d) {
+      if (insn->def(d).getFile() == FILE_GPR ||
+          insn->def(d).getFile() == FILE_PREDICATE)
+         return true;
+   }
+   return false;
+}
+
+// Find the next instruction inside the same basic block which uses the output
+// of the given instruction in order to avoid RaW hazards.
+Instruction *
+SchedDataCalculatorGM107::findFirstUse(const Instruction *bari) const
+{
+   Instruction *insn, *next;
+   int minGPR, maxGPR;
+
+   if (!bari->defExists(0))
+      return NULL;
+
+   minGPR = bari->def(0).rep()->reg.data.id;
+   maxGPR = minGPR + bari->def(0).rep()->reg.size / 4 - 1;
+
+   for (insn = bari->next; insn != NULL; insn = next) {
+      next = insn->next;
+
+      for (int s = 0; insn->srcExists(s); ++s) {
+         const Value *src = insn->src(s).rep();
+         if (bari->def(0).getFile() == FILE_GPR) {
+            if (insn->src(s).getFile() != FILE_GPR ||
+                src->reg.data.id + src->reg.size / 4 - 1 < minGPR ||
+                src->reg.data.id > maxGPR)
+               continue;
+            return insn;
+         } else
+         if (bari->def(0).getFile() == FILE_PREDICATE) {
+            if (insn->src(s).getFile() != FILE_PREDICATE ||
+                src->reg.data.id != minGPR)
+               continue;
+            return insn;
+         }
+      }
+   }
+   return NULL;
+}
+
+// Find the next instruction inside the same basic block which overwrites, at
+// least, one source of the given instruction in order to avoid WaR hazards.
+Instruction *
+SchedDataCalculatorGM107::findFirstDef(const Instruction *bari) const
+{
+   Instruction *insn, *next;
+   int minGPR, maxGPR;
+
+   for (insn = bari->next; insn != NULL; insn = next) {
+      next = insn->next;
+
+      for (int d = 0; insn->defExists(d); ++d) {
+         const Value *def = insn->def(d).rep();
+         if (insn->def(d).getFile() != FILE_GPR)
+            continue;
+
+         minGPR = def->reg.data.id;
+         maxGPR = minGPR + def->reg.size / 4 - 1;
+
+         for (int s = 0; bari->srcExists(s); ++s) {
+            const Value *src = bari->src(s).rep();
+            if (bari->src(s).getFile() != FILE_GPR ||
+                src->reg.data.id + src->reg.size / 4 - 1 < minGPR ||
+                src->reg.data.id > maxGPR)
+               continue;
+            return insn;
+         }
+      }
+   }
+   return NULL;
+}
+
+// Dependency barriers:
+// This pass is a bit ugly and could probably be improved by performing a
+// better allocation.
+//
+// The main idea is to avoid WaR and RaW hazards by emitting read/write
+// dependency barriers using the control codes.
+bool
+SchedDataCalculatorGM107::insertBarriers(BasicBlock *bb)
+{
+   std::list<LiveBarUse> live_uses;
+   std::list<LiveBarDef> live_defs;
+   Instruction *insn, *next;
+   BitSet bars(6, 1);
+   int bar_id;
+
+   for (insn = bb->getEntry(); insn != NULL; insn = next) {
+      Instruction *usei = NULL, *defi = NULL;
+      bool need_wr_bar, need_rd_bar;
+
+      next = insn->next;
+
+      // Expire old barrier uses.
+      for (std::list<LiveBarUse>::iterator it = live_uses.begin();
+           it != live_uses.end();) {
+         if (insn->serial >= it->usei->serial) {
+            int wr = getWrDepBar(it->insn);
+            emitWtDepBar(insn, wr);
+            bars.clr(wr); // free barrier
+            it = live_uses.erase(it);
+            continue;
+         }
+         ++it;
+      }
+
+      // Expire old barrier defs.
+      for (std::list<LiveBarDef>::iterator it = live_defs.begin();
+           it != live_defs.end();) {
+         if (insn->serial >= it->defi->serial) {
+            int rd = getRdDepBar(it->insn);
+            emitWtDepBar(insn, rd);
+            bars.clr(rd); // free barrier
+            it = live_defs.erase(it);
+            continue;
+         }
+         ++it;
+      }
+
+      need_wr_bar = needWrDepBar(insn);
+      need_rd_bar = needRdDepBar(insn);
+
+      if (need_wr_bar) {
+         // When the instruction requires to emit a write dependency barrier
+         // (all which write something at a variable latency), find the next
+         // instruction which reads the outputs.
+         usei = findFirstUse(insn);
+
+         // Allocate and emit a new barrier.
+         bar_id = bars.findFreeRange(1);
+         if (bar_id == -1)
+            bar_id = 5;
+         bars.set(bar_id);
+         emitWrDepBar(insn, bar_id);
+         if (usei)
+            live_uses.push_back(LiveBarUse(insn, usei));
+      }
+
+      if (need_rd_bar) {
+         // When the instruction requires to emit a read dependency barrier
+         // (all which read something at a variable latency), find the next
+         // instruction which will write the inputs.
+         defi = findFirstDef(insn);
+
+         if (usei && defi && usei->serial <= defi->serial)
+            continue;
+
+         // Allocate and emit a new barrier.
+         bar_id = bars.findFreeRange(1);
+         if (bar_id == -1)
+            bar_id = 5;
+         bars.set(bar_id);
+         emitRdDepBar(insn, bar_id);
+         if (defi)
+            live_defs.push_back(LiveBarDef(insn, defi));
+      }
+   }
+
+   // Remove unnecessary barrier waits.
+   BitSet alive_bars(6, 1);
+   for (insn = bb->getEntry(); insn != NULL; insn = next) {
+      int wr, rd, wt;
+
+      next = insn->next;
+
+      wr = getWrDepBar(insn);
+      rd = getRdDepBar(insn);
+      wt = getWtDepBar(insn);
+
+      for (int idx = 0; idx < 6; ++idx) {
+         if (!(wt & (1 << idx)))
+            continue;
+         if (!alive_bars.test(idx)) {
+            insn->sched &= ~(1 << (11  + idx));
+         } else {
+            alive_bars.clr(idx);
+         }
+      }
+
+      if (wr < 6)
+         alive_bars.set(wr);
+      if (rd < 6)
+         alive_bars.set(rd);
+   }
+
+   return true;
+}
+
+bool
+SchedDataCalculatorGM107::visit(Function *func)
+{
+   ArrayList insns;
+
+   func->orderInstructions(insns);
+
+   scoreBoards.resize(func->cfg.getSize());
+   for (size_t i = 0; i < scoreBoards.size(); ++i)
+      scoreBoards[i].wipe();
+   return true;
+}
+
 bool
 SchedDataCalculatorGM107::visit(BasicBlock *bb)
 {
+   Instruction *insn, *next = NULL;
+   int cycle = 0;
+
    for (Instruction *insn = bb->getEntry(); insn; insn = insn->next) {
       /*XXX*/
       insn->sched = 0x7e0;
    }
 
+   if (!debug_get_bool_option("NV50_PROG_SCHED", true))
+      return true;
+
+   // Insert read/write dependency barriers for instructions which don't
+   // operate at a fixed latency.
+   insertBarriers(bb);
+
+   score = &scoreBoards.at(bb->getId());
+
+   for (Graph::EdgeIterator ei = bb->cfg.incident(); !ei.end(); ei.next()) {
+      // back branches will wait until all target dependencies are satisfied
+      if (ei.getType() == Graph::Edge::BACK) // sched would be uninitialized
+         continue;
+      BasicBlock *in = BasicBlock::get(ei.getNode());
+      score->setMax(&scoreBoards.at(in->getId()));
+   }
+
+#ifdef GM107_DEBUG_SCHED_DATA
+   INFO("=== BB:%i initial scores\n", bb->getId());
+   score->print(cycle);
+#endif
+
+   // Because barriers are allocated locally (intra-BB), we have to make sure
+   // that all produced barriers have been consumed before entering inside a
+   // new basic block. The best way is to do a global allocation pre RA but
+   // it's really more difficult, especially because of the phi nodes. Anyways,
+   // it seems like that waiting on a barrier which has already been consumed
+   // doesn't add any additional cost, it's just not elegant!
+   Instruction *start = bb->getEntry();
+   if (start && bb->cfg.incidentCount() > 0) {
+      for (int b = 0; b < 6; b++)
+         emitWtDepBar(start, b);
+   }
+
+   for (insn = bb->getEntry(); insn && insn->next; insn = insn->next) {
+      next = insn->next;
+
+      commitInsn(insn, cycle);
+      int delay = calcDelay(next, cycle);
+      setDelay(insn, delay, next);
+      cycle += getStall(insn);
+
+      setReuseFlag(insn);
+
+      // XXX: The yield flag seems to destroy a bunch of things when it is
+      // set on every instruction, need investigation.
+      //emitYield(insn);
+
+#ifdef GM107_DEBUG_SCHED_DATA
+      printSchedInfo(cycle, insn);
+      insn->print();
+      next->print();
+#endif
+   }
+
+   if (!insn)
+      return true;
+   commitInsn(insn, cycle);
+
+   int bbDelay = -1;
+
+#ifdef GM107_DEBUG_SCHED_DATA
+   fprintf(stderr, "last instruction is : ");
+   insn->print();
+   fprintf(stderr, "cycle=%d\n", cycle);
+#endif
+
+   for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) {
+      BasicBlock *out = BasicBlock::get(ei.getNode());
+
+      if (ei.getType() != Graph::Edge::BACK) {
+         // Only test the first instruction of the outgoing block.
+         next = out->getEntry();
+         if (next) {
+            bbDelay = MAX2(bbDelay, calcDelay(next, cycle));
+         } else {
+            // When the outgoing BB is empty, make sure to set the number of
+            // stall counts needed by the instruction because we don't know the
+            // next instruction.
+            bbDelay = MAX2(bbDelay, targ->getLatency(insn));
+         }
+      } else {
+         // Wait until all dependencies are satisfied.
+         const int regsFree = score->getLatest();
+         next = out->getFirst();
+         for (int c = cycle; next && c < regsFree; next = next->next) {
+            bbDelay = MAX2(bbDelay, calcDelay(next, c));
+            c += getStall(next);
+         }
+         next = NULL;
+      }
+   }
+   if (bb->cfg.outgoingCount() != 1)
+      next = NULL;
+   setDelay(insn, bbDelay, next);
+   cycle += getStall(insn);
+
+   score->rebase(cycle); // common base for initializing out blocks' scores
    return true;
 }
 
@@ -3398,7 +4161,7 @@ SchedDataCalculatorGM107::visit(BasicBlock *bb)
 void
 CodeEmitterGM107::prepareEmission(Function *func)
 {
-   SchedDataCalculatorGM107 sched(targ);
+   SchedDataCalculatorGM107 sched(targGM107);
    CodeEmitter::prepareEmission(func);
    sched.run(func, true, true);
 }
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp
index 6b8f767a3c0..04cbd402a18 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp
@@ -68,6 +68,259 @@ TargetGM107::isOpSupported(operation op, DataType ty) const
    return true;
 }
 
+// Return true when an instruction supports the reuse flag. When supported, the
+// hardware will use the operand reuse cache introduced since Maxwell, which
+// should try to reduce bank conflicts by caching values for the subsequent
+// instructions. Note that the next instructions have to use the same GPR id in
+// the same operand slot.
+bool
+TargetGM107::isReuseSupported(const Instruction *insn) const
+{
+   const OpClass cl = getOpClass(insn->op);
+
+   // TODO: double-check!
+   switch (cl) {
+   case OPCLASS_ARITH:
+   case OPCLASS_COMPARE:
+   case OPCLASS_LOGIC:
+   case OPCLASS_MOVE:
+   case OPCLASS_SHIFT:
+      return true;
+   case OPCLASS_BITFIELD:
+      if (insn->op == OP_INSBF || insn->op == OP_EXTBF)
+         return true;
+      break;
+   default:
+      break;
+   }
+   return false;
+}
+
+// Return true when an instruction requires to set up a barrier because it
+// doesn't operate at a fixed latency. Variable latency instructions are memory
+// operations, double precision operations, special function unit operations
+// and other low throughput instructions.
+bool
+TargetGM107::isBarrierRequired(const Instruction *insn) const
+{
+   const OpClass cl = getOpClass(insn->op);
+
+   if (insn->dType == TYPE_F64 || insn->sType == TYPE_F64)
+      return true;
+
+   switch (cl) {
+   case OPCLASS_ATOMIC:
+   case OPCLASS_LOAD:
+   case OPCLASS_STORE:
+   case OPCLASS_SURFACE:
+   case OPCLASS_TEXTURE:
+      return true;
+   case OPCLASS_SFU:
+      switch (insn->op) {
+      case OP_COS:
+      case OP_EX2:
+      case OP_LG2:
+      case OP_LINTERP:
+      case OP_PINTERP:
+      case OP_RCP:
+      case OP_RSQ:
+      case OP_SIN:
+         return true;
+      default:
+         break;
+      }
+      break;
+   case OPCLASS_BITFIELD:
+      switch (insn->op) {
+      case OP_BFIND:
+      case OP_POPCNT:
+         return true;
+      default:
+         break;
+      }
+      break;
+   case OPCLASS_CONTROL:
+      switch (insn->op) {
+      case OP_EMIT:
+      case OP_RESTART:
+         return true;
+      default:
+         break;
+      }
+      break;
+   case OPCLASS_OTHER:
+      switch (insn->op) {
+      case OP_AFETCH:
+      case OP_PFETCH:
+      case OP_PIXLD:
+      case OP_RDSV:
+      case OP_SHFL:
+         return true;
+      default:
+         break;
+      }
+      break;
+   case OPCLASS_ARITH:
+      // TODO: IMUL/IMAD require barriers too, use of XMAD instead!
+      if ((insn->op == OP_MUL || insn->op == OP_MAD) &&
+          !isFloatType(insn->dType))
+         return true;
+      break;
+   case OPCLASS_CONVERT:
+      if (insn->def(0).getFile() != FILE_PREDICATE &&
+          insn->src(0).getFile() != FILE_PREDICATE)
+         return true;
+      break;
+   default:
+      break;
+   }
+   return false;
+}
+
+bool
+TargetGM107::canDualIssue(const Instruction *a, const Instruction *b) const
+{
+   // TODO
+   return false;
+}
+
+// Return the number of stall counts needed to complete a single instruction.
+// On Maxwell GPUs, the pipeline depth is 6, but some instructions require
+// different number of stall counts like memory operations.
+int
+TargetGM107::getLatency(const Instruction *insn) const
+{
+   // TODO: better values! This should be good enough for now though.
+   switch (insn->op) {
+   case OP_EMIT:
+   case OP_EXPORT:
+   case OP_PIXLD:
+   case OP_RESTART:
+   case OP_STORE:
+   case OP_SUSTB:
+   case OP_SUSTP:
+      return 1;
+   case OP_SHFL:
+      return 2;
+   case OP_ADD:
+   case OP_AND:
+   case OP_EXTBF:
+   case OP_FMA:
+   case OP_INSBF:
+   case OP_MAD:
+   case OP_MAX:
+   case OP_MIN:
+   case OP_MOV:
+   case OP_MUL:
+   case OP_NOT:
+   case OP_OR:
+   case OP_PREEX2:
+   case OP_PRESIN:
+   case OP_QUADOP:
+   case OP_SELP:
+   case OP_SET:
+   case OP_SET_AND:
+   case OP_SET_OR:
+   case OP_SET_XOR:
+   case OP_SHL:
+   case OP_SHLADD:
+   case OP_SHR:
+   case OP_SLCT:
+   case OP_SUB:
+   case OP_VOTE:
+   case OP_XOR:
+      if (insn->dType != TYPE_F64)
+         return 6;
+      break;
+   case OP_ABS:
+   case OP_CEIL:
+   case OP_CVT:
+   case OP_FLOOR:
+   case OP_NEG:
+   case OP_SAT:
+   case OP_TRUNC:
+      if (insn->op == OP_CVT && (insn->def(0).getFile() == FILE_PREDICATE ||
+                                 insn->src(0).getFile() == FILE_PREDICATE))
+         return 6;
+      break;
+   case OP_BFIND:
+   case OP_COS:
+   case OP_EX2:
+   case OP_LG2:
+   case OP_POPCNT:
+   case OP_QUADON:
+   case OP_QUADPOP:
+   case OP_RCP:
+   case OP_RSQ:
+   case OP_SIN:
+      return 13;
+   default:
+      break;
+   }
+   // Use the maximum number of stall counts for other instructions.
+   return 15;
+}
+
+// Return the operand read latency which is the number of stall counts before
+// an instruction can read its sources. For memory operations like ATOM, LOAD
+// and STORE, the memory access has to be indirect.
+int
+TargetGM107::getReadLatency(const Instruction *insn) const
+{
+   switch (insn->op) {
+   case OP_ABS:
+   case OP_BFIND:
+   case OP_CEIL:
+   case OP_COS:
+   case OP_EX2:
+   case OP_FLOOR:
+   case OP_LG2:
+   case OP_NEG:
+   case OP_POPCNT:
+   case OP_RCP:
+   case OP_RSQ:
+   case OP_SAT:
+   case OP_SIN:
+   case OP_SULDB:
+   case OP_SULDP:
+   case OP_SUREDB:
+   case OP_SUREDP:
+   case OP_SUSTB:
+   case OP_SUSTP:
+   case OP_TRUNC:
+      return 4;
+   case OP_CVT:
+      if (insn->def(0).getFile() != FILE_PREDICATE &&
+          insn->src(0).getFile() != FILE_PREDICATE)
+         return 4;
+      break;
+   case OP_ATOM:
+   case OP_LOAD:
+   case OP_STORE:
+      if (insn->src(0).isIndirect(0)) {
+         switch (insn->src(0).getFile()) {
+         case FILE_MEMORY_SHARED:
+         case FILE_MEMORY_CONST:
+            return 2;
+         case FILE_MEMORY_GLOBAL:
+         case FILE_MEMORY_LOCAL:
+            return 4;
+         default:
+            break;
+         }
+      }
+      break;
+   case OP_EXPORT:
+   case OP_PFETCH:
+   case OP_SHFL:
+   case OP_VFETCH:
+      return 2;
+   default:
+      break;
+   }
+   return 0;
+}
+
 bool
 TargetGM107::runLegalizePass(Program *prog, CGStage stage) const
 {
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.h
index 5d606378953..dd4aa6a54d0 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.h
@@ -16,6 +16,13 @@ public:
    virtual uint32_t getBuiltinOffset(int) const;
 
    virtual bool isOpSupported(operation, DataType) const;
+   virtual bool isReuseSupported(const Instruction *) const;
+
+   virtual bool isBarrierRequired(const Instruction *) const;
+
+   virtual bool canDualIssue(const Instruction *, const Instruction *) const;
+   virtual int getLatency(const Instruction *) const;
+   virtual int getReadLatency(const Instruction *) const;
 };
 
 } // namespace nv50_ir
-- 
2.30.2