aco: improve waitcnt insertion around loops

author Rhys Perry <pendingchaos02@gmail.com>

Mon, 21 Oct 2019 20:36:41 +0000 (21:36 +0100)

committer Rhys Perry <pendingchaos02@gmail.com>

Thu, 21 Nov 2019 20:28:57 +0000 (20:28 +0000)
author Rhys Perry <pendingchaos02@gmail.com>
Mon, 21 Oct 2019 20:36:41 +0000 (21:36 +0100)
committer Rhys Perry <pendingchaos02@gmail.com>
Thu, 21 Nov 2019 20:28:57 +0000 (20:28 +0000)
diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp

index 1f4733ed356cecdb9659a204d9624de3981efb9b..5d65ae253ffc373eff2704f5bb42b6821f168032 100644 (file)
--- a/src/amd/compiler/aco_insert_waitcnt.cpp
+++ b/src/amd/compiler/aco_insert_waitcnt.cpp
@@ -24,6 +24,7 @@
  
  #include <algorithm>
  #include <map>
+#include <stack>
  
  #include "aco_ir.h"
  #include "vulkan/radv_shader.h"
@@ -34,8 +35,9 @@ namespace {
  
  /**
   * The general idea of this pass is:
- * The CFG is traversed in reverse postorder (forward).
- * Per BB one wait_ctx is maintained.
+ * The CFG is traversed in reverse postorder (forward) and loops are processed
+ * several times until no progress is made.
+ * Per BB two wait_ctx is maintained: an in-context and out-context.
   * The in-context is the joined out-contexts of the predecessors.
   * The context contains a map: gpr -> wait_entry
   * consisting of the information about the cnt values to be waited for.
@@ -114,6 +116,19 @@ struct wait_imm {
     wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_) :
        vm(vm_), exp(exp_), lgkm(lgkm_), vs(vs_) {}
  
+   wait_imm(enum chip_class chip, uint16_t packed) : vs(unset_counter)
+   {
+      vm = packed & 0xf;
+      if (chip >= GFX9)
+         vm |= (packed >> 10) & 0x30;
+
+      exp = (packed >> 4) & 0x7;
+
+      lgkm = (packed >> 8) & 0xf;
+      if (chip >= GFX10)
+         lgkm |= (packed >> 8) & 0x30;
+   }
+
     uint16_t pack(enum chip_class chip) const
     {
        uint16_t imm = 0;
@@ -142,12 +157,14 @@ struct wait_imm {
        return imm;
     }
  
-   void combine(const wait_imm& other)
+   bool combine(const wait_imm& other)
     {
+      bool changed = other.vm < vm || other.exp < exp || other.lgkm < lgkm || other.vs < vs;
        vm = std::min(vm, other.vm);
        exp = std::min(exp, other.exp);
        lgkm = std::min(lgkm, other.lgkm);
        vs = std::min(vs, other.vs);
+      return changed;
     }
  
     bool empty() const
@@ -168,13 +185,17 @@ struct wait_entry {
             : imm(imm), events(event), counters(get_counters_for_event(event)),
               wait_on_read(wait_on_read), logical(logical) {}
  
-   void join(const wait_entry& other)
+   bool join(const wait_entry& other)
     {
+      bool changed = (other.events & ~events) ||
+                     (other.counters & ~counters) ||
+                     (other.wait_on_read && !wait_on_read);
        events |= other.events;
        counters |= other.counters;
-      imm.combine(other.imm);
+      changed |= imm.combine(other.imm);
        wait_on_read = wait_on_read || other.wait_on_read;
        assert(logical == other.logical);
+      return changed;
     }
  
     void remove_counter(counter_type counter)
@@ -237,8 +258,15 @@ struct wait_ctx {
               max_vs_cnt(program_->chip_class >= GFX10 ? 62 : 0),
               unordered_events(event_smem | (program_->chip_class < GFX10 ? event_flat : 0)) {}
  
-   void join(const wait_ctx* other, bool logical)
+   bool join(const wait_ctx* other, bool logical)
     {
+      bool changed = other->exp_cnt > exp_cnt ||
+                     other->vm_cnt > vm_cnt ||
+                     other->lgkm_cnt > lgkm_cnt ||
+                     other->vs_cnt > vs_cnt ||
+                     (other->pending_flat_lgkm && !pending_flat_lgkm) ||
+                     (other->pending_flat_vm && !pending_flat_vm);
+
        exp_cnt = std::max(exp_cnt, other->exp_cnt);
        vm_cnt = std::max(vm_cnt, other->vm_cnt);
        lgkm_cnt = std::max(lgkm_cnt, other->lgkm_cnt);
@@ -253,14 +281,18 @@ struct wait_ctx {
           if (entry.second.logical != logical)
              continue;
  
-         if (it != gpr_map.end())
-            it->second.join(entry.second);
-         else
+         if (it != gpr_map.end()) {
+            changed |= it->second.join(entry.second);
+         } else {
              gpr_map.insert(entry);
+            changed = true;
+         }
        }
  
        for (unsigned i = 0; i < barrier_count; i++)
-         barrier_imm[i].combine(other->barrier_imm[i]);
+         changed |= barrier_imm[i].combine(other->barrier_imm[i]);
+
+      return changed;
     }
  };
  
@@ -319,12 +351,27 @@ wait_imm check_instr(Instruction* instr, wait_ctx& ctx)
     return wait;
  }
  
+wait_imm parse_wait_instr(wait_ctx& ctx, Instruction *instr)
+{
+   if (instr->opcode == aco_opcode::s_waitcnt_vscnt &&
+       instr->definitions[0].physReg() == sgpr_null) {
+      wait_imm imm;
+      imm.vs = std::min<uint8_t>(imm.vs, static_cast<SOPK_instruction*>(instr)->imm);
+      return imm;
+   } else if (instr->opcode == aco_opcode::s_waitcnt) {
+      return wait_imm(ctx.chip_class, static_cast<SOPK_instruction*>(instr)->imm);
+   }
+   return wait_imm();
+}
+
  wait_imm kill(Instruction* instr, wait_ctx& ctx)
  {
     wait_imm imm;
     if (ctx.exp_cnt || ctx.vm_cnt || ctx.lgkm_cnt)
        imm.combine(check_instr(instr, ctx));
  
+   imm.combine(parse_wait_instr(ctx, instr));
+
     if (ctx.chip_class >= GFX10) {
        /* Seems to be required on GFX10 to achieve correct behaviour.
         * It shouldn't cost anything anyways since we're about to do s_endpgm.
@@ -665,39 +712,23 @@ void handle_block(Program *program, Block& block, wait_ctx& ctx)
  {
     std::vector<aco_ptr<Instruction>> new_instructions;
  
+   wait_imm queued_imm;
     for (aco_ptr<Instruction>& instr : block.instructions) {
-      wait_imm imm = kill(instr.get(), ctx);
+      bool is_wait = !parse_wait_instr(ctx, instr.get()).empty();
  
-      if (!imm.empty())
-         emit_waitcnt(ctx, new_instructions, imm);
+      queued_imm.combine(kill(instr.get(), ctx));
  
        gen(instr.get(), ctx);
  
-      if (instr->format != Format::PSEUDO_BARRIER)
+      if (instr->format != Format::PSEUDO_BARRIER && !is_wait) {
+         if (!queued_imm.empty()) {
+            emit_waitcnt(ctx, new_instructions, queued_imm);
+            queued_imm = wait_imm();
+         }
           new_instructions.emplace_back(std::move(instr));
-   }
-
-   /* check if this block is at the end of a loop */
-   for (unsigned succ_idx : block.linear_succs) {
-      /* eliminate any remaining counters */
-      if (succ_idx <= block.index && (ctx.vm_cnt || ctx.exp_cnt || ctx.lgkm_cnt || ctx.vs_cnt)) {
-         // TODO: we could do better if we only wait if the regs between the block and other predecessors differ
-
-         aco_ptr<Instruction> branch = std::move(new_instructions.back());
-         new_instructions.pop_back();
-
-         wait_imm imm(ctx.vm_cnt ? 0 : wait_imm::unset_counter,
-                      ctx.exp_cnt ? 0 : wait_imm::unset_counter,
-                      ctx.lgkm_cnt ? 0 : wait_imm::unset_counter,
-                      ctx.vs_cnt ? 0 : wait_imm::unset_counter);
-         emit_waitcnt(ctx, new_instructions, imm);
-
-         new_instructions.push_back(std::move(branch));
-
-         ctx = wait_ctx(program);
-         break;
        }
     }
+
     block.instructions.swap(new_instructions);
  }
  
@@ -705,23 +736,55 @@ void handle_block(Program *program, Block& block, wait_ctx& ctx)
  
  void insert_wait_states(Program* program)
  {
-   wait_ctx out_ctx[program->blocks.size()]; /* per BB ctx */
+   /* per BB ctx */
+   std::vector<bool> done(program->blocks.size());
+   wait_ctx in_ctx[program->blocks.size()];
+   wait_ctx out_ctx[program->blocks.size()];
     for (unsigned i = 0; i < program->blocks.size(); i++)
-      out_ctx[i] = wait_ctx(program);
-
-   for (unsigned i = 0; i < program->blocks.size(); i++) {
-      Block& current = program->blocks[i];
-      wait_ctx& in = out_ctx[current.index];
+      in_ctx[i] = wait_ctx(program);
+   std::stack<unsigned> loop_header_indices;
+   unsigned loop_progress = 0;
+
+   for (unsigned i = 0; i < program->blocks.size();) {
+      Block& current = program->blocks[i++];
+      wait_ctx ctx = in_ctx[current.index];
+
+      if (current.kind & block_kind_loop_header) {
+         loop_header_indices.push(current.index);
+      } else if (current.kind & block_kind_loop_exit) {
+         bool repeat = false;
+         if (loop_progress == loop_header_indices.size()) {
+            i = loop_header_indices.top();
+            repeat = true;
+         }
+         loop_header_indices.pop();
+         loop_progress = std::min<unsigned>(loop_progress, loop_header_indices.size());
+         if (repeat)
+            continue;
+      }
  
+      bool changed = false;
        for (unsigned b : current.linear_preds)
-         in.join(&out_ctx[b], false);
+         changed |= ctx.join(&out_ctx[b], false);
        for (unsigned b : current.logical_preds)
-         in.join(&out_ctx[b], true);
+         changed |= ctx.join(&out_ctx[b], true);
+
+      in_ctx[current.index] = ctx;
+
+      if (done[current.index] && !changed)
+         continue;
  
-      if (current.instructions.empty())
+      if (current.instructions.empty()) {
+         out_ctx[current.index] = ctx;
           continue;
+      }
+
+      loop_progress = std::max<unsigned>(loop_progress, current.loop_nest_depth);
+      done[current.index] = true;
+
+      handle_block(program, current, ctx);
  
-      handle_block(program, current, in);
+      out_ctx[current.index] = ctx;
     }
  }
author	Rhys Perry <pendingchaos02@gmail.com>
	Mon, 21 Oct 2019 20:36:41 +0000 (21:36 +0100)
committer	Rhys Perry <pendingchaos02@gmail.com>
	Thu, 21 Nov 2019 20:28:57 +0000 (20:28 +0000)