aco: add vmem/smem score statistic
author Rhys Perry <pendingchaos02@gmail.com>
Wed, 4 Dec 2019 14:41:18 +0000 (14:41 +0000)
committer Marge Bot <eric+marge@anholt.net>
Fri, 3 Apr 2020 12:12:08 +0000 (12:12 +0000)
This isn't perfect (for example, changes might not be too meaningful when
comparing shaders with different control flow) but it should be useful for
evaluating scheduler changes.

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Acked-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/2965>

src/amd/compiler/aco_insert_waitcnt.cpp
src/amd/compiler/aco_interface.cpp
src/amd/compiler/aco_ir.h

index 09556d232b520f269a4ee51892cde47c2b436ef4..c0a93e3a9291ac4605e3fb49a1caf4e803d32d6a 100644 (file)
@@ -25,6 +25,7 @@
 #include <algorithm>
 #include <map>
 #include <stack>
+#include <math.h>
 
 #include "aco_ir.h"
 #include "vulkan/radv_shader.h"
@@ -66,6 +67,7 @@ enum wait_event : uint16_t {
    event_gds_gpr_lock = 1 << 9,
    event_vmem_gpr_lock = 1 << 10,
    event_sendmsg = 1 << 11,
+   num_events = 12,
 };
 
 enum counter_type : uint8_t {
@@ -73,6 +75,7 @@ enum counter_type : uint8_t {
    counter_lgkm = 1 << 1,
    counter_vm = 1 << 2,
    counter_vs = 1 << 3,
+   num_counters = 4,
 };
 
 static const uint16_t exp_events = event_exp_pos | event_exp_param | event_exp_mrt_null | event_gds_gpr_lock | event_vmem_gpr_lock;
@@ -105,6 +108,21 @@ uint8_t get_counters_for_event(wait_event ev)
    }
 }
 
+/* Returns the event mask tied to a single counter (exp_events, lgkm_events,
+ * vm_events or vs_events); any other value of ctr yields 0. */
+uint16_t get_events_for_counter(counter_type ctr)
+{
+   if (ctr == counter_exp)
+      return exp_events;
+   if (ctr == counter_lgkm)
+      return lgkm_events;
+   if (ctr == counter_vm)
+      return vm_events;
+   if (ctr == counter_vs)
+      return vs_events;
+   return 0;
+}
+
 struct wait_imm {
    static const uint8_t unset_counter = 0xff;
 
@@ -251,6 +269,13 @@ struct wait_ctx {
 
    std::map<PhysReg,wait_entry> gpr_map;
 
+   /* used for vmem/smem scores */
+   bool collect_statistics;
+   Instruction *gen_instr;
+   std::map<Instruction *, unsigned> unwaited_instrs[num_counters];
+   std::map<PhysReg,std::set<Instruction *>> reg_instrs[num_counters];
+   std::vector<unsigned> wait_distances[num_events];
+
    wait_ctx() {}
    wait_ctx(Program *program_)
            : program(program_),
@@ -298,8 +323,53 @@ struct wait_ctx {
          barrier_events[i] |= other->barrier_events[i];
       }
 
+      /* these are used for statistics, so don't update "changed" */
+      for (unsigned i = 0; i < num_counters; i++) {
+         for (std::pair<Instruction *, unsigned> instr : other->unwaited_instrs[i]) {
+            auto pos = unwaited_instrs[i].find(instr.first);
+            if (pos == unwaited_instrs[i].end())
+               unwaited_instrs[i].insert(instr);
+            else
+               pos->second = std::min(pos->second, instr.second);
+         }
+         /* don't use a foreach loop to avoid copies */
+         for (auto it = other->reg_instrs[i].begin(); it != other->reg_instrs[i].end(); ++it)
+            reg_instrs[i][it->first].insert(it->second.begin(), it->second.end());
+      }
+
       return changed;
    }
+
+   /* Drops `counter` from `entry`. With statistics enabled, every still-unwaited
+    * instruction recorded for `reg` on this counter first has its distance logged
+    * once per matching event bit of the entry, and is then forgotten. */
+   void wait_and_remove_from_entry(PhysReg reg, wait_entry& entry, counter_type counter) {
+      if (collect_statistics && (entry.counters & counter)) {
+         const unsigned idx = ffs(counter) - 1; /* bit position of the counter */
+         auto& pending = unwaited_instrs[idx];
+         auto& tracked = reg_instrs[idx][reg];
+         const unsigned event_mask = entry.events & get_events_for_counter(counter);
+         for (Instruction *instr : tracked) {
+            auto found = pending.find(instr);
+            if (found != pending.end()) {
+               /* one sample per event, all sharing this instruction's distance */
+               for (unsigned todo = event_mask; todo;)
+                  wait_distances[u_bit_scan(&todo)].push_back(found->second);
+               pending.erase(found);
+            }
+         }
+         tracked.clear();
+      }
+      entry.remove_counter(counter);
+   }
+
+   /* Adds one to the recorded distance of every unwaited instruction. */
+   void advance_unwaited_instrs()
+   {
+      for (auto& per_counter : unwaited_instrs)
+         for (auto& instr_and_distance : per_counter)
+            ++instr_and_distance.second;
+   }
 };
 
 wait_imm check_instr(Instruction* instr, wait_ctx& ctx)
@@ -477,13 +547,13 @@ wait_imm kill(Instruction* instr, wait_ctx& ctx)
       while (it != ctx.gpr_map.end())
       {
          if (imm.exp != wait_imm::unset_counter && imm.exp <= it->second.imm.exp)
-            it->second.remove_counter(counter_exp);
+            ctx.wait_and_remove_from_entry(it->first, it->second, counter_exp);
          if (imm.vm != wait_imm::unset_counter && imm.vm <= it->second.imm.vm)
-            it->second.remove_counter(counter_vm);
+            ctx.wait_and_remove_from_entry(it->first, it->second, counter_vm);
          if (imm.lgkm != wait_imm::unset_counter && imm.lgkm <= it->second.imm.lgkm)
-            it->second.remove_counter(counter_lgkm);
+            ctx.wait_and_remove_from_entry(it->first, it->second, counter_lgkm);
          if (imm.lgkm != wait_imm::unset_counter && imm.vs <= it->second.imm.vs)
-            it->second.remove_counter(counter_vs);
+            ctx.wait_and_remove_from_entry(it->first, it->second, counter_vs);
          if (!it->second.counters)
             it = ctx.gpr_map.erase(it);
          else
@@ -619,6 +689,16 @@ void insert_wait_entry(wait_ctx& ctx, PhysReg reg, RegClass rc, wait_event event
       if (!it.second)
          it.first->second.join(new_entry);
    }
+
+   if (ctx.collect_statistics) {
+      unsigned counters_todo = counters;
+      while (counters_todo) {
+         unsigned i = u_bit_scan(&counters_todo);
+         ctx.unwaited_instrs[i].insert(std::make_pair(ctx.gen_instr, 0u));
+         for (unsigned j = 0; j < rc.size(); j++)
+            ctx.reg_instrs[i][PhysReg{reg.reg+j}].insert(ctx.gen_instr);
+      }
+   }
 }
 
 void insert_wait_entry(wait_ctx& ctx, Operand op, wait_event event)
@@ -758,11 +838,15 @@ void handle_block(Program *program, Block& block, wait_ctx& ctx)
    std::vector<aco_ptr<Instruction>> new_instructions;
 
    wait_imm queued_imm;
+
+   ctx.collect_statistics = program->collect_statistics;
+
    for (aco_ptr<Instruction>& instr : block.instructions) {
       bool is_wait = !parse_wait_instr(ctx, instr.get()).empty();
 
       queued_imm.combine(kill(instr.get(), ctx));
 
+      ctx.gen_instr = instr.get();
       gen(instr.get(), ctx);
 
       if (instr->format != Format::PSEUDO_BARRIER && !is_wait) {
@@ -771,6 +855,9 @@ void handle_block(Program *program, Block& block, wait_ctx& ctx)
             queued_imm = wait_imm();
          }
          new_instructions.emplace_back(std::move(instr));
+
+         if (ctx.collect_statistics)
+            ctx.advance_unwaited_instrs();
       }
    }
 
@@ -782,12 +869,58 @@ void handle_block(Program *program, Block& block, wait_ctx& ctx)
 
 } /* end namespace */
 
+/* Scores the wait distances recorded for event_mask over all num_ctx contexts:
+ * 10 times the power mean (exponent 0.1) of the damped distances, rounded.
+ * Returns 0 when none were recorded (0/0 = NaN cannot be converted safely). */
+static uint32_t calculate_score(unsigned num_ctx, wait_ctx *ctx, uint32_t event_mask)
+{
+   double result = 0.0;
+   unsigned num_waits = 0;
+   while (event_mask) {
+      unsigned event_index = u_bit_scan(&event_mask);
+      /* for many events, excessive distances provide little benefit, so
+       * decrease the score in that case. */
+      double threshold = INFINITY;
+      double inv_strength = 0.000001;
+      switch (1 << event_index) {
+      case event_smem:
+         threshold = 70.0;
+         inv_strength = 75.0;
+         break;
+      case event_vmem:
+      case event_vmem_store:
+      case event_flat:
+         threshold = 230.0;
+         inv_strength = 150.0;
+         break;
+      case event_lds:
+         threshold = 16.0;
+         break;
+      }
+      for (unsigned i = 0; i < num_ctx; i++) {
+         for (unsigned dist : ctx[i].wait_distances[event_index]) {
+            double score = dist;
+            if (score > threshold) {
+               double excess = score - threshold;
+               score = threshold + excess / (1.0 + excess / inv_strength);
+            }
+            /* we don't want increases in high scores to hide decreases in low scores,
+             * so raise to the power of 0.1 before averaging. */
+            result += pow(score, 0.1);
+            num_waits++;
+         }
+      }
+   }
+   return num_waits ? round(pow(result / num_waits, 10.0) * 10.0) : 0;
+}
+
 void insert_wait_states(Program* program)
 {
    /* per BB ctx */
    std::vector<bool> done(program->blocks.size());
    wait_ctx in_ctx[program->blocks.size()];
    wait_ctx out_ctx[program->blocks.size()];
+
    for (unsigned i = 0; i < program->blocks.size(); i++)
       in_ctx[i] = wait_ctx(program);
    std::stack<unsigned> loop_header_indices;
@@ -817,13 +950,15 @@ void insert_wait_states(Program* program)
       for (unsigned b : current.logical_preds)
          changed |= ctx.join(&out_ctx[b], true);
 
-      in_ctx[current.index] = ctx;
-
-      if (done[current.index] && !changed)
+      if (done[current.index] && !changed) {
+         in_ctx[current.index] = std::move(ctx);
          continue;
+      } else {
+         in_ctx[current.index] = ctx;
+      }
 
       if (current.instructions.empty()) {
-         out_ctx[current.index] = ctx;
+         out_ctx[current.index] = std::move(ctx);
          continue;
       }
 
@@ -832,7 +967,14 @@ void insert_wait_states(Program* program)
 
       handle_block(program, current, ctx);
 
-      out_ctx[current.index] = ctx;
+      out_ctx[current.index] = std::move(ctx);
+   }
+
+   if (program->collect_statistics) {
+      program->statistics[statistic_vmem_score] =
+         calculate_score(program->blocks.size(), out_ctx, event_vmem | event_flat | event_vmem_store);
+      program->statistics[statistic_smem_score] =
+         calculate_score(program->blocks.size(), out_ctx, event_smem);
    }
 }
 
index 104436d33c6b296f6382801909ebbe8109158de7..19c98c8196ccfa41992a451fd82c0df7495a47f8 100644 (file)
@@ -62,6 +62,8 @@ static radv_compiler_statistic_info statistic_infos[] = {
    [aco::statistic_cycles] = {"Busy Cycles", "Estimate of busy cycles"},
    [aco::statistic_vmem_clauses] = {"VMEM Clause", "Number of VMEM clauses (includes 1-sized clauses)"},
    [aco::statistic_smem_clauses] = {"SMEM Clause", "Number of SMEM clauses (includes 1-sized clauses)"},
+   [aco::statistic_vmem_score] = {"VMEM Score", "Average VMEM def-use distances"},
+   [aco::statistic_smem_score] = {"SMEM Score", "Average SMEM def-use distances"},
    [aco::statistic_sgpr_presched] = {"Pre-Sched SGPRs", "SGPR usage before scheduling"},
    [aco::statistic_vgpr_presched] = {"Pre-Sched VGPRs", "VGPR usage before scheduling"},
 };
index c6213e0c04ec241785e7c5b6b76392b6e5d95211..a62525016fb27608287c053f169bad90b7ed3deb 100644 (file)
@@ -1227,6 +1227,8 @@ enum statistic {
    statistic_cycles,
    statistic_vmem_clauses,
    statistic_smem_clauses,
+   statistic_vmem_score,
+   statistic_smem_score,
    statistic_sgpr_presched,
    statistic_vgpr_presched,
    num_statistics