From: Rhys Perry Date: Wed, 4 Dec 2019 14:41:18 +0000 (+0000) Subject: aco: add vmem/smem score statistic X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=507956ed04fcdcfd44419d1b16f032e1d81d0dcb;p=mesa.git aco: add vmem/smem score statistic This isn't perfect (for example, changes might not be too meaningful when comparing shaders with different control flow) but it should be useful for evaluating scheduler changes. Signed-off-by: Rhys Perry Acked-by: Bas Nieuwenhuizen Reviewed-by: Daniel Schürmann Part-of: --- diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp index 09556d232b5..c0a93e3a929 100644 --- a/src/amd/compiler/aco_insert_waitcnt.cpp +++ b/src/amd/compiler/aco_insert_waitcnt.cpp @@ -25,6 +25,7 @@ #include #include #include +#include #include "aco_ir.h" #include "vulkan/radv_shader.h" @@ -66,6 +67,7 @@ enum wait_event : uint16_t { event_gds_gpr_lock = 1 << 9, event_vmem_gpr_lock = 1 << 10, event_sendmsg = 1 << 11, + num_events = 12, }; enum counter_type : uint8_t { @@ -73,6 +75,7 @@ enum counter_type : uint8_t { counter_lgkm = 1 << 1, counter_vm = 1 << 2, counter_vs = 1 << 3, + num_counters = 4, }; static const uint16_t exp_events = event_exp_pos | event_exp_param | event_exp_mrt_null | event_gds_gpr_lock | event_vmem_gpr_lock; @@ -105,6 +108,21 @@ uint8_t get_counters_for_event(wait_event ev) } } +uint16_t get_events_for_counter(counter_type ctr) +{ + switch (ctr) { + case counter_exp: + return exp_events; + case counter_lgkm: + return lgkm_events; + case counter_vm: + return vm_events; + case counter_vs: + return vs_events; + } + return 0; +} + struct wait_imm { static const uint8_t unset_counter = 0xff; @@ -251,6 +269,13 @@ struct wait_ctx { std::map gpr_map; + /* used for vmem/smem scores */ + bool collect_statistics; + Instruction *gen_instr; + std::map unwaited_instrs[num_counters]; + std::map> reg_instrs[num_counters]; + std::vector wait_distances[num_events]; + wait_ctx() {} wait_ctx(Program *program_) : program(program_), @@ -298,8 +323,53 @@ struct wait_ctx { barrier_events[i] |= other->barrier_events[i]; } + /* these are used for statistics, so don't update "changed" */ + for (unsigned i = 0; i < num_counters; i++) { + for (std::pair instr : other->unwaited_instrs[i]) { + auto pos = unwaited_instrs[i].find(instr.first); + if (pos == unwaited_instrs[i].end()) + unwaited_instrs[i].insert(instr); + else + pos->second = std::min(pos->second, instr.second); + } + /* don't use a foreach loop to avoid copies */ + for (auto it = other->reg_instrs[i].begin(); it != other->reg_instrs[i].end(); ++it) + reg_instrs[i][it->first].insert(it->second.begin(), it->second.end()); + } + return changed; } + + void wait_and_remove_from_entry(PhysReg reg, wait_entry& entry, counter_type counter) { + if (collect_statistics && (entry.counters & counter)) { + unsigned counter_idx = ffs(counter) - 1; + for (Instruction *instr : reg_instrs[counter_idx][reg]) { + auto pos = unwaited_instrs[counter_idx].find(instr); + if (pos == unwaited_instrs[counter_idx].end()) + continue; + + unsigned distance = pos->second; + unsigned events = entry.events & get_events_for_counter(counter); + while (events) { + unsigned event_idx = u_bit_scan(&events); + wait_distances[event_idx].push_back(distance); + } + + unwaited_instrs[counter_idx].erase(instr); + } + reg_instrs[counter_idx][reg].clear(); + } + + entry.remove_counter(counter); + } + + void advance_unwaited_instrs() + { + for (unsigned i = 0; i < num_counters; i++) { + for (auto it = unwaited_instrs[i].begin(); it != unwaited_instrs[i].end(); ++it) + it->second++; + } + } }; wait_imm check_instr(Instruction* instr, wait_ctx& ctx) @@ -477,13 +547,13 @@ wait_imm kill(Instruction* instr, wait_ctx& ctx) while (it != ctx.gpr_map.end()) { if (imm.exp != wait_imm::unset_counter && imm.exp <= it->second.imm.exp) - it->second.remove_counter(counter_exp); + ctx.wait_and_remove_from_entry(it->first, it->second, counter_exp); if (imm.vm != wait_imm::unset_counter && imm.vm <= it->second.imm.vm) - it->second.remove_counter(counter_vm); + ctx.wait_and_remove_from_entry(it->first, it->second, counter_vm); if (imm.lgkm != wait_imm::unset_counter && imm.lgkm <= it->second.imm.lgkm) - it->second.remove_counter(counter_lgkm); + ctx.wait_and_remove_from_entry(it->first, it->second, counter_lgkm); if (imm.lgkm != wait_imm::unset_counter && imm.vs <= it->second.imm.vs) - it->second.remove_counter(counter_vs); + ctx.wait_and_remove_from_entry(it->first, it->second, counter_vs); if (!it->second.counters) it = ctx.gpr_map.erase(it); else @@ -619,6 +689,16 @@ void insert_wait_entry(wait_ctx& ctx, PhysReg reg, RegClass rc, wait_event event if (!it.second) it.first->second.join(new_entry); } + + if (ctx.collect_statistics) { + unsigned counters_todo = counters; + while (counters_todo) { + unsigned i = u_bit_scan(&counters_todo); + ctx.unwaited_instrs[i].insert(std::make_pair(ctx.gen_instr, 0u)); + for (unsigned j = 0; j < rc.size(); j++) + ctx.reg_instrs[i][PhysReg{reg.reg+j}].insert(ctx.gen_instr); + } + } } void insert_wait_entry(wait_ctx& ctx, Operand op, wait_event event) @@ -758,11 +838,15 @@ void handle_block(Program *program, Block& block, wait_ctx& ctx) std::vector> new_instructions; wait_imm queued_imm; + + ctx.collect_statistics = program->collect_statistics; + for (aco_ptr& instr : block.instructions) { bool is_wait = !parse_wait_instr(ctx, instr.get()).empty(); queued_imm.combine(kill(instr.get(), ctx)); + ctx.gen_instr = instr.get(); gen(instr.get(), ctx); if (instr->format != Format::PSEUDO_BARRIER && !is_wait) { @@ -771,6 +855,9 @@ void handle_block(Program *program, Block& block, wait_ctx& ctx) queued_imm = wait_imm(); } new_instructions.emplace_back(std::move(instr)); + + if (ctx.collect_statistics) + ctx.advance_unwaited_instrs(); } } @@ -782,12 +869,58 @@ void handle_block(Program *program, Block& block, wait_ctx& ctx) } /* end namespace */ +static uint32_t calculate_score(unsigned num_ctx, wait_ctx *ctx, uint32_t event_mask) +{ + double result = 0.0; + unsigned num_waits = 0; + while (event_mask) { + unsigned event_index = u_bit_scan(&event_mask); + for (unsigned i = 0; i < num_ctx; i++) { + for (unsigned dist : ctx[i].wait_distances[event_index]) { + double score = dist; + /* for many events, excessive distances provide little benefit, so + * decrease the score in that case. */ + double threshold = INFINITY; + double inv_strength = 0.000001; + switch (1 << event_index) { + case event_smem: + threshold = 70.0; + inv_strength = 75.0; + break; + case event_vmem: + case event_vmem_store: + case event_flat: + threshold = 230.0; + inv_strength = 150.0; + break; + case event_lds: + threshold = 16.0; + break; + default: + break; + } + if (score > threshold) { + score -= threshold; + score = threshold + score / (1.0 + score / inv_strength); + } + + /* we don't want increases in high scores to hide decreases in low scores, + * so raise to the power of 0.1 before averaging. */ + result += pow(score, 0.1); + num_waits++; + } + } + } + return round(pow(result / num_waits, 10.0) * 10.0); +} + void insert_wait_states(Program* program) { /* per BB ctx */ std::vector done(program->blocks.size()); wait_ctx in_ctx[program->blocks.size()]; wait_ctx out_ctx[program->blocks.size()]; + for (unsigned i = 0; i < program->blocks.size(); i++) in_ctx[i] = wait_ctx(program); std::stack loop_header_indices; @@ -817,13 +950,15 @@ void insert_wait_states(Program* program) for (unsigned b : current.logical_preds) changed |= ctx.join(&out_ctx[b], true); - in_ctx[current.index] = ctx; - - if (done[current.index] && !changed) + if (done[current.index] && !changed) { + in_ctx[current.index] = std::move(ctx); continue; + } else { + in_ctx[current.index] = ctx; + } if (current.instructions.empty()) { - out_ctx[current.index] = ctx; + out_ctx[current.index] = std::move(ctx); continue; } @@ -832,7 +967,14 @@ void insert_wait_states(Program* program) handle_block(program, current, ctx); - out_ctx[current.index] = ctx; + out_ctx[current.index] = std::move(ctx); + } + + if (program->collect_statistics) { + program->statistics[statistic_vmem_score] = + calculate_score(program->blocks.size(), out_ctx, event_vmem | event_flat | event_vmem_store); + program->statistics[statistic_smem_score] = + calculate_score(program->blocks.size(), out_ctx, event_smem); } } diff --git a/src/amd/compiler/aco_interface.cpp b/src/amd/compiler/aco_interface.cpp index 104436d33c6..19c98c8196c 100644 --- a/src/amd/compiler/aco_interface.cpp +++ b/src/amd/compiler/aco_interface.cpp @@ -62,6 +62,8 @@ static radv_compiler_statistic_info statistic_infos[] = { [aco::statistic_cycles] = {"Busy Cycles", "Estimate of busy cycles"}, [aco::statistic_vmem_clauses] = {"VMEM Clause", "Number of VMEM clauses (includes 1-sized clauses)"}, [aco::statistic_smem_clauses] = {"SMEM Clause", "Number of SMEM clauses (includes 1-sized clauses)"}, + [aco::statistic_vmem_score] = {"VMEM Score", "Average VMEM def-use distances"}, + [aco::statistic_smem_score] = {"SMEM Score", "Average SMEM def-use distances"}, [aco::statistic_sgpr_presched] = {"Pre-Sched SGPRs", "SGPR usage before scheduling"}, [aco::statistic_vgpr_presched] = {"Pre-Sched VGPRs", "VGPR usage before scheduling"}, }; diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index c6213e0c04e..a62525016fb 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -1227,6 +1227,8 @@ enum statistic { statistic_cycles, statistic_vmem_clauses, statistic_smem_clauses, + statistic_vmem_score, + statistic_smem_score, statistic_sgpr_presched, statistic_vgpr_presched, num_statistics