From d1b9deeea89f70ea5a4f2cb6c82785d3179e5ad2 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Mon, 21 Oct 2019 21:36:41 +0100 Subject: [PATCH] aco: improve waitcnt insertion around loops MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Do this by repeating processing of loops until no progress is made. Totals from affected shaders: SGPRS: 162576 -> 162576 (0.00 %) VGPRS: 145228 -> 145228 (0.00 %) Spilled SGPRs: 668 -> 668 (0.00 %) Spilled VGPRs: 0 -> 0 (0.00 %) Private memory VGPRs: 0 -> 0 (0.00 %) Scratch size: 0 -> 0 (0.00 %) dwords per thread Code Size: 15778640 -> 15771336 (-0.05 %) bytes LDS: 146 -> 146 (0.00 %) blocks Max Waves: 6087 -> 6087 (0.00 %) v2: use block_kind_loop_header/block_kind_loop_exit to repeat at the end of loops instead of at each continue Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann --- src/amd/compiler/aco_insert_waitcnt.cpp | 153 +++++++++++++++++------- 1 file changed, 108 insertions(+), 45 deletions(-) diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp index 1f4733ed356..5d65ae253ff 100644 --- a/src/amd/compiler/aco_insert_waitcnt.cpp +++ b/src/amd/compiler/aco_insert_waitcnt.cpp @@ -24,6 +24,7 @@ #include #include +#include #include "aco_ir.h" #include "vulkan/radv_shader.h" @@ -34,8 +35,9 @@ namespace { /** * The general idea of this pass is: - * The CFG is traversed in reverse postorder (forward). - * Per BB one wait_ctx is maintained. + * The CFG is traversed in reverse postorder (forward) and loops are processed + * several times until no progress is made. + * Per BB two wait_ctx are maintained: an in-context and an out-context. * The in-context is the joined out-contexts of the predecessors. * The context contains a map: gpr -> wait_entry * consisting of the information about the cnt values to be waited for. 
@@ -114,6 +116,19 @@ struct wait_imm { wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_) : vm(vm_), exp(exp_), lgkm(lgkm_), vs(vs_) {} + wait_imm(enum chip_class chip, uint16_t packed) : vs(unset_counter) + { + vm = packed & 0xf; + if (chip >= GFX9) + vm |= (packed >> 10) & 0x30; + + exp = (packed >> 4) & 0x7; + + lgkm = (packed >> 8) & 0xf; + if (chip >= GFX10) + lgkm |= (packed >> 8) & 0x30; + } + uint16_t pack(enum chip_class chip) const { uint16_t imm = 0; @@ -142,12 +157,14 @@ struct wait_imm { return imm; } - void combine(const wait_imm& other) + bool combine(const wait_imm& other) { + bool changed = other.vm < vm || other.exp < exp || other.lgkm < lgkm || other.vs < vs; vm = std::min(vm, other.vm); exp = std::min(exp, other.exp); lgkm = std::min(lgkm, other.lgkm); vs = std::min(vs, other.vs); + return changed; } bool empty() const @@ -168,13 +185,17 @@ struct wait_entry { : imm(imm), events(event), counters(get_counters_for_event(event)), wait_on_read(wait_on_read), logical(logical) {} - void join(const wait_entry& other) + bool join(const wait_entry& other) { + bool changed = (other.events & ~events) || + (other.counters & ~counters) || + (other.wait_on_read && !wait_on_read); events |= other.events; counters |= other.counters; - imm.combine(other.imm); + changed |= imm.combine(other.imm); wait_on_read = wait_on_read || other.wait_on_read; assert(logical == other.logical); + return changed; } void remove_counter(counter_type counter) @@ -237,8 +258,15 @@ struct wait_ctx { max_vs_cnt(program_->chip_class >= GFX10 ? 62 : 0), unordered_events(event_smem | (program_->chip_class < GFX10 ? 
event_flat : 0)) {} - void join(const wait_ctx* other, bool logical) + bool join(const wait_ctx* other, bool logical) { + bool changed = other->exp_cnt > exp_cnt || + other->vm_cnt > vm_cnt || + other->lgkm_cnt > lgkm_cnt || + other->vs_cnt > vs_cnt || + (other->pending_flat_lgkm && !pending_flat_lgkm) || + (other->pending_flat_vm && !pending_flat_vm); + exp_cnt = std::max(exp_cnt, other->exp_cnt); vm_cnt = std::max(vm_cnt, other->vm_cnt); lgkm_cnt = std::max(lgkm_cnt, other->lgkm_cnt); @@ -253,14 +281,18 @@ struct wait_ctx { if (entry.second.logical != logical) continue; - if (it != gpr_map.end()) - it->second.join(entry.second); - else + if (it != gpr_map.end()) { + changed |= it->second.join(entry.second); + } else { gpr_map.insert(entry); + changed = true; + } } for (unsigned i = 0; i < barrier_count; i++) - barrier_imm[i].combine(other->barrier_imm[i]); + changed |= barrier_imm[i].combine(other->barrier_imm[i]); + + return changed; } }; @@ -319,12 +351,27 @@ wait_imm check_instr(Instruction* instr, wait_ctx& ctx) return wait; } +wait_imm parse_wait_instr(wait_ctx& ctx, Instruction *instr) +{ + if (instr->opcode == aco_opcode::s_waitcnt_vscnt && + instr->definitions[0].physReg() == sgpr_null) { + wait_imm imm; + imm.vs = std::min(imm.vs, static_cast(instr)->imm); + return imm; + } else if (instr->opcode == aco_opcode::s_waitcnt) { + return wait_imm(ctx.chip_class, static_cast(instr)->imm); + } + return wait_imm(); +} + wait_imm kill(Instruction* instr, wait_ctx& ctx) { wait_imm imm; if (ctx.exp_cnt || ctx.vm_cnt || ctx.lgkm_cnt) imm.combine(check_instr(instr, ctx)); + imm.combine(parse_wait_instr(ctx, instr)); + if (ctx.chip_class >= GFX10) { /* Seems to be required on GFX10 to achieve correct behaviour. * It shouldn't cost anything anyways since we're about to do s_endpgm. 
@@ -665,39 +712,23 @@ void handle_block(Program *program, Block& block, wait_ctx& ctx) { std::vector> new_instructions; + wait_imm queued_imm; for (aco_ptr& instr : block.instructions) { - wait_imm imm = kill(instr.get(), ctx); + bool is_wait = !parse_wait_instr(ctx, instr.get()).empty(); - if (!imm.empty()) - emit_waitcnt(ctx, new_instructions, imm); + queued_imm.combine(kill(instr.get(), ctx)); gen(instr.get(), ctx); - if (instr->format != Format::PSEUDO_BARRIER) + if (instr->format != Format::PSEUDO_BARRIER && !is_wait) { + if (!queued_imm.empty()) { + emit_waitcnt(ctx, new_instructions, queued_imm); + queued_imm = wait_imm(); + } new_instructions.emplace_back(std::move(instr)); - } - - /* check if this block is at the end of a loop */ - for (unsigned succ_idx : block.linear_succs) { - /* eliminate any remaining counters */ - if (succ_idx <= block.index && (ctx.vm_cnt || ctx.exp_cnt || ctx.lgkm_cnt || ctx.vs_cnt)) { - // TODO: we could do better if we only wait if the regs between the block and other predecessors differ - - aco_ptr branch = std::move(new_instructions.back()); - new_instructions.pop_back(); - - wait_imm imm(ctx.vm_cnt ? 0 : wait_imm::unset_counter, - ctx.exp_cnt ? 0 : wait_imm::unset_counter, - ctx.lgkm_cnt ? 0 : wait_imm::unset_counter, - ctx.vs_cnt ? 
0 : wait_imm::unset_counter); - emit_waitcnt(ctx, new_instructions, imm); - - new_instructions.push_back(std::move(branch)); - - ctx = wait_ctx(program); - break; } } + block.instructions.swap(new_instructions); } @@ -705,23 +736,55 @@ void handle_block(Program *program, Block& block, wait_ctx& ctx) void insert_wait_states(Program* program) { - wait_ctx out_ctx[program->blocks.size()]; /* per BB ctx */ + /* per BB ctx */ + std::vector done(program->blocks.size()); + wait_ctx in_ctx[program->blocks.size()]; + wait_ctx out_ctx[program->blocks.size()]; for (unsigned i = 0; i < program->blocks.size(); i++) - out_ctx[i] = wait_ctx(program); - - for (unsigned i = 0; i < program->blocks.size(); i++) { - Block& current = program->blocks[i]; - wait_ctx& in = out_ctx[current.index]; + in_ctx[i] = wait_ctx(program); + std::stack loop_header_indices; + unsigned loop_progress = 0; + + for (unsigned i = 0; i < program->blocks.size();) { + Block& current = program->blocks[i++]; + wait_ctx ctx = in_ctx[current.index]; + + if (current.kind & block_kind_loop_header) { + loop_header_indices.push(current.index); + } else if (current.kind & block_kind_loop_exit) { + bool repeat = false; + if (loop_progress == loop_header_indices.size()) { + i = loop_header_indices.top(); + repeat = true; + } + loop_header_indices.pop(); + loop_progress = std::min(loop_progress, loop_header_indices.size()); + if (repeat) + continue; + } + bool changed = false; for (unsigned b : current.linear_preds) - in.join(&out_ctx[b], false); + changed |= ctx.join(&out_ctx[b], false); for (unsigned b : current.logical_preds) - in.join(&out_ctx[b], true); + changed |= ctx.join(&out_ctx[b], true); + + in_ctx[current.index] = ctx; + + if (done[current.index] && !changed) + continue; - if (current.instructions.empty()) + if (current.instructions.empty()) { + out_ctx[current.index] = ctx; continue; + } + + loop_progress = std::max(loop_progress, current.loop_nest_depth); + done[current.index] = true; + + 
handle_block(program, current, ctx); - handle_block(program, current, in); + out_ctx[current.index] = ctx; } } -- 2.30.2