X-Git-Url: https://git.libre-soc.org/?p=mesa.git;a=blobdiff_plain;f=src%2Famd%2Fcompiler%2Faco_insert_waitcnt.cpp;h=751892e44368a476d1f1ff44261826d536037691;hp=7960902d6907361efdc6fc27db505d015ef861f3;hb=a0814a873d50f65484b17927379fbb47cf90372e;hpb=54742e157d16de68e06105910d8e26d3215565c7 diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp index 7960902d690..751892e4436 100644 --- a/src/amd/compiler/aco_insert_waitcnt.cpp +++ b/src/amd/compiler/aco_insert_waitcnt.cpp @@ -25,6 +25,7 @@ #include #include #include +#include #include "aco_ir.h" #include "vulkan/radv_shader.h" @@ -65,6 +66,8 @@ enum wait_event : uint16_t { event_exp_mrt_null = 1 << 8, event_gds_gpr_lock = 1 << 9, event_vmem_gpr_lock = 1 << 10, + event_sendmsg = 1 << 11, + num_events = 12, }; enum counter_type : uint8_t { @@ -72,10 +75,11 @@ enum counter_type : uint8_t { counter_lgkm = 1 << 1, counter_vm = 1 << 2, counter_vs = 1 << 3, + num_counters = 4, }; static const uint16_t exp_events = event_exp_pos | event_exp_param | event_exp_mrt_null | event_gds_gpr_lock | event_vmem_gpr_lock; -static const uint16_t lgkm_events = event_smem | event_lds | event_gds | event_flat; +static const uint16_t lgkm_events = event_smem | event_lds | event_gds | event_flat | event_sendmsg; static const uint16_t vm_events = event_vmem | event_flat; static const uint16_t vs_events = event_vmem_store; @@ -85,6 +89,7 @@ uint8_t get_counters_for_event(wait_event ev) case event_smem: case event_lds: case event_gds: + case event_sendmsg: return counter_lgkm; case event_vmem: return counter_vm; @@ -103,6 +108,21 @@ uint8_t get_counters_for_event(wait_event ev) } } +uint16_t get_events_for_counter(counter_type ctr) +{ + switch (ctr) { + case counter_exp: + return exp_events; + case counter_lgkm: + return lgkm_events; + case counter_vm: + return vm_events; + case counter_vs: + return vs_events; + } + return 0; +} + struct wait_imm { static const uint8_t unset_counter = 0xff; @@ -135,6 +155,7 @@ struct wait_imm { assert(exp == unset_counter || exp <= 0x7); switch (chip) { case GFX10: + case GFX10_3: assert(lgkm == unset_counter || lgkm <= 0x3f); assert(vm == unset_counter || vm <= 0x3f); imm = ((vm & 0x30) << 10) | ((lgkm & 0x3f) << 8) | ((exp & 0x7) << 4) | (vm & 0xf); @@ -180,20 +201,27 @@ struct wait_entry { uint8_t counters; /* use counter_type notion */ bool wait_on_read:1; bool logical:1; + bool has_vmem_nosampler:1; + bool has_vmem_sampler:1; wait_entry(wait_event event, wait_imm imm, bool logical, bool wait_on_read) : imm(imm), events(event), counters(get_counters_for_event(event)), - wait_on_read(wait_on_read), logical(logical) {} + wait_on_read(wait_on_read), logical(logical), + has_vmem_nosampler(false), has_vmem_sampler(false) {} bool join(const wait_entry& other) { bool changed = (other.events & ~events) || (other.counters & ~counters) || - (other.wait_on_read && !wait_on_read); + (other.wait_on_read && !wait_on_read) || + (other.has_vmem_nosampler && !has_vmem_nosampler) || + (other.has_vmem_sampler && !has_vmem_sampler); events |= other.events; counters |= other.counters; changed |= imm.combine(other.imm); - wait_on_read = wait_on_read || other.wait_on_read; + wait_on_read |= other.wait_on_read; + has_vmem_nosampler |= other.has_vmem_nosampler; + has_vmem_sampler |= other.has_vmem_sampler; assert(logical == other.logical); return changed; } @@ -204,12 +232,14 @@ struct wait_entry { if (counter == counter_lgkm) { imm.lgkm = wait_imm::unset_counter; - events &= ~(event_smem | event_lds | event_gds); + 
events &= ~(event_smem | event_lds | event_gds | event_sendmsg); } if (counter == counter_vm) { imm.vm = wait_imm::unset_counter; events &= ~event_vmem; + has_vmem_nosampler = false; + has_vmem_sampler = false; } if (counter == counter_exp) { @@ -244,10 +274,18 @@ struct wait_ctx { bool pending_flat_vm = false; bool pending_s_buffer_store = false; /* GFX10 workaround */ - wait_imm barrier_imm[barrier_count]; + wait_imm barrier_imm[storage_count]; + uint16_t barrier_events[storage_count] = {}; /* use wait_event notion */ std::map gpr_map; + /* used for vmem/smem scores */ + bool collect_statistics; + Instruction *gen_instr; + std::map unwaited_instrs[num_counters]; + std::map> reg_instrs[num_counters]; + std::vector wait_distances[num_events]; + wait_ctx() {} wait_ctx(Program *program_) : program(program_), @@ -256,7 +294,8 @@ struct wait_ctx { max_exp_cnt(6), max_lgkm_cnt(program_->chip_class >= GFX10 ? 62 : 14), max_vs_cnt(program_->chip_class >= GFX10 ? 62 : 0), - unordered_events(event_smem | (program_->chip_class < GFX10 ? event_flat : 0)) {} + unordered_events(event_smem | (program_->chip_class < GFX10 ? event_flat : 0)), + collect_statistics(program_->collect_statistics) {} bool join(const wait_ctx* other, bool logical) { @@ -289,11 +328,59 @@ struct wait_ctx { } } - for (unsigned i = 0; i < barrier_count; i++) + for (unsigned i = 0; i < storage_count; i++) { changed |= barrier_imm[i].combine(other->barrier_imm[i]); + changed |= other->barrier_events[i] & ~barrier_events[i]; + barrier_events[i] |= other->barrier_events[i]; + } + + /* these are used for statistics, so don't update "changed" */ + for (unsigned i = 0; i < num_counters; i++) { + for (std::pair instr : other->unwaited_instrs[i]) { + auto pos = unwaited_instrs[i].find(instr.first); + if (pos == unwaited_instrs[i].end()) + unwaited_instrs[i].insert(instr); + else + pos->second = std::min(pos->second, instr.second); + } + /* don't use a foreach loop to avoid copies */ + for (auto it = other->reg_instrs[i].begin(); it != other->reg_instrs[i].end(); ++it) + reg_instrs[i][it->first].insert(it->second.begin(), it->second.end()); + } return changed; } + + void wait_and_remove_from_entry(PhysReg reg, wait_entry& entry, counter_type counter) { + if (collect_statistics && (entry.counters & counter)) { + unsigned counter_idx = ffs(counter) - 1; + for (Instruction *instr : reg_instrs[counter_idx][reg]) { + auto pos = unwaited_instrs[counter_idx].find(instr); + if (pos == unwaited_instrs[counter_idx].end()) + continue; + + unsigned distance = pos->second; + unsigned events = entry.events & get_events_for_counter(counter); + while (events) { + unsigned event_idx = u_bit_scan(&events); + wait_distances[event_idx].push_back(distance); + } + + unwaited_instrs[counter_idx].erase(instr); + } + reg_instrs[counter_idx][reg].clear(); + } + + entry.remove_counter(counter); + } + + void advance_unwaited_instrs() + { + for (unsigned i = 0; i < num_counters; i++) { + for (auto it = unwaited_instrs[i].begin(); it != unwaited_instrs[i].end(); ++it) + it->second++; + } + } }; wait_imm check_instr(Instruction* instr, wait_ctx& ctx) @@ -326,22 +413,16 @@ wait_imm check_instr(Instruction* instr, wait_ctx& ctx) continue; /* Vector Memory reads and writes return in the order they were issued */ - if (instr->isVMEM() && ((it->second.events & vm_events) == event_vmem)) { - it->second.remove_counter(counter_vm); - if (!it->second.counters) - it = ctx.gpr_map.erase(it); + bool has_sampler = instr->format == Format::MIMG && !instr->operands[1].isUndefined() && 
instr->operands[1].regClass() == s4; + if (instr->isVMEM() && ((it->second.events & vm_events) == event_vmem) && + it->second.has_vmem_nosampler == !has_sampler && it->second.has_vmem_sampler == has_sampler) continue; - } /* LDS reads and writes return in the order they were issued. same for GDS */ if (instr->format == Format::DS) { bool gds = static_cast(instr)->gds; - if ((it->second.events & lgkm_events) == (gds ? event_gds : event_lds)) { - it->second.remove_counter(counter_lgkm); - if (!it->second.counters) - it = ctx.gpr_map.erase(it); + if ((it->second.events & lgkm_events) == (gds ? event_gds : event_lds)) continue; - } } wait.combine(it->second.imm); @@ -364,21 +445,75 @@ wait_imm parse_wait_instr(wait_ctx& ctx, Instruction *instr) return wait_imm(); } -wait_imm kill(Instruction* instr, wait_ctx& ctx) +wait_imm perform_barrier(wait_ctx& ctx, memory_sync_info sync, unsigned semantics) +{ + wait_imm imm; + sync_scope subgroup_scope = ctx.program->workgroup_size <= ctx.program->wave_size ? scope_workgroup : scope_subgroup; + if ((sync.semantics & semantics) && sync.scope > subgroup_scope) { + unsigned storage = sync.storage; + while (storage) { + unsigned idx = u_bit_scan(&storage); + + /* LDS is private to the workgroup */ + sync_scope bar_scope_lds = MIN2(sync.scope, scope_workgroup); + + uint16_t events = ctx.barrier_events[idx]; + if (bar_scope_lds <= subgroup_scope) + events &= ~event_lds; + + /* in non-WGP, the L1/L0 cache keeps all memory operations in-order for the same workgroup */ + if (ctx.chip_class < GFX10 && sync.scope <= scope_workgroup) + events &= ~(event_vmem | event_vmem_store | event_smem); + + if (events) + imm.combine(ctx.barrier_imm[idx]); + } + } + + return imm; +} + +void force_waitcnt(wait_ctx& ctx, wait_imm& imm) +{ + if (ctx.vm_cnt) + imm.vm = 0; + if (ctx.exp_cnt) + imm.exp = 0; + if (ctx.lgkm_cnt) + imm.lgkm = 0; + + if (ctx.chip_class >= GFX10) { + if (ctx.vs_cnt) + imm.vs = 0; + } +} + +wait_imm kill(Instruction* instr, wait_ctx& ctx, memory_sync_info sync_info) { wait_imm imm; + + if (debug_flags & DEBUG_FORCE_WAITCNT) { + /* Force emitting waitcnt states right after the instruction if there is + * something to wait for. + */ + force_waitcnt(ctx, imm); + } + if (ctx.exp_cnt || ctx.vm_cnt || ctx.lgkm_cnt) imm.combine(check_instr(instr, ctx)); imm.combine(parse_wait_instr(ctx, instr)); - if (ctx.chip_class >= GFX10) { - /* Seems to be required on GFX10 to achieve correct behaviour. - * It shouldn't cost anything anyways since we're about to do s_endpgm. - */ - if (ctx.lgkm_cnt && instr->opcode == aco_opcode::s_dcache_wb) - imm.lgkm = 0; + /* It's required to wait for scalar stores before "writing back" data. + * It shouldn't cost anything anyways since we're about to do s_endpgm. + */ + if (ctx.lgkm_cnt && instr->opcode == aco_opcode::s_dcache_wb) { + assert(ctx.chip_class >= GFX8); + imm.lgkm = 0; + } + + if (ctx.chip_class >= GFX10 && instr->format == Format::SMEM) { /* GFX10: A store followed by a load at the same address causes a problem because * the load doesn't load the correct values unless we wait for the store first. * This is NOT mitigated by an s_nop. 
@@ -388,40 +523,15 @@ wait_imm kill(Instruction* instr, wait_ctx& ctx) SMEM_instruction *smem = static_cast(instr); if (ctx.pending_s_buffer_store && !smem->definitions.empty() && - !smem->can_reorder && smem->barrier == barrier_buffer) { + !smem->sync.can_reorder()) { imm.lgkm = 0; } } - if (instr->format == Format::PSEUDO_BARRIER) { - unsigned* bsize = ctx.program->info->cs.block_size; - unsigned workgroup_size = bsize[0] * bsize[1] * bsize[2]; - switch (instr->opcode) { - case aco_opcode::p_memory_barrier_all: - for (unsigned i = 0; i < barrier_count; i++) { - if ((1 << i) == barrier_shared && workgroup_size <= 64) - continue; - imm.combine(ctx.barrier_imm[i]); - } - break; - case aco_opcode::p_memory_barrier_atomic: - imm.combine(ctx.barrier_imm[ffs(barrier_atomic) - 1]); - break; - /* see comment in aco_scheduler.cpp's can_move_instr() on why these barriers are merged */ - case aco_opcode::p_memory_barrier_buffer: - case aco_opcode::p_memory_barrier_image: - imm.combine(ctx.barrier_imm[ffs(barrier_buffer) - 1]); - imm.combine(ctx.barrier_imm[ffs(barrier_image) - 1]); - break; - case aco_opcode::p_memory_barrier_shared: - if (workgroup_size > 64) - imm.combine(ctx.barrier_imm[ffs(barrier_shared) - 1]); - break; - default: - assert(false); - break; - } - } + if (instr->opcode == aco_opcode::p_barrier) + imm.combine(perform_barrier(ctx, static_cast(instr)->sync, semantic_acqrel)); + else + imm.combine(perform_barrier(ctx, sync_info, semantic_release)); if (!imm.empty()) { if (ctx.pending_flat_vm && imm.vm != wait_imm::unset_counter) @@ -436,16 +546,27 @@ wait_imm kill(Instruction* instr, wait_ctx& ctx) ctx.vs_cnt = std::min(ctx.vs_cnt, imm.vs); /* update barrier wait imms */ - for (unsigned i = 0; i < barrier_count; i++) { + for (unsigned i = 0; i < storage_count; i++) { wait_imm& bar = ctx.barrier_imm[i]; - if (bar.exp != wait_imm::unset_counter && imm.exp <= bar.exp) + uint16_t& bar_ev = ctx.barrier_events[i]; + if (bar.exp != wait_imm::unset_counter && imm.exp <= bar.exp) { bar.exp = wait_imm::unset_counter; - if (bar.vm != wait_imm::unset_counter && imm.vm <= bar.vm) + bar_ev &= ~exp_events; + } + if (bar.vm != wait_imm::unset_counter && imm.vm <= bar.vm) { bar.vm = wait_imm::unset_counter; - if (bar.lgkm != wait_imm::unset_counter && imm.lgkm <= bar.lgkm) + bar_ev &= ~(vm_events & ~event_flat); + } + if (bar.lgkm != wait_imm::unset_counter && imm.lgkm <= bar.lgkm) { bar.lgkm = wait_imm::unset_counter; - if (bar.vs != wait_imm::unset_counter && imm.vs <= bar.vs) + bar_ev &= ~(lgkm_events & ~event_flat); + } + if (bar.vs != wait_imm::unset_counter && imm.vs <= bar.vs) { bar.vs = wait_imm::unset_counter; + bar_ev &= ~vs_events; + } + if (bar.vm == wait_imm::unset_counter && bar.lgkm == wait_imm::unset_counter) + bar_ev &= ~event_flat; } /* remove all gprs with higher counter from map */ @@ -453,13 +574,13 @@ wait_imm kill(Instruction* instr, wait_ctx& ctx) while (it != ctx.gpr_map.end()) { if (imm.exp != wait_imm::unset_counter && imm.exp <= it->second.imm.exp) - it->second.remove_counter(counter_exp); + ctx.wait_and_remove_from_entry(it->first, it->second, counter_exp); if (imm.vm != wait_imm::unset_counter && imm.vm <= it->second.imm.vm) - it->second.remove_counter(counter_vm); + ctx.wait_and_remove_from_entry(it->first, it->second, counter_vm); if (imm.lgkm != wait_imm::unset_counter && imm.lgkm <= it->second.imm.lgkm) - it->second.remove_counter(counter_lgkm); - if (imm.lgkm != wait_imm::unset_counter && imm.vs <= it->second.imm.vs) - it->second.remove_counter(counter_vs); + 
ctx.wait_and_remove_from_entry(it->first, it->second, counter_lgkm); + if (imm.vs != wait_imm::unset_counter && imm.vs <= it->second.imm.vs) + ctx.wait_and_remove_from_entry(it->first, it->second, counter_vs); if (!it->second.counters) it = ctx.gpr_map.erase(it); else @@ -477,12 +598,19 @@ wait_imm kill(Instruction* instr, wait_ctx& ctx) return imm; } -void update_barrier_imm(wait_ctx& ctx, uint8_t counters, barrier_interaction barrier) +void update_barrier_counter(uint8_t *ctr, unsigned max) { - unsigned barrier_index = ffs(barrier) - 1; - for (unsigned i = 0; i < barrier_count; i++) { + if (*ctr != wait_imm::unset_counter && *ctr < max) + (*ctr)++; +} + +void update_barrier_imm(wait_ctx& ctx, uint8_t counters, wait_event event, memory_sync_info sync) +{ + for (unsigned i = 0; i < storage_count; i++) { wait_imm& bar = ctx.barrier_imm[i]; - if (i == barrier_index) { + uint16_t& bar_ev = ctx.barrier_events[i]; + if (sync.storage & (1 << i) && !(sync.semantics & semantic_private)) { + bar_ev |= event; if (counters & counter_lgkm) bar.lgkm = 0; if (counters & counter_vm) @@ -491,20 +619,20 @@ void update_barrier_imm(wait_ctx& ctx, uint8_t counters, barrier_interaction bar bar.exp = 0; if (counters & counter_vs) bar.vs = 0; - } else { - if (counters & counter_lgkm && bar.lgkm != wait_imm::unset_counter && bar.lgkm < ctx.max_lgkm_cnt) - bar.lgkm++; - if (counters & counter_vm && bar.vm != wait_imm::unset_counter && bar.vm < ctx.max_vm_cnt) - bar.vm++; - if (counters & counter_exp && bar.exp != wait_imm::unset_counter && bar.exp < ctx.max_exp_cnt) - bar.exp++; - if (counters & counter_vs && bar.vs != wait_imm::unset_counter && bar.vs < ctx.max_vs_cnt) - bar.vs++; + } else if (!(bar_ev & ctx.unordered_events) && !(ctx.unordered_events & event)) { + if (counters & counter_lgkm && (bar_ev & lgkm_events) == event) + update_barrier_counter(&bar.lgkm, ctx.max_lgkm_cnt); + if (counters & counter_vm && (bar_ev & vm_events) == event) + update_barrier_counter(&bar.vm, ctx.max_vm_cnt); + if (counters & counter_exp && (bar_ev & exp_events) == event) + update_barrier_counter(&bar.exp, ctx.max_exp_cnt); + if (counters & counter_vs && (bar_ev & vs_events) == event) + update_barrier_counter(&bar.vs, ctx.max_vs_cnt); } } } -void update_counters(wait_ctx& ctx, wait_event event, barrier_interaction barrier=barrier_none) +void update_counters(wait_ctx& ctx, wait_event event, memory_sync_info sync=memory_sync_info()) { uint8_t counters = get_counters_for_event(event); @@ -517,7 +645,7 @@ void update_counters(wait_ctx& ctx, wait_event event, barrier_interaction barrie if (counters & counter_vs && ctx.vs_cnt <= ctx.max_vs_cnt) ctx.vs_cnt++; - update_barrier_imm(ctx, counters, barrier); + update_barrier_imm(ctx, counters, event, sync); if (ctx.unordered_events & event) return; @@ -546,7 +674,7 @@ void update_counters(wait_ctx& ctx, wait_event event, barrier_interaction barrie } } -void update_counters_for_flat_load(wait_ctx& ctx, barrier_interaction barrier=barrier_none) +void update_counters_for_flat_load(wait_ctx& ctx, memory_sync_info sync=memory_sync_info()) { assert(ctx.chip_class < GFX10); @@ -555,7 +683,7 @@ void update_counters_for_flat_load(wait_ctx& ctx, barrier_interaction barrier=ba if (ctx.vm_cnt <= ctx.max_vm_cnt) ctx.vm_cnt++; - update_barrier_imm(ctx, counter_vm | counter_lgkm, barrier); + update_barrier_imm(ctx, counter_vm | counter_lgkm, event_flat, sync); for (std::pair e : ctx.gpr_map) { @@ -568,7 +696,8 @@ void update_counters_for_flat_load(wait_ctx& ctx, barrier_interaction barrier=ba 
ctx.pending_flat_vm = true; } -void insert_wait_entry(wait_ctx& ctx, PhysReg reg, RegClass rc, wait_event event, bool wait_on_read) +void insert_wait_entry(wait_ctx& ctx, PhysReg reg, RegClass rc, wait_event event, bool wait_on_read, + bool has_sampler=false) { uint16_t counters = get_counters_for_event(event); wait_imm imm; @@ -582,23 +711,35 @@ void insert_wait_entry(wait_ctx& ctx, PhysReg reg, RegClass rc, wait_event event imm.vs = 0; wait_entry new_entry(event, imm, !rc.is_linear(), wait_on_read); + new_entry.has_vmem_nosampler = (event & event_vmem) && !has_sampler; + new_entry.has_vmem_sampler = (event & event_vmem) && has_sampler; for (unsigned i = 0; i < rc.size(); i++) { - auto it = ctx.gpr_map.emplace(PhysReg{reg.reg+i}, new_entry); + auto it = ctx.gpr_map.emplace(PhysReg{reg.reg()+i}, new_entry); if (!it.second) it.first->second.join(new_entry); } + + if (ctx.collect_statistics) { + unsigned counters_todo = counters; + while (counters_todo) { + unsigned i = u_bit_scan(&counters_todo); + ctx.unwaited_instrs[i].insert(std::make_pair(ctx.gen_instr, 0u)); + for (unsigned j = 0; j < rc.size(); j++) + ctx.reg_instrs[i][PhysReg{reg.reg()+j}].insert(ctx.gen_instr); + } + } } -void insert_wait_entry(wait_ctx& ctx, Operand op, wait_event event) +void insert_wait_entry(wait_ctx& ctx, Operand op, wait_event event, bool has_sampler=false) { if (!op.isConstant() && !op.isUndefined()) - insert_wait_entry(ctx, op.physReg(), op.regClass(), event, false); + insert_wait_entry(ctx, op.physReg(), op.regClass(), event, false, has_sampler); } -void insert_wait_entry(wait_ctx& ctx, Definition def, wait_event event) +void insert_wait_entry(wait_ctx& ctx, Definition def, wait_event event, bool has_sampler=false) { - insert_wait_entry(ctx, def.physReg(), def.regClass(), event, true); + insert_wait_entry(ctx, def.physReg(), def.regClass(), event, true, has_sampler); } void gen(Instruction* instr, wait_ctx& ctx) @@ -630,10 +771,11 @@ void gen(Instruction* instr, wait_ctx& ctx) break; } case Format::FLAT: { + FLAT_instruction *flat = static_cast(instr); if (ctx.chip_class < GFX10 && !instr->definitions.empty()) - update_counters_for_flat_load(ctx, barrier_buffer); + update_counters_for_flat_load(ctx, flat->sync); else - update_counters(ctx, event_flat, barrier_buffer); + update_counters(ctx, event_flat, flat->sync); if (!instr->definitions.empty()) insert_wait_entry(ctx, instr->definitions[0], event_flat); @@ -641,27 +783,26 @@ void gen(Instruction* instr, wait_ctx& ctx) } case Format::SMEM: { SMEM_instruction *smem = static_cast(instr); - update_counters(ctx, event_smem, static_cast(instr)->barrier); + update_counters(ctx, event_smem, smem->sync); if (!instr->definitions.empty()) insert_wait_entry(ctx, instr->definitions[0], event_smem); else if (ctx.chip_class >= GFX10 && - !smem->can_reorder && - smem->barrier == barrier_buffer) + !smem->sync.can_reorder()) ctx.pending_s_buffer_store = true; break; } case Format::DS: { - bool gds = static_cast(instr)->gds; - update_counters(ctx, gds ? event_gds : event_lds, gds ? barrier_none : barrier_shared); - if (gds) + DS_instruction *ds = static_cast(instr); + update_counters(ctx, ds->gds ? event_gds : event_lds, ds->sync); + if (ds->gds) update_counters(ctx, event_gds_gpr_lock); if (!instr->definitions.empty()) - insert_wait_entry(ctx, instr->definitions[0], gds ? event_gds : event_lds); + insert_wait_entry(ctx, instr->definitions[0], ds->gds ? 
event_gds : event_lds); - if (gds) { + if (ds->gds) { for (const Operand& op : instr->operands) insert_wait_entry(ctx, op, event_gds_gpr_lock); insert_wait_entry(ctx, exec, s2, event_gds_gpr_lock, false); @@ -673,18 +814,34 @@ void gen(Instruction* instr, wait_ctx& ctx) case Format::MIMG: case Format::GLOBAL: { wait_event ev = !instr->definitions.empty() || ctx.chip_class < GFX10 ? event_vmem : event_vmem_store; - update_counters(ctx, ev, get_barrier_interaction(instr)); + update_counters(ctx, ev, get_sync_info(instr)); + + bool has_sampler = instr->format == Format::MIMG && !instr->operands[1].isUndefined() && instr->operands[1].regClass() == s4; if (!instr->definitions.empty()) - insert_wait_entry(ctx, instr->definitions[0], ev); + insert_wait_entry(ctx, instr->definitions[0], ev, has_sampler); - if (instr->operands.size() == 4 && ctx.chip_class == GFX6) { + if (ctx.chip_class == GFX6 && + instr->format != Format::MIMG && + instr->operands.size() == 4) { ctx.exp_cnt++; update_counters(ctx, event_vmem_gpr_lock); insert_wait_entry(ctx, instr->operands[3], event_vmem_gpr_lock); + } else if (ctx.chip_class == GFX6 && + instr->format == Format::MIMG && + instr->operands[1].regClass().type() == RegType::vgpr) { + ctx.exp_cnt++; + update_counters(ctx, event_vmem_gpr_lock); + insert_wait_entry(ctx, instr->operands[1], event_vmem_gpr_lock); } + break; } + case Format::SOPP: { + if (instr->opcode == aco_opcode::s_sendmsg || + instr->opcode == aco_opcode::s_sendmsghalt) + update_counters(ctx, event_sendmsg); + } default: break; } @@ -713,11 +870,14 @@ void handle_block(Program *program, Block& block, wait_ctx& ctx) std::vector> new_instructions; wait_imm queued_imm; + for (aco_ptr& instr : block.instructions) { bool is_wait = !parse_wait_instr(ctx, instr.get()).empty(); - queued_imm.combine(kill(instr.get(), ctx)); + memory_sync_info sync_info = get_sync_info(instr.get()); + queued_imm.combine(kill(instr.get(), ctx, sync_info)); + ctx.gen_instr = instr.get(); gen(instr.get(), ctx); if (instr->format != Format::PSEUDO_BARRIER && !is_wait) { @@ -726,6 +886,11 @@ void handle_block(Program *program, Block& block, wait_ctx& ctx) queued_imm = wait_imm(); } new_instructions.emplace_back(std::move(instr)); + + queued_imm.combine(perform_barrier(ctx, sync_info, semantic_acquire)); + + if (ctx.collect_statistics) + ctx.advance_unwaited_instrs(); } } @@ -737,14 +902,58 @@ void handle_block(Program *program, Block& block, wait_ctx& ctx) } /* end namespace */ +static uint32_t calculate_score(std::vector &ctx_vec, uint32_t event_mask) +{ + double result = 0.0; + unsigned num_waits = 0; + while (event_mask) { + unsigned event_index = u_bit_scan(&event_mask); + for (const wait_ctx &ctx : ctx_vec) { + for (unsigned dist : ctx.wait_distances[event_index]) { + double score = dist; + /* for many events, excessive distances provide little benefit, so + * decrease the score in that case. */ + double threshold = INFINITY; + double inv_strength = 0.000001; + switch (1 << event_index) { + case event_smem: + threshold = 70.0; + inv_strength = 75.0; + break; + case event_vmem: + case event_vmem_store: + case event_flat: + threshold = 230.0; + inv_strength = 150.0; + break; + case event_lds: + threshold = 16.0; + break; + default: + break; + } + if (score > threshold) { + score -= threshold; + score = threshold + score / (1.0 + score / inv_strength); + } + + /* we don't want increases in high scores to hide decreases in low scores, + * so raise to the power of 0.1 before averaging. 
*/ + result += pow(score, 0.1); + num_waits++; + } + } + } + return round(pow(result / num_waits, 10.0) * 10.0); +} + void insert_wait_states(Program* program) { /* per BB ctx */ std::vector done(program->blocks.size()); - wait_ctx in_ctx[program->blocks.size()]; - wait_ctx out_ctx[program->blocks.size()]; - for (unsigned i = 0; i < program->blocks.size(); i++) - in_ctx[i] = wait_ctx(program); + std::vector in_ctx(program->blocks.size(), wait_ctx(program)); + std::vector out_ctx(program->blocks.size(), wait_ctx(program)); + std::stack loop_header_indices; unsigned loop_progress = 0; @@ -772,13 +981,15 @@ void insert_wait_states(Program* program) for (unsigned b : current.logical_preds) changed |= ctx.join(&out_ctx[b], true); - in_ctx[current.index] = ctx; - - if (done[current.index] && !changed) + if (done[current.index] && !changed) { + in_ctx[current.index] = std::move(ctx); continue; + } else { + in_ctx[current.index] = ctx; + } if (current.instructions.empty()) { - out_ctx[current.index] = ctx; + out_ctx[current.index] = std::move(ctx); continue; } @@ -787,7 +998,14 @@ void insert_wait_states(Program* program) handle_block(program, current, ctx); - out_ctx[current.index] = ctx; + out_ctx[current.index] = std::move(ctx); + } + + if (program->collect_statistics) { + program->statistics[statistic_vmem_score] = + calculate_score(out_ctx, event_vmem | event_flat | event_vmem_store); + program->statistics[statistic_smem_score] = + calculate_score(out_ctx, event_smem); } }
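
Aside (not part of the patch): the statistics scoring introduced in calculate_score() above is easier to follow with a small worked example. The sketch below is a standalone reimplementation of just the soft-clamping and averaging math, using the VMEM parameters that appear in the patch (threshold 230.0, inv_strength 150.0); the sample wait distances and the file itself are made up purely for illustration and are not part of the commit.

// score_sketch.cpp -- illustrative only, mirrors the math in calculate_score()
#include <cmath>
#include <cstdio>
#include <vector>

/* Soft-clamp a single wait distance: below "threshold" the score is the raw
 * distance; above it, the excess contributes with diminishing returns
 * controlled by "inv_strength" (larger value = slower saturation). */
static double clamp_score(double dist, double threshold, double inv_strength)
{
   if (dist <= threshold)
      return dist;
   double excess = dist - threshold;
   return threshold + excess / (1.0 + excess / inv_strength);
}

int main()
{
   /* hypothetical VMEM wait distances (instructions between issue and waitcnt) */
   std::vector<double> distances = {4, 35, 120, 400, 900};

   /* VMEM parameters from calculate_score(): threshold 230.0, inv_strength 150.0 */
   double sum = 0.0;
   for (double d : distances)
      sum += std::pow(clamp_score(d, 230.0, 150.0), 0.1); /* compress before averaging */

   /* undo the compression on the mean and scale, as the pass does */
   double score = std::round(std::pow(sum / distances.size(), 10.0) * 10.0);
   printf("vmem score for sample distances: %.0f\n", score);
   return 0;
}

As the in-pass comment notes, raising each score to the power of 0.1 before averaging keeps a few very large distances from hiding improvements to the smaller ones; the final pow(..., 10.0) then undoes that compression on the mean before scaling.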