From d1f992f3c2d138faa0c89a2486c4252a06886106 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Fri, 26 Jun 2020 15:54:22 +0100 Subject: [PATCH] aco: rework barriers and replace can_reorder MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit fossil-db (Navi): Totals from 273 (0.21% of 132058) affected shaders: CodeSize: 937472 -> 936556 (-0.10%) Instrs: 158874 -> 158648 (-0.14%) Cycles: 13563516 -> 13562612 (-0.01%) VMEM: 85246 -> 85244 (-0.00%) SMEM: 21407 -> 21310 (-0.45%); split: +0.05%, -0.50% VClause: 9321 -> 9317 (-0.04%) Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_dead_code_analysis.cpp | 2 +- src/amd/compiler/aco_insert_waitcnt.cpp | 116 +++++---- .../compiler/aco_instruction_selection.cpp | 182 +++++++------- src/amd/compiler/aco_ir.cpp | 22 ++ src/amd/compiler/aco_ir.h | 138 ++++++++--- src/amd/compiler/aco_lower_to_hw_instr.cpp | 13 + src/amd/compiler/aco_opcodes.py | 15 +- src/amd/compiler/aco_opt_value_numbering.cpp | 23 +- src/amd/compiler/aco_optimizer.cpp | 3 +- src/amd/compiler/aco_print_ir.cpp | 97 ++++++-- src/amd/compiler/aco_scheduler.cpp | 234 +++++++++--------- src/amd/compiler/aco_spill.cpp | 15 +- 12 files changed, 520 insertions(+), 340 deletions(-) diff --git a/src/amd/compiler/aco_dead_code_analysis.cpp b/src/amd/compiler/aco_dead_code_analysis.cpp index 87b3112dc97..f43d784c55f 100644 --- a/src/amd/compiler/aco_dead_code_analysis.cpp +++ b/src/amd/compiler/aco_dead_code_analysis.cpp @@ -84,7 +84,7 @@ bool is_dead(const std::vector& uses, Instruction *instr) if (std::any_of(instr->definitions.begin(), instr->definitions.end(), [&uses] (const Definition& def) { return uses[def.tempId()];})) return false; - return !instr_info.is_atomic[(int)instr->opcode]; + return !(get_sync_info(instr).semantics & (semantic_volatile | semantic_acqrel)); } std::vector dead_code_analysis(Program *program) { diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp index aedcad1b573..fa2b5234d2e 100644 --- a/src/amd/compiler/aco_insert_waitcnt.cpp +++ b/src/amd/compiler/aco_insert_waitcnt.cpp @@ -273,8 +273,8 @@ struct wait_ctx { bool pending_flat_vm = false; bool pending_s_buffer_store = false; /* GFX10 workaround */ - wait_imm barrier_imm[barrier_count]; - uint16_t barrier_events[barrier_count] = {}; /* use wait_event notion */ + wait_imm barrier_imm[storage_count]; + uint16_t barrier_events[storage_count] = {}; /* use wait_event notion */ std::map gpr_map; @@ -327,7 +327,7 @@ struct wait_ctx { } } - for (unsigned i = 0; i < barrier_count; i++) { + for (unsigned i = 0; i < storage_count; i++) { changed |= barrier_imm[i].combine(other->barrier_imm[i]); changed |= other->barrier_events[i] & ~barrier_events[i]; barrier_events[i] |= other->barrier_events[i]; @@ -444,7 +444,31 @@ wait_imm parse_wait_instr(wait_ctx& ctx, Instruction *instr) return wait_imm(); } -wait_imm kill(Instruction* instr, wait_ctx& ctx) +wait_imm perform_barrier(wait_ctx& ctx, memory_sync_info sync, unsigned semantics) +{ + wait_imm imm; + sync_scope subgroup_scope = ctx.program->workgroup_size <= ctx.program->wave_size ? 
scope_workgroup : scope_subgroup; + if (sync.semantics & semantics) { + unsigned storage = sync.storage; + while (storage) { + unsigned idx = u_bit_scan(&storage); + + /* LDS is private to the workgroup */ + sync_scope bar_scope_lds = MIN2(sync.scope, scope_workgroup); + + uint16_t events = ctx.barrier_events[idx]; + if (bar_scope_lds <= subgroup_scope) + events &= ~event_lds; + + if (events) + imm.combine(ctx.barrier_imm[idx]); + } + } + + return imm; +} + +wait_imm kill(Instruction* instr, wait_ctx& ctx, memory_sync_info sync_info) { wait_imm imm; if (ctx.exp_cnt || ctx.vm_cnt || ctx.lgkm_cnt) @@ -471,44 +495,15 @@ wait_imm kill(Instruction* instr, wait_ctx& ctx) SMEM_instruction *smem = static_cast(instr); if (ctx.pending_s_buffer_store && !smem->definitions.empty() && - !smem->can_reorder && smem->barrier == barrier_buffer) { + !smem->sync.can_reorder()) { imm.lgkm = 0; } } - if (instr->format == Format::PSEUDO_BARRIER) { - switch (instr->opcode) { - case aco_opcode::p_memory_barrier_common: - imm.combine(ctx.barrier_imm[ffs(barrier_atomic) - 1]); - imm.combine(ctx.barrier_imm[ffs(barrier_buffer) - 1]); - imm.combine(ctx.barrier_imm[ffs(barrier_image) - 1]); - if (ctx.program->workgroup_size > ctx.program->wave_size) - imm.combine(ctx.barrier_imm[ffs(barrier_shared) - 1]); - break; - case aco_opcode::p_memory_barrier_atomic: - imm.combine(ctx.barrier_imm[ffs(barrier_atomic) - 1]); - break; - /* see comment in aco_scheduler.cpp's can_move_instr() on why these barriers are merged */ - case aco_opcode::p_memory_barrier_buffer: - case aco_opcode::p_memory_barrier_image: - imm.combine(ctx.barrier_imm[ffs(barrier_buffer) - 1]); - imm.combine(ctx.barrier_imm[ffs(barrier_image) - 1]); - break; - case aco_opcode::p_memory_barrier_shared: - if (ctx.program->workgroup_size > ctx.program->wave_size) - imm.combine(ctx.barrier_imm[ffs(barrier_shared) - 1]); - break; - case aco_opcode::p_memory_barrier_gs_data: - imm.combine(ctx.barrier_imm[ffs(barrier_gs_data) - 1]); - break; - case aco_opcode::p_memory_barrier_gs_sendmsg: - imm.combine(ctx.barrier_imm[ffs(barrier_gs_sendmsg) - 1]); - break; - default: - assert(false); - break; - } - } + if (instr->opcode == aco_opcode::p_barrier) + imm.combine(perform_barrier(ctx, static_cast(instr)->sync, semantic_acqrel)); + else + imm.combine(perform_barrier(ctx, sync_info, semantic_release)); if (!imm.empty()) { if (ctx.pending_flat_vm && imm.vm != wait_imm::unset_counter) @@ -523,7 +518,7 @@ wait_imm kill(Instruction* instr, wait_ctx& ctx) ctx.vs_cnt = std::min(ctx.vs_cnt, imm.vs); /* update barrier wait imms */ - for (unsigned i = 0; i < barrier_count; i++) { + for (unsigned i = 0; i < storage_count; i++) { wait_imm& bar = ctx.barrier_imm[i]; uint16_t& bar_ev = ctx.barrier_events[i]; if (bar.exp != wait_imm::unset_counter && imm.exp <= bar.exp) { @@ -581,12 +576,12 @@ void update_barrier_counter(uint8_t *ctr, unsigned max) (*ctr)++; } -void update_barrier_imm(wait_ctx& ctx, uint8_t counters, wait_event event, barrier_interaction barrier) +void update_barrier_imm(wait_ctx& ctx, uint8_t counters, wait_event event, memory_sync_info sync) { - for (unsigned i = 0; i < barrier_count; i++) { + for (unsigned i = 0; i < storage_count; i++) { wait_imm& bar = ctx.barrier_imm[i]; uint16_t& bar_ev = ctx.barrier_events[i]; - if (barrier & (1 << i)) { + if (sync.storage & (1 << i) && !(sync.semantics & semantic_private)) { bar_ev |= event; if (counters & counter_lgkm) bar.lgkm = 0; @@ -609,7 +604,7 @@ void update_barrier_imm(wait_ctx& ctx, uint8_t counters, wait_event 
event, barri } } -void update_counters(wait_ctx& ctx, wait_event event, barrier_interaction barrier=barrier_none) +void update_counters(wait_ctx& ctx, wait_event event, memory_sync_info sync=memory_sync_info()) { uint8_t counters = get_counters_for_event(event); @@ -622,7 +617,7 @@ void update_counters(wait_ctx& ctx, wait_event event, barrier_interaction barrie if (counters & counter_vs && ctx.vs_cnt <= ctx.max_vs_cnt) ctx.vs_cnt++; - update_barrier_imm(ctx, counters, event, barrier); + update_barrier_imm(ctx, counters, event, sync); if (ctx.unordered_events & event) return; @@ -651,7 +646,7 @@ void update_counters(wait_ctx& ctx, wait_event event, barrier_interaction barrie } } -void update_counters_for_flat_load(wait_ctx& ctx, barrier_interaction barrier=barrier_none) +void update_counters_for_flat_load(wait_ctx& ctx, memory_sync_info sync=memory_sync_info()) { assert(ctx.chip_class < GFX10); @@ -660,7 +655,7 @@ void update_counters_for_flat_load(wait_ctx& ctx, barrier_interaction barrier=ba if (ctx.vm_cnt <= ctx.max_vm_cnt) ctx.vm_cnt++; - update_barrier_imm(ctx, counter_vm | counter_lgkm, event_flat, barrier); + update_barrier_imm(ctx, counter_vm | counter_lgkm, event_flat, sync); for (std::pair e : ctx.gpr_map) { @@ -748,10 +743,11 @@ void gen(Instruction* instr, wait_ctx& ctx) break; } case Format::FLAT: { + FLAT_instruction *flat = static_cast(instr); if (ctx.chip_class < GFX10 && !instr->definitions.empty()) - update_counters_for_flat_load(ctx, barrier_buffer); + update_counters_for_flat_load(ctx, flat->sync); else - update_counters(ctx, event_flat, barrier_buffer); + update_counters(ctx, event_flat, flat->sync); if (!instr->definitions.empty()) insert_wait_entry(ctx, instr->definitions[0], event_flat); @@ -759,27 +755,26 @@ void gen(Instruction* instr, wait_ctx& ctx) } case Format::SMEM: { SMEM_instruction *smem = static_cast(instr); - update_counters(ctx, event_smem, static_cast(instr)->barrier); + update_counters(ctx, event_smem, smem->sync); if (!instr->definitions.empty()) insert_wait_entry(ctx, instr->definitions[0], event_smem); else if (ctx.chip_class >= GFX10 && - !smem->can_reorder && - smem->barrier == barrier_buffer) + !smem->sync.can_reorder()) ctx.pending_s_buffer_store = true; break; } case Format::DS: { - bool gds = static_cast(instr)->gds; - update_counters(ctx, gds ? event_gds : event_lds, gds ? barrier_none : barrier_shared); - if (gds) + DS_instruction *ds = static_cast(instr); + update_counters(ctx, ds->gds ? event_gds : event_lds, ds->sync); + if (ds->gds) update_counters(ctx, event_gds_gpr_lock); if (!instr->definitions.empty()) - insert_wait_entry(ctx, instr->definitions[0], gds ? event_gds : event_lds); + insert_wait_entry(ctx, instr->definitions[0], ds->gds ? event_gds : event_lds); - if (gds) { + if (ds->gds) { for (const Operand& op : instr->operands) insert_wait_entry(ctx, op, event_gds_gpr_lock); insert_wait_entry(ctx, exec, s2, event_gds_gpr_lock, false); @@ -791,7 +786,7 @@ void gen(Instruction* instr, wait_ctx& ctx) case Format::MIMG: case Format::GLOBAL: { wait_event ev = !instr->definitions.empty() || ctx.chip_class < GFX10 ? 
event_vmem : event_vmem_store; - update_counters(ctx, ev, get_barrier_interaction(instr)); + update_counters(ctx, ev, get_sync_info(instr)); bool has_sampler = instr->format == Format::MIMG && !instr->operands[1].isUndefined() && instr->operands[1].regClass() == s4; @@ -817,7 +812,7 @@ void gen(Instruction* instr, wait_ctx& ctx) case Format::SOPP: { if (instr->opcode == aco_opcode::s_sendmsg || instr->opcode == aco_opcode::s_sendmsghalt) - update_counters(ctx, event_sendmsg, get_barrier_interaction(instr)); + update_counters(ctx, event_sendmsg); } default: break; @@ -851,7 +846,8 @@ void handle_block(Program *program, Block& block, wait_ctx& ctx) for (aco_ptr& instr : block.instructions) { bool is_wait = !parse_wait_instr(ctx, instr.get()).empty(); - queued_imm.combine(kill(instr.get(), ctx)); + memory_sync_info sync_info = get_sync_info(instr.get()); + queued_imm.combine(kill(instr.get(), ctx, sync_info)); ctx.gen_instr = instr.get(); gen(instr.get(), ctx); @@ -863,6 +859,8 @@ void handle_block(Program *program, Block& block, wait_ctx& ctx) } new_instructions.emplace_back(std::move(instr)); + queued_imm.combine(perform_barrier(ctx, sync_info, semantic_acquire)); + if (ctx.collect_statistics) ctx.advance_unwaited_instrs(); } diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 80ea1e133d6..81c50ebbc9c 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -3137,8 +3137,7 @@ struct LoadEmitInfo { bool glc = false; unsigned swizzle_component_size = 0; - barrier_interaction barrier = barrier_none; - bool can_reorder = true; + memory_sync_info sync; Temp soffset = Temp(0, s1); }; @@ -3441,10 +3440,12 @@ Temp lds_load_callback(Builder& bld, const LoadEmitInfo *info, RegClass rc = RegClass(RegType::vgpr, DIV_ROUND_UP(size, 4)); Temp val = rc == info->dst.regClass() && dst_hint.id() ? 
dst_hint : bld.tmp(rc); + Instruction *instr; if (read2) - bld.ds(op, Definition(val), offset, m, const_offset, const_offset + 1); + instr = bld.ds(op, Definition(val), offset, m, const_offset, const_offset + 1); else - bld.ds(op, Definition(val), offset, m, const_offset); + instr = bld.ds(op, Definition(val), offset, m, const_offset); + static_cast(instr)->sync = info->sync; if (size < 4) val = bld.pseudo(aco_opcode::p_extract_vector, bld.def(RegClass::get(RegType::vgpr, size)), val, Operand(0u)); @@ -3490,8 +3491,7 @@ Temp smem_load_callback(Builder& bld, const LoadEmitInfo *info, load->definitions[0] = Definition(val); load->glc = info->glc; load->dlc = info->glc && bld.program->chip_class >= GFX10; - load->barrier = info->barrier; - load->can_reorder = false; // FIXME: currently, it doesn't seem beneficial due to how our scheduler works + load->sync = info->sync; bld.insert(std::move(load)); return val; } @@ -3540,8 +3540,7 @@ Temp mubuf_load_callback(Builder& bld, const LoadEmitInfo *info, mubuf->offen = (offset.type() == RegType::vgpr); mubuf->glc = info->glc; mubuf->dlc = info->glc && bld.program->chip_class >= GFX10; - mubuf->barrier = info->barrier; - mubuf->can_reorder = info->can_reorder; + mubuf->sync = info->sync; mubuf->offset = const_offset; mubuf->swizzled = info->swizzle_component_size != 0; RegClass rc = RegClass::get(RegType::vgpr, bytes_size); @@ -3605,7 +3604,7 @@ Temp global_load_callback(Builder& bld, const LoadEmitInfo *info, mubuf->offset = 0; mubuf->addr64 = offset.type() == RegType::vgpr; mubuf->disable_wqm = false; - mubuf->barrier = info->barrier; + mubuf->sync = info->sync; mubuf->definitions[0] = Definition(val); bld.insert(std::move(mubuf)); } else { @@ -3616,7 +3615,7 @@ Temp global_load_callback(Builder& bld, const LoadEmitInfo *info, flat->operands[1] = Operand(s1); flat->glc = info->glc; flat->dlc = info->glc && bld.program->chip_class >= GFX10; - flat->barrier = info->barrier; + flat->sync = info->sync; flat->offset = 0u; flat->definitions[0] = Definition(val); bld.insert(std::move(flat)); @@ -3638,8 +3637,7 @@ Temp load_lds(isel_context *ctx, unsigned elem_size_bytes, Temp dst, LoadEmitInfo info = {Operand(as_vgpr(ctx, address)), dst, num_components, elem_size_bytes}; info.align_mul = align; info.align_offset = 0; - info.barrier = barrier_shared; - info.can_reorder = false; + info.sync = memory_sync_info(storage_shared); info.const_offset = base_offset; emit_lds_load(ctx, bld, &info); @@ -3848,13 +3846,16 @@ void store_lds(isel_context *ctx, unsigned elem_size_bytes, Temp data, uint32_t } assert(inline_offset <= max_offset); /* offsets[i] shouldn't be large enough for this to happen */ + Instruction *instr; if (write2) { Temp second_data = write_datas[second]; inline_offset /= data.bytes(); - bld.ds(op, address_offset, data, second_data, m, inline_offset, inline_offset + write2_off); + instr = bld.ds(op, address_offset, data, second_data, m, inline_offset, inline_offset + write2_off); } else { - bld.ds(op, address_offset, data, m, inline_offset); + instr = bld.ds(op, address_offset, data, m, inline_offset); } + static_cast(instr)->sync = + memory_sync_info(storage_shared); } } @@ -4017,7 +4018,8 @@ void emit_single_mubuf_store(isel_context *ctx, Temp descriptor, Temp voffset, T /* idxen*/ false, /* addr64 */ false, /* disable_wqm */ false, /* glc */ true, /* dlc*/ false, /* slc */ slc); - static_cast(r.instr)->can_reorder = allow_reorder; + if (!allow_reorder) + static_cast(r.instr)->sync = memory_sync_info(storage_buffer, semantic_private); } void 
store_vmem_mubuf(isel_context *ctx, Temp src, Temp descriptor, Temp voffset, Temp soffset, @@ -4847,15 +4849,13 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) } if (use_mubuf) { - Instruction *mubuf = bld.mubuf(opcode, - Definition(fetch_dst), list, fetch_index, soffset, - fetch_offset, false, false, true).instr; - static_cast(mubuf)->can_reorder = true; + bld.mubuf(opcode, + Definition(fetch_dst), list, fetch_index, soffset, + fetch_offset, false, false, true).instr; } else { - Instruction *mtbuf = bld.mtbuf(opcode, - Definition(fetch_dst), list, fetch_index, soffset, - fetch_dfmt, nfmt, fetch_offset, false, true).instr; - static_cast(mtbuf)->can_reorder = true; + bld.mtbuf(opcode, + Definition(fetch_dst), list, fetch_index, soffset, + fetch_dfmt, nfmt, fetch_offset, false, true).instr; } emit_split_vector(ctx, fetch_dst, fetch_dst.size()); @@ -5208,7 +5208,7 @@ void visit_load_resource(isel_context *ctx, nir_intrinsic_instr *instr) void load_buffer(isel_context *ctx, unsigned num_components, unsigned component_size, Temp dst, Temp rsrc, Temp offset, unsigned align_mul, unsigned align_offset, - bool glc=false, bool readonly=true, bool allow_smem=true) + bool glc=false, bool allow_smem=true, memory_sync_info sync=memory_sync_info()) { Builder bld(ctx->program, ctx->block); @@ -5218,8 +5218,7 @@ void load_buffer(isel_context *ctx, unsigned num_components, unsigned component_ LoadEmitInfo info = {Operand(offset), dst, num_components, component_size, rsrc}; info.glc = glc; - info.barrier = readonly ? barrier_none : barrier_buffer; - info.can_reorder = readonly; + info.sync = sync; info.align_mul = align_mul; info.align_offset = align_offset; if (use_smem) @@ -5737,7 +5736,6 @@ static Temp adjust_sample_index_using_fmask(isel_context *ctx, bool da, std::vec load->unrm = true; load->da = da; load->dim = dim; - load->can_reorder = true; /* fmask images shouldn't be modified */ ctx->block->instructions.emplace_back(std::move(load)); Operand sample_index4; @@ -5837,6 +5835,22 @@ static Temp get_image_coords(isel_context *ctx, const nir_intrinsic_instr *instr } +memory_sync_info get_memory_sync_info(nir_intrinsic_instr *instr, storage_class storage, unsigned semantics) +{ + /* atomicrmw might not have NIR_INTRINSIC_ACCESS and there's nothing interesting there anyway */ + if (semantics & semantic_atomicrmw) + return memory_sync_info(storage, semantics); + + unsigned access = nir_intrinsic_access(instr); + + if (access & ACCESS_VOLATILE) + semantics |= semantic_volatile; + if (access & ACCESS_CAN_REORDER) + semantics |= semantic_can_reorder | semantic_private; + + return memory_sync_info(storage, semantics); +} + void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr) { Builder bld(ctx->program, ctx->block); @@ -5846,6 +5860,8 @@ void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr) bool is_array = glsl_sampler_type_is_array(type); Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0); + if (dim == GLSL_SAMPLER_DIM_BUF) { unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa); unsigned num_channels = util_last_bit(mask); @@ -5882,7 +5898,7 @@ void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr) load->idxen = true; load->glc = var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT); load->dlc = load->glc && ctx->options->chip_class >= GFX10; - load->barrier = barrier_image; + load->sync = sync; ctx->block->instructions.emplace_back(std::move(load)); 
expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, (1 << num_channels) - 1); @@ -5914,7 +5930,7 @@ void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr) load->dmask = dmask; load->unrm = true; load->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type)); - load->barrier = barrier_image; + load->sync = sync; ctx->block->instructions.emplace_back(std::move(load)); expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, dmask); @@ -5929,6 +5945,7 @@ void visit_image_store(isel_context *ctx, nir_intrinsic_instr *instr) bool is_array = glsl_sampler_type_is_array(type); Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa)); + memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0); bool glc = ctx->options->chip_class == GFX6 || var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE) ? 1 : 0; if (dim == GLSL_SAMPLER_DIM_BUF) { @@ -5960,7 +5977,7 @@ void visit_image_store(isel_context *ctx, nir_intrinsic_instr *instr) store->glc = glc; store->dlc = false; store->disable_wqm = true; - store->barrier = barrier_image; + store->sync = sync; ctx->program->needs_exact = true; ctx->block->instructions.emplace_back(std::move(store)); return; @@ -5984,7 +6001,7 @@ void visit_image_store(isel_context *ctx, nir_intrinsic_instr *instr) store->unrm = true; store->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type)); store->disable_wqm = true; - store->barrier = barrier_image; + store->sync = sync; ctx->program->needs_exact = true; ctx->block->instructions.emplace_back(std::move(store)); return; @@ -6062,6 +6079,7 @@ void visit_image_atomic(isel_context *ctx, nir_intrinsic_instr *instr) } Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + memory_sync_info sync = get_memory_sync_info(instr, storage_image, semantic_atomicrmw); if (dim == GLSL_SAMPLER_DIM_BUF) { Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1); @@ -6079,7 +6097,7 @@ void visit_image_atomic(isel_context *ctx, nir_intrinsic_instr *instr) mubuf->glc = return_previous; mubuf->dlc = false; /* Not needed for atomics */ mubuf->disable_wqm = true; - mubuf->barrier = barrier_image; + mubuf->sync = sync; ctx->program->needs_exact = true; ctx->block->instructions.emplace_back(std::move(mubuf)); return; @@ -6100,7 +6118,7 @@ void visit_image_atomic(isel_context *ctx, nir_intrinsic_instr *instr) mimg->unrm = true; mimg->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type)); mimg->disable_wqm = true; - mimg->barrier = barrier_image; + mimg->sync = sync; ctx->program->needs_exact = true; ctx->block->instructions.emplace_back(std::move(mimg)); return; @@ -6164,7 +6182,6 @@ void visit_image_size(isel_context *ctx, nir_intrinsic_instr *instr) mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array); mimg->dmask = (1 << instr->dest.ssa.num_components) - 1; mimg->da = glsl_sampler_type_is_array(type); - mimg->can_reorder = true; Definition& def = mimg->definitions[0]; ctx->block->instructions.emplace_back(std::move(mimg)); @@ -6219,7 +6236,8 @@ void visit_load_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) allow_smem |= ((access & ACCESS_RESTRICT) && (access & ACCESS_NON_WRITEABLE)) || (access & ACCESS_CAN_REORDER); load_buffer(ctx, num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa), - nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), glc, false, allow_smem); + nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), glc, allow_smem, + 
get_memory_sync_info(instr, storage_buffer, 0)); } void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) @@ -6233,6 +6251,7 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[1].ssa)); rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u)); + memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0); bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE); uint32_t flags = get_all_buffer_resource_flags(ctx, instr->src[1].ssa, nir_intrinsic_access(instr)); /* GLC bypasses VMEM/SMEM caches, so GLC SMEM loads/stores are coherent with GLC VMEM loads/stores @@ -6275,7 +6294,7 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) store->glc = glc; store->dlc = false; store->disable_wqm = true; - store->barrier = barrier_buffer; + store->sync = sync; ctx->block->instructions.emplace_back(std::move(store)); ctx->program->wb_smem_l1_on_end = true; if (op == aco_opcode::p_fs_buffer_store_smem) { @@ -6293,7 +6312,7 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) store->glc = glc; store->dlc = false; store->disable_wqm = true; - store->barrier = barrier_buffer; + store->sync = sync; ctx->program->needs_exact = true; ctx->block->instructions.emplace_back(std::move(store)); } @@ -6384,7 +6403,7 @@ void visit_atomic_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) mubuf->glc = return_previous; mubuf->dlc = false; /* Not needed for atomics */ mubuf->disable_wqm = true; - mubuf->barrier = barrier_buffer; + mubuf->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw); ctx->program->needs_exact = true; ctx->block->instructions.emplace_back(std::move(mubuf)); } @@ -6409,8 +6428,7 @@ void visit_load_global(isel_context *ctx, nir_intrinsic_instr *instr) info.glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT); info.align_mul = nir_intrinsic_align_mul(instr); info.align_offset = nir_intrinsic_align_offset(instr); - info.barrier = barrier_buffer; - info.can_reorder = false; + info.sync = get_memory_sync_info(instr, storage_buffer, 0); /* VMEM stores don't update the SMEM cache and it's difficult to prove that * it's safe to use SMEM */ bool can_use_smem = nir_intrinsic_access(instr) & ACCESS_NON_WRITEABLE; @@ -6430,6 +6448,7 @@ void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr) Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); Temp addr = get_ssa_temp(ctx, instr->src[1].ssa); + memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0); bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE); if (ctx->options->chip_class >= GFX7) @@ -6495,7 +6514,7 @@ void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr) flat->dlc = false; flat->offset = offset; flat->disable_wqm = true; - flat->barrier = barrier_buffer; + flat->sync = sync; ctx->program->needs_exact = true; ctx->block->instructions.emplace_back(std::move(flat)); } else { @@ -6515,7 +6534,7 @@ void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr) mubuf->offset = offsets[i]; mubuf->addr64 = addr.type() == RegType::vgpr; mubuf->disable_wqm = true; - mubuf->barrier = barrier_buffer; + mubuf->sync = sync; ctx->program->needs_exact = true; ctx->block->instructions.emplace_back(std::move(mubuf)); } @@ -6608,7 +6627,7 @@ void visit_global_atomic(isel_context *ctx, nir_intrinsic_instr 
*instr) flat->dlc = false; /* Not needed for atomics */ flat->offset = 0; flat->disable_wqm = true; - flat->barrier = barrier_buffer; + flat->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw); ctx->program->needs_exact = true; ctx->block->instructions.emplace_back(std::move(flat)); } else { @@ -6675,7 +6694,7 @@ void visit_global_atomic(isel_context *ctx, nir_intrinsic_instr *instr) mubuf->offset = 0; mubuf->addr64 = addr.type() == RegType::vgpr; mubuf->disable_wqm = true; - mubuf->barrier = barrier_buffer; + mubuf->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw); ctx->program->needs_exact = true; ctx->block->instructions.emplace_back(std::move(mubuf)); } @@ -6683,20 +6702,30 @@ void visit_global_atomic(isel_context *ctx, nir_intrinsic_instr *instr) void emit_memory_barrier(isel_context *ctx, nir_intrinsic_instr *instr) { Builder bld(ctx->program, ctx->block); + storage_class all_mem = (storage_class)(storage_buffer | storage_image | storage_atomic_counter | storage_shared); switch(instr->intrinsic) { case nir_intrinsic_group_memory_barrier: + bld.barrier(aco_opcode::p_barrier, + memory_sync_info(all_mem, semantic_acqrel, scope_workgroup)); + break; case nir_intrinsic_memory_barrier: - bld.barrier(aco_opcode::p_memory_barrier_common); + bld.barrier(aco_opcode::p_barrier, + memory_sync_info(all_mem, semantic_acqrel, scope_device)); break; case nir_intrinsic_memory_barrier_buffer: - bld.barrier(aco_opcode::p_memory_barrier_buffer); - break; case nir_intrinsic_memory_barrier_image: - bld.barrier(aco_opcode::p_memory_barrier_image); + /* since NIR splits barriers, we have to unify buffer and image barriers + * for now so dEQP-VK.memory_model.message_passing.core11.u32.coherent. + * fence_fence.atomicwrite.device.payload_nonlocal.buffer.guard_nonlocal.image.comp + * passes + */ + bld.barrier(aco_opcode::p_barrier, + memory_sync_info((storage_class)(storage_buffer | storage_image), semantic_acqrel, scope_device)); break; case nir_intrinsic_memory_barrier_tcs_patch: case nir_intrinsic_memory_barrier_shared: - bld.barrier(aco_opcode::p_memory_barrier_shared); + bld.barrier(aco_opcode::p_barrier, + memory_sync_info(storage_shared, semantic_acqrel, scope_workgroup)); break; default: unreachable("Unimplemented memory barrier intrinsic"); @@ -6844,6 +6873,7 @@ void visit_shared_atomic(isel_context *ctx, nir_intrinsic_instr *instr) ds->offset0 = offset; if (return_previous) ds->definitions[0] = Definition(get_ssa_temp(ctx, &instr->dest.ssa)); + ds->sync = memory_sync_info(storage_shared, semantic_atomicrmw); ctx->block->instructions.emplace_back(std::move(ds)); } @@ -6884,7 +6914,7 @@ void visit_load_scratch(isel_context *ctx, nir_intrinsic_instr *instr) { info.align_mul = nir_intrinsic_align_mul(instr); info.align_offset = nir_intrinsic_align_offset(instr); info.swizzle_component_size = ctx->program->chip_class <= GFX8 ? 
4 : 0; - info.can_reorder = false; + info.sync = memory_sync_info(storage_buffer, semantic_private); info.soffset = ctx->program->scratch_offset; emit_scratch_load(ctx, bld, &info); } @@ -6907,7 +6937,8 @@ void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) { for (unsigned i = 0; i < write_count; i++) { aco_opcode op = get_buffer_store_op(false, write_datas[i].bytes()); - bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i], offsets[i], true, true); + Instruction *instr = bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i], offsets[i], true, true); + static_cast(instr)->sync = memory_sync_info(storage_buffer, semantic_private); } } @@ -7021,8 +7052,7 @@ void visit_emit_vertex_with_counter(isel_context *ctx, nir_intrinsic_instr *inst mtbuf->offset = const_offset; mtbuf->glc = true; mtbuf->slc = true; - mtbuf->barrier = barrier_gs_data; - mtbuf->can_reorder = true; + mtbuf->sync = memory_sync_info(storage_vmem_output, semantic_can_reorder); bld.insert(std::move(mtbuf)); } @@ -7347,8 +7377,6 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) load->glc = false; load->dlc = false; load->disable_wqm = false; - load->barrier = barrier_none; - load->can_reorder = true; ctx->block->instructions.emplace_back(std::move(load)); } @@ -7531,17 +7559,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) visit_get_buffer_size(ctx, instr); break; case nir_intrinsic_control_barrier: { - if (ctx->program->chip_class == GFX6 && ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) { - /* GFX6 only (thanks to a hw bug workaround): - * The real barrier instruction isn’t needed, because an entire patch - * always fits into a single wave. - */ - break; - } - - if (ctx->program->workgroup_size > ctx->program->wave_size) - bld.sopp(aco_opcode::s_barrier); - + bld.barrier(aco_opcode::p_barrier, memory_sync_info(0, 0, scope_invocation), scope_workgroup); break; } case nir_intrinsic_memory_barrier_tcs_patch: @@ -8093,7 +8111,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) aco_opcode opcode = nir_intrinsic_memory_scope(instr) == NIR_SCOPE_DEVICE ? 
aco_opcode::s_memrealtime : aco_opcode::s_memtime; - bld.smem(opcode, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), false); + bld.smem(opcode, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), memory_sync_info(0, semantic_volatile)); emit_split_vector(ctx, get_ssa_temp(ctx, &instr->dest.ssa), 2); break; } @@ -8674,7 +8692,6 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) tex->da = da; tex->definitions[0] = Definition(tmp_dst); tex->dim = dim; - tex->can_reorder = true; ctx->block->instructions.emplace_back(std::move(tex)); if (div_by_6) { @@ -8707,7 +8724,6 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) tex->da = da; Temp size = bld.tmp(v2); tex->definitions[0] = Definition(size); - tex->can_reorder = true; ctx->block->instructions.emplace_back(std::move(tex)); emit_split_vector(ctx, size, size.size()); @@ -8809,7 +8825,6 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) mubuf->operands[2] = Operand((uint32_t) 0); mubuf->definitions[0] = Definition(tmp_dst); mubuf->idxen = true; - mubuf->can_reorder = true; ctx->block->instructions.emplace_back(std::move(mubuf)); expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, (1 << last_bit) - 1); @@ -8858,7 +8873,6 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) tex->unrm = true; tex->da = da; tex->definitions[0] = Definition(tmp_dst); - tex->can_reorder = true; ctx->block->instructions.emplace_back(std::move(tex)); if (instr->op == nir_texop_samples_identical) { @@ -9002,7 +9016,6 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) tex->dmask = dmask; tex->da = da; tex->definitions[0] = Definition(tmp_dst); - tex->can_reorder = true; ctx->block->instructions.emplace_back(std::move(tex)); if (tg4_integer_cube_workaround) { @@ -10285,6 +10298,13 @@ static void create_fs_exports(isel_context *ctx) create_null_export(ctx); } +static void create_workgroup_barrier(Builder& bld) +{ + bld.barrier(aco_opcode::p_barrier, + memory_sync_info(storage_shared, semantic_acqrel, scope_workgroup), + scope_workgroup); +} + static void write_tcs_tess_factors(isel_context *ctx) { unsigned outer_comps; @@ -10309,9 +10329,7 @@ static void write_tcs_tess_factors(isel_context *ctx) Builder bld(ctx->program, ctx->block); - bld.barrier(aco_opcode::p_memory_barrier_shared); - if (unlikely(ctx->program->chip_class != GFX6 && ctx->program->workgroup_size > ctx->program->wave_size)) - bld.sopp(aco_opcode::s_barrier); + create_workgroup_barrier(bld); Temp tcs_rel_ids = get_arg(ctx, ctx->args->ac.tcs_rel_ids); Temp invocation_id = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), tcs_rel_ids, Operand(8u), Operand(5u)); @@ -10470,7 +10488,6 @@ static void emit_stream_output(isel_context *ctx, store->glc = true; store->dlc = false; store->slc = true; - store->can_reorder = true; ctx->block->instructions.emplace_back(std::move(store)); } } @@ -10890,8 +10907,7 @@ void ngg_emit_nogs_output(isel_context *ctx) if (ctx->stage == ngg_vertex_gs) { /* Wait for GS threads to store primitive ID in LDS. */ - bld.barrier(aco_opcode::p_memory_barrier_shared); - bld.sopp(aco_opcode::s_barrier); + create_workgroup_barrier(bld); /* Calculate LDS address where the GS threads stored the primitive ID. 
*/ Temp wave_id_in_tg = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), @@ -10975,8 +10991,7 @@ void select_program(Program *program, if (i) { Builder bld(ctx.program, ctx.block); - bld.barrier(aco_opcode::p_memory_barrier_shared); - bld.sopp(aco_opcode::s_barrier); + create_workgroup_barrier(bld); if (ctx.stage == vertex_geometry_gs || ctx.stage == tess_eval_geometry_gs) { ctx.gs_wave_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, m0), bld.def(s1, scc), get_arg(&ctx, args->merged_wave_info), Operand((8u << 16) | 16u)); @@ -10999,7 +11014,8 @@ void select_program(Program *program, ngg_emit_nogs_output(&ctx); } else if (nir->info.stage == MESA_SHADER_GEOMETRY) { Builder bld(ctx.program, ctx.block); - bld.barrier(aco_opcode::p_memory_barrier_gs_data); + bld.barrier(aco_opcode::p_barrier, + memory_sync_info(storage_vmem_output, semantic_release, scope_device)); bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx.gs_wave_id), -1, sendmsg_gs_done(false, false, 0)); } else if (nir->info.stage == MESA_SHADER_TESS_CTRL) { write_tcs_tess_factors(&ctx); @@ -11031,7 +11047,7 @@ void select_program(Program *program, ctx.block->kind |= block_kind_uniform; Builder bld(ctx.program, ctx.block); if (ctx.program->wb_smem_l1_on_end) - bld.smem(aco_opcode::s_dcache_wb, false); + bld.smem(aco_opcode::s_dcache_wb, memory_sync_info(storage_buffer, semantic_volatile)); bld.sopp(aco_opcode::s_endpgm); cleanup_cfg(program); @@ -11116,8 +11132,6 @@ void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader, mubuf->glc = true; mubuf->slc = true; mubuf->dlc = args->options->chip_class >= GFX10; - mubuf->barrier = barrier_none; - mubuf->can_reorder = true; ctx.outputs.mask[i] |= 1 << j; ctx.outputs.temps[i * 4u + j] = mubuf->definitions[0].getTemp(); diff --git a/src/amd/compiler/aco_ir.cpp b/src/amd/compiler/aco_ir.cpp index 92c50cdd94e..75ca80cf2ac 100644 --- a/src/amd/compiler/aco_ir.cpp +++ b/src/amd/compiler/aco_ir.cpp @@ -127,6 +127,28 @@ void init_program(Program *program, Stage stage, struct radv_shader_info *info, program->next_fp_mode.round32 = fp_round_ne; } +memory_sync_info get_sync_info(const Instruction* instr) +{ + switch (instr->format) { + case Format::SMEM: + return static_cast(instr)->sync; + case Format::MUBUF: + return static_cast(instr)->sync; + case Format::MIMG: + return static_cast(instr)->sync; + case Format::MTBUF: + return static_cast(instr)->sync; + case Format::FLAT: + case Format::GLOBAL: + case Format::SCRATCH: + return static_cast(instr)->sync; + case Format::DS: + return static_cast(instr)->sync; + default: + return memory_sync_info(); + } +} + bool can_use_SDWA(chip_class chip, const aco_ptr& instr) { if (!instr->isVALU()) diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index 6f2a4a46b91..9dda1ffb723 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -103,22 +103,79 @@ enum class Format : std::uint16_t { SDWA = 1 << 14, }; -enum barrier_interaction : uint8_t { - barrier_none = 0, - barrier_buffer = 0x1, - barrier_image = 0x2, - barrier_atomic = 0x4, - barrier_shared = 0x8, - /* used for geometry shaders to ensure vertex data writes are before the - * GS_DONE s_sendmsg. */ - barrier_gs_data = 0x10, - /* used for geometry shaders to ensure s_sendmsg instructions are in-order. */ - barrier_gs_sendmsg = 0x20, - /* used by barriers. 
created by s_barrier */ - barrier_barrier = 0x40, - barrier_count = 7, +enum storage_class : uint8_t { + storage_none = 0x0, /* no synchronization and can be reordered around aliasing stores */ + storage_buffer = 0x1, /* SSBOs and global memory */ + storage_atomic_counter = 0x2, /* not used for Vulkan */ + storage_image = 0x4, + storage_shared = 0x8, /* or TCS output */ + storage_vmem_output = 0x10, /* GS or TCS output stores using VMEM */ + storage_scratch = 0x20, + storage_vgpr_spill = 0x40, + storage_count = 8, }; +enum memory_semantics : uint8_t { + semantic_none = 0x0, + /* for loads: don't move any access after this load to before this load (even other loads) + * for barriers: don't move any access after the barrier to before any + * atomics/control_barriers/sendmsg_gs_done before the barrier */ + semantic_acquire = 0x1, + /* for stores: don't move any access before this store to after this store + * for barriers: don't move any access before the barrier to after any + * atomics/control_barriers/sendmsg_gs_done after the barrier */ + semantic_release = 0x2, + + /* the rest are for load/stores/atomics only */ + /* cannot be DCE'd or CSE'd */ + semantic_volatile = 0x4, + /* does not interact with barriers and assumes this lane is the only lane + * accessing this memory */ + semantic_private = 0x8, + /* this operation can be reordered around operations of the same storage. says nothing about barriers */ + semantic_can_reorder = 0x10, + /* this is a atomic instruction (may only read or write memory) */ + semantic_atomic = 0x20, + /* this is instruction both reads and writes memory */ + semantic_rmw = 0x40, + + semantic_acqrel = semantic_acquire | semantic_release, + semantic_atomicrmw = semantic_volatile | semantic_atomic | semantic_rmw, +}; + +enum sync_scope : uint8_t { + scope_invocation = 0, + scope_subgroup = 1, + scope_workgroup = 2, + scope_queuefamily = 3, + scope_device = 4, +}; + +struct memory_sync_info { + memory_sync_info() : storage(storage_none), semantics(semantic_none), scope(scope_invocation) {} + memory_sync_info(int storage, int semantics=0, sync_scope scope=scope_invocation) + : storage((storage_class)storage), semantics((memory_semantics)semantics), scope(scope) {} + + storage_class storage:8; + memory_semantics semantics:8; + sync_scope scope:8; + + bool operator == (const memory_sync_info& rhs) const { + return storage == rhs.storage && + semantics == rhs.semantics && + scope == rhs.scope; + } + + bool can_reorder() const { + if (semantics & semantic_acqrel) + return false; + /* Also check storage so that zero-initialized memory_sync_info can be + * reordered. 
*/ + return (!storage || (semantics & semantic_can_reorder)) && !(semantics & semantic_volatile); + } +}; +static_assert(sizeof(memory_sync_info) == 3); + enum fp_round { fp_round_ne = 0, fp_round_pi = 1, @@ -931,14 +988,13 @@ static_assert(sizeof(SOP2_instruction) == sizeof(Instruction) + 0, "Unexpected p * */ struct SMEM_instruction : public Instruction { - barrier_interaction barrier; + memory_sync_info sync; bool glc : 1; /* VI+: globally coherent */ bool dlc : 1; /* NAVI: device level coherent */ bool nv : 1; /* VEGA only: Non-volatile */ - bool can_reorder : 1; bool disable_wqm : 1; bool prevent_overflow : 1; /* avoid overflow when combining additions */ - uint32_t padding: 18; + uint32_t padding: 3; }; static_assert(sizeof(SMEM_instruction) == sizeof(Instruction) + 4, "Unexpected padding"); @@ -1066,11 +1122,13 @@ static_assert(sizeof(Interp_instruction) == sizeof(Instruction) + 4, "Unexpected * */ struct DS_instruction : public Instruction { + memory_sync_info sync; + bool gds; int16_t offset0; int8_t offset1; - bool gds; + uint8_t padding; }; -static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 4, "Unexpected padding"); +static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 8, "Unexpected padding"); /** * Vector Memory Untyped-buffer Instructions @@ -1081,7 +1139,7 @@ static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 4, "Unexpected pad * */ struct MUBUF_instruction : public Instruction { - uint16_t offset : 12; /* Unsigned byte offset - 12 bit */ + memory_sync_info sync; bool offen : 1; /* Supply an offset from VGPR (VADDR) */ bool idxen : 1; /* Supply an index from VGPR (VADDR) */ bool addr64 : 1; /* SI, CIK: Address size is 64-bit */ @@ -1091,12 +1149,11 @@ struct MUBUF_instruction : public Instruction { bool tfe : 1; /* texture fail enable */ bool lds : 1; /* Return read-data to LDS instead of VGPRs */ bool disable_wqm : 1; /* Require an exec mask without helper invocations */ - bool can_reorder : 1; - bool swizzled:1; - uint8_t padding : 1; - barrier_interaction barrier; + uint16_t offset : 12; /* Unsigned byte offset - 12 bit */ + bool swizzled : 1; + uint32_t padding1 : 18; }; -static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 4, "Unexpected padding"); +static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding"); /** * Vector Memory Typed-buffer Instructions @@ -1107,8 +1164,7 @@ static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 4, "Unexpected * */ struct MTBUF_instruction : public Instruction { - uint16_t offset; /* Unsigned byte offset - 12 bit */ - barrier_interaction barrier; + memory_sync_info sync; uint8_t dfmt : 4; /* Data Format of data in memory buffer */ uint8_t nfmt : 3; /* Numeric format of data in memory */ bool offen : 1; /* Supply an offset from VGPR (VADDR) */ @@ -1118,8 +1174,8 @@ struct MTBUF_instruction : public Instruction { bool slc : 1; /* system level coherent */ bool tfe : 1; /* texture fail enable */ bool disable_wqm : 1; /* Require an exec mask without helper invocations */ - bool can_reorder : 1; - uint32_t padding : 25; + uint32_t padding : 10; + uint16_t offset; /* Unsigned byte offset - 12 bit */ }; static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding"); @@ -1133,6 +1189,7 @@ static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8, "Unexpected * */ struct MIMG_instruction : public Instruction { + memory_sync_info sync; uint8_t dmask; /* Data VGPR enable mask */ uint8_t dim : 3; /* NAVI: dimensionality */ bool 
unrm : 1; /* Force address to be un-normalized */ @@ -1146,11 +1203,9 @@ struct MIMG_instruction : public Instruction { bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */ bool d16 : 1; /* Convert 32-bit data to 16-bit data */ bool disable_wqm : 1; /* Require an exec mask without helper invocations */ - bool can_reorder : 1; - uint8_t padding : 1; - barrier_interaction barrier; + uint32_t padding : 18; }; -static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 4, "Unexpected padding"); +static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 8, "Unexpected padding"); /** * Flat/Scratch/Global Instructions @@ -1160,18 +1215,18 @@ static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 4, "Unexpected p * */ struct FLAT_instruction : public Instruction { - uint16_t offset; /* Vega/Navi only */ + memory_sync_info sync; bool slc : 1; /* system level coherent */ bool glc : 1; /* globally coherent */ bool dlc : 1; /* NAVI: device level coherent */ bool lds : 1; bool nv : 1; bool disable_wqm : 1; /* Require an exec mask without helper invocations */ - bool can_reorder : 1; - uint8_t padding : 1; - barrier_interaction barrier; + uint32_t padding0 : 2; + uint16_t offset; /* Vega/Navi only */ + uint16_t padding1; }; -static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 4, "Unexpected padding"); +static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 8, "Unexpected padding"); struct Export_instruction : public Instruction { uint8_t enabled_mask; @@ -1200,8 +1255,10 @@ struct Pseudo_branch_instruction : public Instruction { static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 8, "Unexpected padding"); struct Pseudo_barrier_instruction : public Instruction { + memory_sync_info sync; + sync_scope exec_scope; }; -static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 0, "Unexpected padding"); +static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 4, "Unexpected padding"); enum ReduceOp : uint16_t { iadd8, iadd16, iadd32, iadd64, @@ -1298,7 +1355,8 @@ static inline bool is_phi(aco_ptr& instr) return is_phi(instr.get()); } -barrier_interaction get_barrier_interaction(const Instruction* instr); +memory_sync_info get_sync_info(const Instruction* instr); + bool is_dead(const std::vector& uses, Instruction *instr); bool can_use_opsel(chip_class chip, aco_opcode op, int idx, bool high); diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp index 9af52aad905..c7e4acd542e 100644 --- a/src/amd/compiler/aco_lower_to_hw_instr.cpp +++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -1854,6 +1854,7 @@ void lower_to_hw_instr(Program* program) emit_gfx10_wave64_bpermute(program, instr, bld); else unreachable("Current hardware supports ds_bpermute, don't emit p_bpermute."); + break; } default: break; @@ -1908,6 +1909,18 @@ void lower_to_hw_instr(Program* program) reduce->operands[2].physReg(), // vtmp reduce->definitions[2].physReg(), // sitmp reduce->operands[0], reduce->definitions[0]); + } else if (instr->format == Format::PSEUDO_BARRIER) { + Pseudo_barrier_instruction* barrier = static_cast(instr.get()); + + /* Anything larger than a workgroup isn't possible. Anything + * smaller requires no instructions and this pseudo instruction + * would only be included to control optimizations. 
*/ + bool emit_s_barrier = barrier->exec_scope == scope_workgroup && + program->workgroup_size > program->wave_size; + + bld.insert(std::move(instr)); + if (emit_s_barrier) + bld.sopp(aco_opcode::s_barrier); } else if (instr->opcode == aco_opcode::p_cvt_f16_f32_rtne) { float_mode new_mode = block->fp_mode; new_mode.round16_64 = fp_round_ne; diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py index 46246cbea01..019ad9c0e0d 100644 --- a/src/amd/compiler/aco_opcodes.py +++ b/src/amd/compiler/aco_opcodes.py @@ -66,7 +66,7 @@ class Format(Enum): return [('uint32_t', 'block', '-1'), ('uint32_t', 'imm', '0')] elif self == Format.SMEM: - return [('bool', 'can_reorder', 'true'), + return [('memory_sync_info', 'sync', 'memory_sync_info()'), ('bool', 'glc', 'false'), ('bool', 'dlc', 'false'), ('bool', 'nv', 'false')] @@ -123,6 +123,9 @@ class Format(Enum): elif self == Format.PSEUDO_REDUCTION: return [('ReduceOp', 'op', None, 'reduce_op'), ('unsigned', 'cluster_size', '0')] + elif self == Format.PSEUDO_BARRIER: + return [('memory_sync_info', 'sync', None), + ('sync_scope', 'exec_scope', 'scope_invocation')] elif self == Format.VINTRP: return [('unsigned', 'attribute', None), ('unsigned', 'component', None)] @@ -133,7 +136,7 @@ class Format(Enum): ('bool', 'bound_ctrl', 'true')] elif self in [Format.FLAT, Format.GLOBAL, Format.SCRATCH]: return [('uint16_t', 'offset', 0), - ('bool', 'can_reorder', 'true'), + ('memory_sync_info', 'sync', 'memory_sync_info()'), ('bool', 'glc', 'false'), ('bool', 'slc', 'false'), ('bool', 'lds', 'false'), @@ -265,13 +268,7 @@ opcode("p_cbranch", format=Format.PSEUDO_BRANCH) opcode("p_cbranch_z", format=Format.PSEUDO_BRANCH) opcode("p_cbranch_nz", format=Format.PSEUDO_BRANCH) -opcode("p_memory_barrier_common", format=Format.PSEUDO_BARRIER) # atomic, buffer, image and shared -opcode("p_memory_barrier_atomic", format=Format.PSEUDO_BARRIER) -opcode("p_memory_barrier_buffer", format=Format.PSEUDO_BARRIER) -opcode("p_memory_barrier_image", format=Format.PSEUDO_BARRIER) -opcode("p_memory_barrier_shared", format=Format.PSEUDO_BARRIER) -opcode("p_memory_barrier_gs_data", format=Format.PSEUDO_BARRIER) -opcode("p_memory_barrier_gs_sendmsg", format=Format.PSEUDO_BARRIER) +opcode("p_barrier", format=Format.PSEUDO_BARRIER) opcode("p_spill") opcode("p_reload") diff --git a/src/amd/compiler/aco_opt_value_numbering.cpp b/src/amd/compiler/aco_opt_value_numbering.cpp index 2fdbfaabd4a..de3d894bf54 100644 --- a/src/amd/compiler/aco_opt_value_numbering.cpp +++ b/src/amd/compiler/aco_opt_value_numbering.cpp @@ -224,12 +224,15 @@ struct InstrPred { return aK->imm == bK->imm; } case Format::SMEM: { + if (!a->operands.empty() && a->operands[0].bytes() == 16) + return false; SMEM_instruction* aS = static_cast(a); SMEM_instruction* bS = static_cast(b); /* isel shouldn't be creating situations where this assertion fails */ assert(aS->prevent_overflow == bS->prevent_overflow); - return aS->can_reorder && bS->can_reorder && - aS->glc == bS->glc && aS->nv == bS->nv && + return aS->sync.can_reorder() && bS->sync.can_reorder() && + aS->sync == bS->sync && aS->glc == bS->glc && aS->dlc == bS->dlc && + aS->nv == bS->nv && aS->disable_wqm == bS->disable_wqm && aS->prevent_overflow == bS->prevent_overflow; } case Format::VINTRP: { @@ -251,8 +254,8 @@ struct InstrPred { case Format::MTBUF: { MTBUF_instruction* aM = static_cast(a); MTBUF_instruction* bM = static_cast(b); - return aM->can_reorder && bM->can_reorder && - aM->barrier == bM->barrier && + return aM->sync.can_reorder() && 
bM->sync.can_reorder() && + aM->sync == bM->sync && aM->dfmt == bM->dfmt && aM->nfmt == bM->nfmt && aM->offset == bM->offset && @@ -267,8 +270,8 @@ struct InstrPred { case Format::MUBUF: { MUBUF_instruction* aM = static_cast(a); MUBUF_instruction* bM = static_cast(b); - return aM->can_reorder && bM->can_reorder && - aM->barrier == bM->barrier && + return aM->sync.can_reorder() && bM->sync.can_reorder() && + aM->sync == bM->sync && aM->offset == bM->offset && aM->offen == bM->offen && aM->idxen == bM->idxen && @@ -295,7 +298,9 @@ struct InstrPred { return false; DS_instruction* aD = static_cast(a); DS_instruction* bD = static_cast(b); - return aD->pass_flags == bD->pass_flags && + return aD->sync.can_reorder() && bD->sync.can_reorder() && + aD->sync == bD->sync && + aD->pass_flags == bD->pass_flags && aD->gds == bD->gds && aD->offset0 == bD->offset0 && aD->offset1 == bD->offset1; @@ -303,8 +308,8 @@ struct InstrPred { case Format::MIMG: { MIMG_instruction* aM = static_cast(a); MIMG_instruction* bM = static_cast(b); - return aM->can_reorder && bM->can_reorder && - aM->barrier == bM->barrier && + return aM->sync.can_reorder() && bM->sync.can_reorder() && + aM->sync == bM->sync && aM->dmask == bM->dmask && aM->unrm == bM->unrm && aM->glc == bM->glc && diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index e00c8c1fcaf..c1f0bc20f9e 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -1026,8 +1026,7 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) new_instr->operands.back() = Operand(base); if (!smem->definitions.empty()) new_instr->definitions[0] = smem->definitions[0]; - new_instr->can_reorder = smem->can_reorder; - new_instr->barrier = smem->barrier; + new_instr->sync = smem->sync; new_instr->glc = smem->glc; new_instr->dlc = smem->dlc; new_instr->nv = smem->nv; diff --git a/src/amd/compiler/aco_print_ir.cpp b/src/amd/compiler/aco_print_ir.cpp index 9172fc1ac60..8b8b5d0f306 100644 --- a/src/amd/compiler/aco_print_ir.cpp +++ b/src/amd/compiler/aco_print_ir.cpp @@ -189,23 +189,73 @@ static void print_definition(const Definition *definition, FILE *output) print_physReg(definition->physReg(), definition->bytes(), output); } -static void print_barrier_reorder(bool can_reorder, barrier_interaction barrier, FILE *output) +static void print_storage(storage_class storage, FILE *output) { - if (can_reorder) - fprintf(output, " reorder"); + fprintf(output, " storage:"); + int printed = 0; + if (storage & storage_buffer) + printed += fprintf(output, "%sbuffer", printed ? "," : ""); + if (storage & storage_atomic_counter) + printed += fprintf(output, "%satomic_counter", printed ? "," : ""); + if (storage & storage_image) + printed += fprintf(output, "%simage", printed ? "," : ""); + if (storage & storage_shared) + printed += fprintf(output, "%sshared", printed ? "," : ""); + if (storage & storage_vmem_output) + printed += fprintf(output, "%svmem_output", printed ? "," : ""); + if (storage & storage_scratch) + printed += fprintf(output, "%sscratch", printed ? "," : ""); + if (storage & storage_vgpr_spill) + printed += fprintf(output, "%svgpr_spill", printed ? "," : ""); +} + +static void print_semantics(memory_semantics sem, FILE *output) +{ + fprintf(output, " semantics:"); + int printed = 0; + if (sem & semantic_acquire) + printed += fprintf(output, "%sacquire", printed ? "," : ""); + if (sem & semantic_release) + printed += fprintf(output, "%srelease", printed ? 
"," : ""); + if (sem & semantic_volatile) + printed += fprintf(output, "%svolatile", printed ? "," : ""); + if (sem & semantic_private) + printed += fprintf(output, "%sprivate", printed ? "," : ""); + if (sem & semantic_can_reorder) + printed += fprintf(output, "%sreorder", printed ? "," : ""); + if (sem & semantic_atomic) + printed += fprintf(output, "%satomic", printed ? "," : ""); + if (sem & semantic_rmw) + printed += fprintf(output, "%srmw", printed ? "," : ""); +} + +static void print_scope(sync_scope scope, FILE *output, const char *prefix="scope") +{ + fprintf(output, " %s:", prefix); + switch (scope) { + case scope_invocation: + fprintf(output, "invocation"); + break; + case scope_subgroup: + fprintf(output, "subgroup"); + break; + case scope_workgroup: + fprintf(output, "workgroup"); + break; + case scope_queuefamily: + fprintf(output, "queuefamily"); + break; + case scope_device: + fprintf(output, "device"); + break; + } +} - if (barrier & barrier_buffer) - fprintf(output, " buffer"); - if (barrier & barrier_image) - fprintf(output, " image"); - if (barrier & barrier_atomic) - fprintf(output, " atomic"); - if (barrier & barrier_shared) - fprintf(output, " shared"); - if (barrier & barrier_gs_data) - fprintf(output, " gs_data"); - if (barrier & barrier_gs_sendmsg) - fprintf(output, " gs_sendmsg"); +static void print_sync(memory_sync_info sync, FILE *output) +{ + print_storage(sync.storage, output); + print_semantics(sync.semantics, output); + print_scope(sync.scope, output); } static void print_instr_format_specific(const Instruction *instr, FILE *output) @@ -292,7 +342,7 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output) fprintf(output, " dlc"); if (smem->nv) fprintf(output, " nv"); - print_barrier_reorder(smem->can_reorder, smem->barrier, output); + print_sync(smem->sync, output); break; } case Format::VINTRP: { @@ -308,6 +358,7 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output) fprintf(output, " offset1:%u", ds->offset1); if (ds->gds) fprintf(output, " gds"); + print_sync(ds->sync, output); break; } case Format::MUBUF: { @@ -332,7 +383,7 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output) fprintf(output, " lds"); if (mubuf->disable_wqm) fprintf(output, " disable_wqm"); - print_barrier_reorder(mubuf->can_reorder, mubuf->barrier, output); + print_sync(mubuf->sync, output); break; } case Format::MIMG: { @@ -392,7 +443,7 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output) fprintf(output, " d16"); if (mimg->disable_wqm) fprintf(output, " disable_wqm"); - print_barrier_reorder(mimg->can_reorder, mimg->barrier, output); + print_sync(mimg->sync, output); break; } case Format::EXP: { @@ -439,6 +490,12 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output) fprintf(output, " cluster_size:%u", reduce->cluster_size); break; } + case Format::PSEUDO_BARRIER: { + const Pseudo_barrier_instruction* barrier = static_cast(instr); + print_sync(barrier->sync, output); + print_scope(barrier->exec_scope, output, "exec_scope"); + break; + } case Format::FLAT: case Format::GLOBAL: case Format::SCRATCH: { @@ -457,7 +514,7 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output) fprintf(output, " nv"); if (flat->disable_wqm) fprintf(output, " disable_wqm"); - print_barrier_reorder(flat->can_reorder, flat->barrier, output); + print_sync(flat->sync, output); break; } case Format::MTBUF: { @@ -507,7 +564,7 @@ static void 
print_instr_format_specific(const Instruction *instr, FILE *output) fprintf(output, " tfe"); if (mtbuf->disable_wqm) fprintf(output, " disable_wqm"); - print_barrier_reorder(mtbuf->can_reorder, mtbuf->barrier, output); + print_sync(mtbuf->sync, output); break; } case Format::VOP3P: { diff --git a/src/amd/compiler/aco_scheduler.cpp b/src/amd/compiler/aco_scheduler.cpp index d837059cfd4..261d58ceb03 100644 --- a/src/amd/compiler/aco_scheduler.cpp +++ b/src/amd/compiler/aco_scheduler.cpp @@ -318,26 +318,6 @@ void MoveState::upwards_skip() source_idx++; } -bool can_reorder(Instruction* candidate) -{ - switch (candidate->format) { - case Format::SMEM: - return static_cast(candidate)->can_reorder; - case Format::MUBUF: - return static_cast(candidate)->can_reorder; - case Format::MIMG: - return static_cast(candidate)->can_reorder; - case Format::MTBUF: - return static_cast(candidate)->can_reorder; - case Format::FLAT: - case Format::GLOBAL: - case Format::SCRATCH: - return static_cast(candidate)->can_reorder; - default: - return true; - } -} - bool is_gs_or_done_sendmsg(const Instruction *instr) { if (instr->opcode == aco_opcode::s_sendmsg) { @@ -357,96 +337,96 @@ bool is_done_sendmsg(const Instruction *instr) return false; } -barrier_interaction get_barrier_interaction(const Instruction* instr) +memory_sync_info get_sync_info_with_hack(const Instruction* instr) { - switch (instr->format) { - case Format::SMEM: - return static_cast(instr)->barrier; - case Format::MUBUF: - return static_cast(instr)->barrier; - case Format::MIMG: - return static_cast(instr)->barrier; - case Format::MTBUF: - return static_cast(instr)->barrier; - case Format::FLAT: - case Format::GLOBAL: - case Format::SCRATCH: - return static_cast(instr)->barrier; - case Format::DS: - return barrier_shared; - case Format::SOPP: - if (is_done_sendmsg(instr)) - return (barrier_interaction)(barrier_gs_data | barrier_gs_sendmsg); - else if (is_gs_or_done_sendmsg(instr)) - return barrier_gs_sendmsg; - else - return barrier_none; - case Format::PSEUDO_BARRIER: - return barrier_barrier; - default: - return barrier_none; + memory_sync_info sync = get_sync_info(instr); + if (instr->format == Format::SMEM && !instr->operands.empty() && instr->operands[0].bytes() == 16) { + // FIXME: currently, it doesn't seem beneficial to omit this due to how our scheduler works + sync.storage = (storage_class)(sync.storage | storage_buffer); + sync.semantics = (memory_semantics)(sync.semantics | semantic_private); } + return sync; } -barrier_interaction parse_barrier(Instruction *instr) -{ - if (instr->format == Format::PSEUDO_BARRIER) { - switch (instr->opcode) { - case aco_opcode::p_memory_barrier_atomic: - return barrier_atomic; - /* For now, buffer and image barriers are treated the same. this is because of - * dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.device.payload_nonlocal.buffer.guard_nonlocal.image.comp - * which seems to use an image load to determine if the result of a buffer load is valid. So the ordering of the two loads is important. - * I /think/ we should probably eventually expand the meaning of a buffer barrier so that all buffer operations before it, must stay before it - * and that both image and buffer operations after it, must stay after it. We should also do the same for image barriers. - * Or perhaps the problem is that we don't have a combined barrier instruction for both buffers and images, but the CTS test expects us to? - * Either way, this solution should work. 
*/ - case aco_opcode::p_memory_barrier_buffer: - case aco_opcode::p_memory_barrier_image: - return (barrier_interaction)(barrier_image | barrier_buffer); - case aco_opcode::p_memory_barrier_shared: - return barrier_shared; - case aco_opcode::p_memory_barrier_common: - return (barrier_interaction)(barrier_image | barrier_buffer | barrier_shared | barrier_atomic); - case aco_opcode::p_memory_barrier_gs_data: - return barrier_gs_data; - case aco_opcode::p_memory_barrier_gs_sendmsg: - return barrier_gs_sendmsg; - default: - break; - } - } else if (instr->opcode == aco_opcode::s_barrier) { - return (barrier_interaction)(barrier_barrier | barrier_image | barrier_buffer | barrier_shared | barrier_atomic); - } - return barrier_none; -} +struct memory_event_set { + bool has_control_barrier; + + unsigned bar_acquire; + unsigned bar_release; + unsigned bar_classes; + + unsigned access_acquire; + unsigned access_release; + unsigned access_relaxed; + unsigned access_atomic; +}; struct hazard_query { bool contains_spill; - int barriers; - int barrier_interaction; - bool can_reorder_vmem; - bool can_reorder_smem; + bool contains_sendmsg; + memory_event_set mem_events; + unsigned aliasing_storage; /* storage classes which are accessed (non-SMEM) */ + unsigned aliasing_storage_smem; /* storage classes which are accessed (SMEM) */ }; void init_hazard_query(hazard_query *query) { query->contains_spill = false; - query->barriers = 0; - query->barrier_interaction = 0; - query->can_reorder_vmem = true; - query->can_reorder_smem = true; + query->contains_sendmsg = false; + memset(&query->mem_events, 0, sizeof(query->mem_events)); + query->aliasing_storage = 0; + query->aliasing_storage_smem = 0; +} + +void add_memory_event(memory_event_set *set, Instruction *instr, memory_sync_info *sync) +{ + set->has_control_barrier |= is_done_sendmsg(instr); + if (instr->opcode == aco_opcode::p_barrier) { + Pseudo_barrier_instruction *bar = static_cast(instr); + if (bar->sync.semantics & semantic_acquire) + set->bar_acquire |= bar->sync.storage; + if (bar->sync.semantics & semantic_release) + set->bar_release |= bar->sync.storage; + set->bar_classes |= bar->sync.storage; + + set->has_control_barrier |= bar->exec_scope > scope_invocation; + } + + if (!sync->storage) + return; + + if (sync->semantics & semantic_acquire) + set->access_acquire |= sync->storage; + if (sync->semantics & semantic_release) + set->access_release |= sync->storage; + + if (!(sync->semantics & semantic_private)) { + if (sync->semantics & semantic_atomic) + set->access_atomic |= sync->storage; + else + set->access_relaxed |= sync->storage; + } } void add_to_hazard_query(hazard_query *query, Instruction *instr) { - query->barriers |= parse_barrier(instr); - query->barrier_interaction |= get_barrier_interaction(instr); if (instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload) query->contains_spill = true; + query->contains_sendmsg |= instr->opcode == aco_opcode::s_sendmsg; + + memory_sync_info sync = get_sync_info_with_hack(instr); - bool can_reorder_instr = can_reorder(instr); - query->can_reorder_smem &= instr->format != Format::SMEM || can_reorder_instr; - query->can_reorder_vmem &= !(instr->isVMEM() || instr->isFlatOrGlobal()) || can_reorder_instr; + add_memory_event(&query->mem_events, instr, &sync); + + if (!(sync.semantics & semantic_can_reorder)) { + unsigned storage = sync.storage; + /* images and buffer/global memory can alias */ //TODO: more precisely, buffer images and buffer/global memory can alias + if (storage & 
(storage_buffer | storage_image)) + storage |= storage_buffer | storage_image; + if (instr->format == Format::SMEM) + query->aliasing_storage_smem |= storage; + else + query->aliasing_storage |= storage; + } } enum HazardResult { @@ -463,10 +443,8 @@ enum HazardResult { hazard_fail_unreorderable, }; -HazardResult perform_hazard_query(hazard_query *query, Instruction *instr) +HazardResult perform_hazard_query(hazard_query *query, Instruction *instr, bool upwards) { - bool can_reorder_candidate = can_reorder(instr); - if (instr->opcode == aco_opcode::p_exit_early_if) return hazard_fail_exec; for (const Definition& def : instr->definitions) { @@ -484,27 +462,61 @@ HazardResult perform_hazard_query(hazard_query *query, Instruction *instr) instr->opcode == aco_opcode::s_setprio) return hazard_fail_unreorderable; - barrier_interaction bar = parse_barrier(instr); - if (query->barrier_interaction && (query->barrier_interaction & bar)) + memory_event_set instr_set; + memset(&instr_set, 0, sizeof(instr_set)); + memory_sync_info sync = get_sync_info_with_hack(instr); + add_memory_event(&instr_set, instr, &sync); + + memory_event_set *first = &instr_set; + memory_event_set *second = &query->mem_events; + if (upwards) + std::swap(first, second); + + /* everything after barrier(acquire) happens after the atomics/control_barriers before + * everything after load(acquire) happens after the load + */ + if ((first->has_control_barrier || first->access_atomic) && second->bar_acquire) + return hazard_fail_barrier; + if (((first->access_acquire || first->bar_acquire) && second->bar_classes) || + ((first->access_acquire | first->bar_acquire) & (second->access_relaxed | second->access_atomic))) + return hazard_fail_barrier; + + /* everything before barrier(release) happens before the atomics/control_barriers after * + * everything before store(release) happens before the store + */ + if (first->bar_release && (second->has_control_barrier || second->access_atomic)) return hazard_fail_barrier; - if (bar && query->barriers && (query->barriers & ~bar)) + if ((first->bar_classes && (second->bar_release || second->access_release)) || + ((first->access_relaxed | first->access_atomic) & (second->bar_release | second->access_release))) return hazard_fail_barrier; - if (query->barriers && (query->barriers & get_barrier_interaction(instr))) + + /* don't move memory barriers around other memory barriers */ + if (first->bar_classes && second->bar_classes) return hazard_fail_barrier; - if (!query->can_reorder_smem && instr->format == Format::SMEM && !can_reorder_candidate) - return hazard_fail_reorder_vmem_smem; - if (!query->can_reorder_vmem && (instr->isVMEM() || instr->isFlatOrGlobal()) && !can_reorder_candidate) + /* Don't move memory loads/stores to before control barriers. This is to make + * memory barriers followed by control barriers work. */ + if (first->has_control_barrier && (second->access_atomic | second->access_relaxed)) + return hazard_fail_barrier; + + /* don't move memory loads/stores past potentially aliasing loads/stores */ + unsigned aliasing_storage = instr->format == Format::SMEM ? 
+ query->aliasing_storage_smem : + query->aliasing_storage; + if ((sync.storage & aliasing_storage) && !(sync.semantics & semantic_can_reorder)) { + unsigned intersect = sync.storage & aliasing_storage; + if (intersect & storage_shared) + return hazard_fail_reorder_ds; return hazard_fail_reorder_vmem_smem; - if ((query->barrier_interaction & barrier_shared) && instr->format == Format::DS) - return hazard_fail_reorder_ds; - if (is_gs_or_done_sendmsg(instr) && (query->barrier_interaction & get_barrier_interaction(instr))) - return hazard_fail_reorder_sendmsg; + } if ((instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload) && query->contains_spill) return hazard_fail_spill; + if (instr->opcode == aco_opcode::s_sendmsg && query->contains_sendmsg) + return hazard_fail_reorder_sendmsg; + return hazard_success; } @@ -546,7 +558,7 @@ void schedule_SMEM(sched_ctx& ctx, Block* block, bool can_move_down = true; - HazardResult haz = perform_hazard_query(&hq, candidate.get()); + HazardResult haz = perform_hazard_query(&hq, candidate.get(), false); if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill || haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier || haz == hazard_fail_export) can_move_down = false; else if (haz != hazard_success) @@ -594,7 +606,7 @@ void schedule_SMEM(sched_ctx& ctx, Block* block, break; if (found_dependency) { - HazardResult haz = perform_hazard_query(&hq, candidate.get()); + HazardResult haz = perform_hazard_query(&hq, candidate.get(), true); if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill || haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier || haz == hazard_fail_export) @@ -686,7 +698,7 @@ void schedule_VMEM(sched_ctx& ctx, Block* block, /* if current depends on candidate, add additional dependencies and continue */ bool can_move_down = !is_vmem || part_of_clause; - HazardResult haz = perform_hazard_query(part_of_clause ? &clause_hq : &indep_hq, candidate.get()); + HazardResult haz = perform_hazard_query(part_of_clause ? 
&clause_hq : &indep_hq, candidate.get(), false); if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill || haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier || haz == hazard_fail_export) @@ -735,7 +747,7 @@ void schedule_VMEM(sched_ctx& ctx, Block* block, /* check if candidate depends on current */ bool is_dependency = false; if (found_dependency) { - HazardResult haz = perform_hazard_query(&indep_hq, candidate.get()); + HazardResult haz = perform_hazard_query(&indep_hq, candidate.get(), true); if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill || haz == hazard_fail_reorder_vmem_smem || haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier || haz == hazard_fail_export) @@ -802,7 +814,7 @@ void schedule_position_export(sched_ctx& ctx, Block* block, if (candidate->isVMEM() || candidate->format == Format::SMEM || candidate->isFlatOrGlobal()) break; - HazardResult haz = perform_hazard_query(&hq, candidate.get()); + HazardResult haz = perform_hazard_query(&hq, candidate.get(), false); if (haz == hazard_fail_exec || haz == hazard_fail_unreorderable) break; diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp index d472f63ec57..69d5cb23b23 100644 --- a/src/amd/compiler/aco_spill.cpp +++ b/src/amd/compiler/aco_spill.cpp @@ -1565,10 +1565,13 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) { for (unsigned i = 0; i < temp.size(); i++) split->definitions[i] = bld.def(v1); bld.insert(split); - for (unsigned i = 0; i < temp.size(); i++) - bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset, split->definitions[i].getTemp(), offset + i * 4, false, true); + for (unsigned i = 0; i < temp.size(); i++) { + Instruction *instr = bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset, split->definitions[i].getTemp(), offset + i * 4, false, true); + static_cast(instr)->sync = memory_sync_info(storage_vgpr_spill, semantic_private); + } } else { - bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset, temp, offset, false, true); + Instruction *instr = bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset, temp, offset, false, true); + static_cast(instr)->sync = memory_sync_info(storage_vgpr_spill, semantic_private); } } else { ctx.program->config->spilled_sgprs += (*it)->operands[0].size(); @@ -1632,11 +1635,13 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) { for (unsigned i = 0; i < def.size(); i++) { Temp tmp = bld.tmp(v1); vec->operands[i] = Operand(tmp); - bld.mubuf(opcode, Definition(tmp), scratch_rsrc, Operand(v1), scratch_offset, offset + i * 4, false, true); + Instruction *instr = bld.mubuf(opcode, Definition(tmp), scratch_rsrc, Operand(v1), scratch_offset, offset + i * 4, false, true); + static_cast(instr)->sync = memory_sync_info(storage_vgpr_spill, semantic_private); } bld.insert(vec); } else { - bld.mubuf(opcode, def, scratch_rsrc, Operand(v1), scratch_offset, offset, false, true); + Instruction *instr = bld.mubuf(opcode, def, scratch_rsrc, Operand(v1), scratch_offset, offset, false, true); + static_cast(instr)->sync = memory_sync_info(storage_vgpr_spill, semantic_private); } } else { uint32_t spill_slot = slots[spill_id]; -- 2.30.2
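For reference, the new synchronization model bundles storage classes, memory semantics and a synchronization scope into one memory_sync_info value. Below is a minimal standalone sketch of that model and of the kind of output print_sync() emits, assuming simplified stand-in bit values; the enum member names follow the patch, but can_reorder(), print_sync_sketch() and main() are illustrative, not the actual aco_ir.h/aco_print_ir.cpp code.

// Standalone sketch of the memory_sync_info model introduced above.
#include <cstdint>
#include <cstdio>

enum storage_class : uint16_t {
   storage_none           = 0,
   storage_buffer         = 1 << 0,
   storage_atomic_counter = 1 << 1,
   storage_image          = 1 << 2,
   storage_shared         = 1 << 3, /* LDS */
   storage_vmem_output    = 1 << 4,
   storage_scratch        = 1 << 5,
   storage_vgpr_spill     = 1 << 6,
};

enum memory_semantics : uint16_t {
   semantic_none        = 0,
   semantic_acquire     = 1 << 0,
   semantic_release     = 1 << 1,
   semantic_volatile    = 1 << 2,
   semantic_private     = 1 << 3, /* not observable by other invocations */
   semantic_can_reorder = 1 << 4,
   semantic_atomic      = 1 << 5,
   semantic_rmw         = 1 << 6,
};

enum sync_scope : uint16_t {
   scope_invocation  = 0,
   scope_subgroup    = 1,
   scope_workgroup   = 2,
   scope_queuefamily = 3,
   scope_device      = 4,
};

struct memory_sync_info {
   uint16_t storage   = storage_none;
   uint16_t semantics = semantic_none;
   sync_scope scope   = scope_invocation;

   /* replaces the old per-format can_reorder flag (illustrative body) */
   bool can_reorder() const {
      return (semantics & semantic_can_reorder) && !(semantics & semantic_volatile);
   }
};

/* prints a summary in the same spirit as print_sync() in aco_print_ir.cpp */
static void print_sync_sketch(const memory_sync_info &sync, FILE *out)
{
   static const char *storage_names[] = {"buffer", "atomic_counter", "image",
                                         "shared", "vmem_output", "scratch",
                                         "vgpr_spill"};
   static const char *semantic_names[] = {"acquire", "release", "volatile",
                                          "private", "reorder", "atomic", "rmw"};
   static const char *scope_names[] = {"invocation", "subgroup", "workgroup",
                                       "queuefamily", "device"};
   int printed = 0;
   fprintf(out, " storage:");
   for (unsigned i = 0; i < 7; i++) {
      if (sync.storage & (1u << i))
         printed += fprintf(out, "%s%s", printed ? "," : "", storage_names[i]);
   }
   printed = 0;
   fprintf(out, " semantics:");
   for (unsigned i = 0; i < 7; i++) {
      if (sync.semantics & (1u << i))
         printed += fprintf(out, "%s%s", printed ? "," : "", semantic_names[i]);
   }
   fprintf(out, " scope:%s", scope_names[sync.scope]);
}

int main()
{
   /* e.g. a device-coherent image atomic */
   memory_sync_info sync;
   sync.storage = storage_image;
   sync.semantics = semantic_atomic | semantic_acquire | semantic_release;
   sync.scope = scope_device;
   print_sync_sketch(sync, stdout);
   fprintf(stdout, "\n");
   return 0;
}

Keeping storage, semantics and scope in one value lets every pass (dead code analysis, waitcnt insertion, value numbering, scheduling, printing) query the same information instead of a scattered can_reorder/barrier pair.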
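The scheduler now summarizes the memory behaviour of the instructions it has scanned in a memory_event_set and rejects moves that would break acquire/release ordering, rather than comparing barrier_interaction masks. The sketch below shows that ordering check in isolation, assuming plain unsigned masks of storage-class bits; the memory_event_set field names follow the patch, while ordering_hazard() and main() are illustrative and omit the aliasing, spill and sendmsg checks of perform_hazard_query().

#include <cstdio>

/* Bitmask summary of the memory events seen on one side of a candidate move.
 * The masks hold storage-class bits (buffer, image, shared, ...). */
struct memory_event_set {
   bool has_control_barrier = false;
   unsigned bar_acquire = 0, bar_release = 0, bar_classes = 0;
   unsigned access_acquire = 0, access_release = 0;
   unsigned access_relaxed = 0, access_atomic = 0;
};

/* Returns true if the events in 'first' must stay ordered before the events
 * in 'second', i.e. the candidate move would break acquire/release ordering. */
static bool ordering_hazard(const memory_event_set &first, const memory_event_set &second)
{
   /* everything after barrier(acquire) happens after the atomics/control
    * barriers before it; everything after load(acquire) happens after the load */
   if ((first.has_control_barrier || first.access_atomic) && second.bar_acquire)
      return true;
   if (((first.access_acquire || first.bar_acquire) && second.bar_classes) ||
       ((first.access_acquire | first.bar_acquire) &
        (second.access_relaxed | second.access_atomic)))
      return true;

   /* everything before barrier(release) happens before the atomics/control
    * barriers after it; everything before store(release) happens before the store */
   if (first.bar_release && (second.has_control_barrier || second.access_atomic))
      return true;
   if ((first.bar_classes && (second.bar_release || second.access_release)) ||
       ((first.access_relaxed | first.access_atomic) &
        (second.bar_release | second.access_release)))
      return true;

   /* memory barriers never move across each other, and loads/stores never move
    * to before a control barrier */
   if (first.bar_classes && second.bar_classes)
      return true;
   if (first.has_control_barrier && (second.access_atomic | second.access_relaxed))
      return true;

   return false;
}

int main()
{
   /* candidate: a relaxed buffer access; already-scanned code: a release
    * barrier on buffer storage */
   memory_event_set candidate, scanned;
   candidate.access_relaxed = 1 /* buffer bit */;
   scanned.bar_release = scanned.bar_classes = 1;

   /* candidate as 'first', scanned set as 'second', as for a downwards query */
   printf("ordering hazard: %d\n", ordering_hazard(candidate, scanned) ? 1 : 0);
   return 0;
}

For an upwards move the two event sets are simply swapped before the check, which is what the new 'upwards' parameter of perform_hazard_query() selects.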
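add_to_hazard_query() also records which storage classes an instruction may alias, tracked separately for SMEM and non-SMEM accesses, and widens buffer and image accesses to cover each other because buffer images and buffer/global memory can alias. A minimal sketch of that widening, assuming stand-in bit values; widen_aliasing_storage() and main() are illustrative.

#include <cstdio>

/* two of the storage-class bits from the patch; the values are stand-ins */
enum : unsigned { storage_buffer = 1u << 0, storage_image = 1u << 2 };

/* Mirrors the merging done in add_to_hazard_query(): an access to buffer or
 * image storage is treated as potentially aliasing both classes. */
static unsigned widen_aliasing_storage(unsigned storage)
{
   if (storage & (storage_buffer | storage_image))
      storage |= storage_buffer | storage_image;
   return storage;
}

int main()
{
   unsigned image_access = storage_image;   /* candidate instruction */
   unsigned seen_storage = storage_buffer;  /* storage already recorded by the query */
   bool may_alias = (widen_aliasing_storage(image_access) & seen_storage) != 0;
   printf("image access may alias buffer access: %d\n", may_alias ? 1 : 0);
   return 0;
}

Splitting the recorded storage into aliasing_storage and aliasing_storage_smem keeps scalar and vector memory clauses from blocking each other unnecessarily while still catching real aliasing.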