if (std::any_of(instr->definitions.begin(), instr->definitions.end(),
[&uses] (const Definition& def) { return uses[def.tempId()];}))
return false;
- return !instr_info.is_atomic[(int)instr->opcode];
+ return !(get_sync_info(instr).semantics & (semantic_volatile | semantic_acqrel));
}
std::vector<uint16_t> dead_code_analysis(Program *program) {
bool pending_flat_vm = false;
bool pending_s_buffer_store = false; /* GFX10 workaround */
- wait_imm barrier_imm[barrier_count];
- uint16_t barrier_events[barrier_count] = {}; /* use wait_event notion */
+ wait_imm barrier_imm[storage_count];
+ uint16_t barrier_events[storage_count] = {}; /* use wait_event notion */
std::map<PhysReg,wait_entry> gpr_map;
}
}
- for (unsigned i = 0; i < barrier_count; i++) {
+ for (unsigned i = 0; i < storage_count; i++) {
changed |= barrier_imm[i].combine(other->barrier_imm[i]);
changed |= other->barrier_events[i] & ~barrier_events[i];
barrier_events[i] |= other->barrier_events[i];
return wait_imm();
}
-wait_imm kill(Instruction* instr, wait_ctx& ctx)
+wait_imm perform_barrier(wait_ctx& ctx, memory_sync_info sync, unsigned semantics)
+{
+ wait_imm imm;
+ sync_scope subgroup_scope = ctx.program->workgroup_size <= ctx.program->wave_size ? scope_workgroup : scope_subgroup;
+ if (sync.semantics & semantics) {
+ unsigned storage = sync.storage;
+ while (storage) {
+ unsigned idx = u_bit_scan(&storage);
+
+ /* LDS is private to the workgroup */
+ sync_scope bar_scope_lds = MIN2(sync.scope, scope_workgroup);
+
+ uint16_t events = ctx.barrier_events[idx];
+ if (bar_scope_lds <= subgroup_scope)
+ events &= ~event_lds;
+
+ if (events)
+ imm.combine(ctx.barrier_imm[idx]);
+ }
+ }
+
+ return imm;
+}
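
As an aside, the storage walk above peels off one set storage-class bit per iteration with u_bit_scan(), so every storage class selects exactly one barrier_imm/barrier_events slot. A minimal standalone sketch of that walk, where bit_scan() is a hypothetical local stand-in for Mesa's u_bit_scan() (index of the lowest set bit, which is then cleared):

#include <cstdio>

/* stand-in for u_bit_scan(): return the index of the lowest set bit and clear it */
static unsigned bit_scan(unsigned *mask)
{
   unsigned idx = __builtin_ctz(*mask);
   *mask &= *mask - 1;
   return idx;
}

int main()
{
   unsigned storage = 0x1 | 0x8; /* e.g. storage_buffer | storage_shared */
   while (storage) {
      unsigned idx = bit_scan(&storage);
      printf("combine barrier_imm[%u]\n", idx); /* prints 0, then 3 */
   }
   return 0;
}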
+
+wait_imm kill(Instruction* instr, wait_ctx& ctx, memory_sync_info sync_info)
{
wait_imm imm;
if (ctx.exp_cnt || ctx.vm_cnt || ctx.lgkm_cnt)
SMEM_instruction *smem = static_cast<SMEM_instruction *>(instr);
if (ctx.pending_s_buffer_store &&
!smem->definitions.empty() &&
- !smem->can_reorder && smem->barrier == barrier_buffer) {
+ !smem->sync.can_reorder()) {
imm.lgkm = 0;
}
}
- if (instr->format == Format::PSEUDO_BARRIER) {
- switch (instr->opcode) {
- case aco_opcode::p_memory_barrier_common:
- imm.combine(ctx.barrier_imm[ffs(barrier_atomic) - 1]);
- imm.combine(ctx.barrier_imm[ffs(barrier_buffer) - 1]);
- imm.combine(ctx.barrier_imm[ffs(barrier_image) - 1]);
- if (ctx.program->workgroup_size > ctx.program->wave_size)
- imm.combine(ctx.barrier_imm[ffs(barrier_shared) - 1]);
- break;
- case aco_opcode::p_memory_barrier_atomic:
- imm.combine(ctx.barrier_imm[ffs(barrier_atomic) - 1]);
- break;
- /* see comment in aco_scheduler.cpp's can_move_instr() on why these barriers are merged */
- case aco_opcode::p_memory_barrier_buffer:
- case aco_opcode::p_memory_barrier_image:
- imm.combine(ctx.barrier_imm[ffs(barrier_buffer) - 1]);
- imm.combine(ctx.barrier_imm[ffs(barrier_image) - 1]);
- break;
- case aco_opcode::p_memory_barrier_shared:
- if (ctx.program->workgroup_size > ctx.program->wave_size)
- imm.combine(ctx.barrier_imm[ffs(barrier_shared) - 1]);
- break;
- case aco_opcode::p_memory_barrier_gs_data:
- imm.combine(ctx.barrier_imm[ffs(barrier_gs_data) - 1]);
- break;
- case aco_opcode::p_memory_barrier_gs_sendmsg:
- imm.combine(ctx.barrier_imm[ffs(barrier_gs_sendmsg) - 1]);
- break;
- default:
- assert(false);
- break;
- }
- }
+ if (instr->opcode == aco_opcode::p_barrier)
+ imm.combine(perform_barrier(ctx, static_cast<Pseudo_barrier_instruction *>(instr)->sync, semantic_acqrel));
+ else
+ imm.combine(perform_barrier(ctx, sync_info, semantic_release));
if (!imm.empty()) {
if (ctx.pending_flat_vm && imm.vm != wait_imm::unset_counter)
ctx.vs_cnt = std::min(ctx.vs_cnt, imm.vs);
/* update barrier wait imms */
- for (unsigned i = 0; i < barrier_count; i++) {
+ for (unsigned i = 0; i < storage_count; i++) {
wait_imm& bar = ctx.barrier_imm[i];
uint16_t& bar_ev = ctx.barrier_events[i];
if (bar.exp != wait_imm::unset_counter && imm.exp <= bar.exp) {
(*ctr)++;
}
-void update_barrier_imm(wait_ctx& ctx, uint8_t counters, wait_event event, barrier_interaction barrier)
+void update_barrier_imm(wait_ctx& ctx, uint8_t counters, wait_event event, memory_sync_info sync)
{
- for (unsigned i = 0; i < barrier_count; i++) {
+ for (unsigned i = 0; i < storage_count; i++) {
wait_imm& bar = ctx.barrier_imm[i];
uint16_t& bar_ev = ctx.barrier_events[i];
- if (barrier & (1 << i)) {
+ if (sync.storage & (1 << i) && !(sync.semantics & semantic_private)) {
bar_ev |= event;
if (counters & counter_lgkm)
bar.lgkm = 0;
}
}
-void update_counters(wait_ctx& ctx, wait_event event, barrier_interaction barrier=barrier_none)
+void update_counters(wait_ctx& ctx, wait_event event, memory_sync_info sync=memory_sync_info())
{
uint8_t counters = get_counters_for_event(event);
if (counters & counter_vs && ctx.vs_cnt <= ctx.max_vs_cnt)
ctx.vs_cnt++;
- update_barrier_imm(ctx, counters, event, barrier);
+ update_barrier_imm(ctx, counters, event, sync);
if (ctx.unordered_events & event)
return;
}
}
-void update_counters_for_flat_load(wait_ctx& ctx, barrier_interaction barrier=barrier_none)
+void update_counters_for_flat_load(wait_ctx& ctx, memory_sync_info sync=memory_sync_info())
{
assert(ctx.chip_class < GFX10);
if (ctx.vm_cnt <= ctx.max_vm_cnt)
ctx.vm_cnt++;
- update_barrier_imm(ctx, counter_vm | counter_lgkm, event_flat, barrier);
+ update_barrier_imm(ctx, counter_vm | counter_lgkm, event_flat, sync);
for (std::pair<PhysReg,wait_entry> e : ctx.gpr_map)
{
break;
}
case Format::FLAT: {
+ FLAT_instruction *flat = static_cast<FLAT_instruction*>(instr);
if (ctx.chip_class < GFX10 && !instr->definitions.empty())
- update_counters_for_flat_load(ctx, barrier_buffer);
+ update_counters_for_flat_load(ctx, flat->sync);
else
- update_counters(ctx, event_flat, barrier_buffer);
+ update_counters(ctx, event_flat, flat->sync);
if (!instr->definitions.empty())
insert_wait_entry(ctx, instr->definitions[0], event_flat);
}
case Format::SMEM: {
SMEM_instruction *smem = static_cast<SMEM_instruction*>(instr);
- update_counters(ctx, event_smem, static_cast<SMEM_instruction*>(instr)->barrier);
+ update_counters(ctx, event_smem, smem->sync);
if (!instr->definitions.empty())
insert_wait_entry(ctx, instr->definitions[0], event_smem);
else if (ctx.chip_class >= GFX10 &&
- !smem->can_reorder &&
- smem->barrier == barrier_buffer)
+ !smem->sync.can_reorder())
ctx.pending_s_buffer_store = true;
break;
}
case Format::DS: {
- bool gds = static_cast<DS_instruction*>(instr)->gds;
- update_counters(ctx, gds ? event_gds : event_lds, gds ? barrier_none : barrier_shared);
- if (gds)
+ DS_instruction *ds = static_cast<DS_instruction*>(instr);
+ update_counters(ctx, ds->gds ? event_gds : event_lds, ds->sync);
+ if (ds->gds)
update_counters(ctx, event_gds_gpr_lock);
if (!instr->definitions.empty())
- insert_wait_entry(ctx, instr->definitions[0], gds ? event_gds : event_lds);
+ insert_wait_entry(ctx, instr->definitions[0], ds->gds ? event_gds : event_lds);
- if (gds) {
+ if (ds->gds) {
for (const Operand& op : instr->operands)
insert_wait_entry(ctx, op, event_gds_gpr_lock);
insert_wait_entry(ctx, exec, s2, event_gds_gpr_lock, false);
case Format::MIMG:
case Format::GLOBAL: {
wait_event ev = !instr->definitions.empty() || ctx.chip_class < GFX10 ? event_vmem : event_vmem_store;
- update_counters(ctx, ev, get_barrier_interaction(instr));
+ update_counters(ctx, ev, get_sync_info(instr));
bool has_sampler = instr->format == Format::MIMG && !instr->operands[1].isUndefined() && instr->operands[1].regClass() == s4;
case Format::SOPP: {
if (instr->opcode == aco_opcode::s_sendmsg ||
instr->opcode == aco_opcode::s_sendmsghalt)
- update_counters(ctx, event_sendmsg, get_barrier_interaction(instr));
+ update_counters(ctx, event_sendmsg);
}
default:
break;
for (aco_ptr<Instruction>& instr : block.instructions) {
bool is_wait = !parse_wait_instr(ctx, instr.get()).empty();
- queued_imm.combine(kill(instr.get(), ctx));
+ memory_sync_info sync_info = get_sync_info(instr.get());
+ queued_imm.combine(kill(instr.get(), ctx, sync_info));
ctx.gen_instr = instr.get();
gen(instr.get(), ctx);
}
new_instructions.emplace_back(std::move(instr));
+ queued_imm.combine(perform_barrier(ctx, sync_info, semantic_acquire));
+
if (ctx.collect_statistics)
ctx.advance_unwaited_instrs();
}
bool glc = false;
unsigned swizzle_component_size = 0;
- barrier_interaction barrier = barrier_none;
- bool can_reorder = true;
+ memory_sync_info sync;
Temp soffset = Temp(0, s1);
};
RegClass rc = RegClass(RegType::vgpr, DIV_ROUND_UP(size, 4));
Temp val = rc == info->dst.regClass() && dst_hint.id() ? dst_hint : bld.tmp(rc);
+ Instruction *instr;
if (read2)
- bld.ds(op, Definition(val), offset, m, const_offset, const_offset + 1);
+ instr = bld.ds(op, Definition(val), offset, m, const_offset, const_offset + 1);
else
- bld.ds(op, Definition(val), offset, m, const_offset);
+ instr = bld.ds(op, Definition(val), offset, m, const_offset);
+ static_cast<DS_instruction *>(instr)->sync = info->sync;
if (size < 4)
val = bld.pseudo(aco_opcode::p_extract_vector, bld.def(RegClass::get(RegType::vgpr, size)), val, Operand(0u));
load->definitions[0] = Definition(val);
load->glc = info->glc;
load->dlc = info->glc && bld.program->chip_class >= GFX10;
- load->barrier = info->barrier;
- load->can_reorder = false; // FIXME: currently, it doesn't seem beneficial due to how our scheduler works
+ load->sync = info->sync;
bld.insert(std::move(load));
return val;
}
mubuf->offen = (offset.type() == RegType::vgpr);
mubuf->glc = info->glc;
mubuf->dlc = info->glc && bld.program->chip_class >= GFX10;
- mubuf->barrier = info->barrier;
- mubuf->can_reorder = info->can_reorder;
+ mubuf->sync = info->sync;
mubuf->offset = const_offset;
mubuf->swizzled = info->swizzle_component_size != 0;
RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
mubuf->offset = 0;
mubuf->addr64 = offset.type() == RegType::vgpr;
mubuf->disable_wqm = false;
- mubuf->barrier = info->barrier;
+ mubuf->sync = info->sync;
mubuf->definitions[0] = Definition(val);
bld.insert(std::move(mubuf));
} else {
flat->operands[1] = Operand(s1);
flat->glc = info->glc;
flat->dlc = info->glc && bld.program->chip_class >= GFX10;
- flat->barrier = info->barrier;
+ flat->sync = info->sync;
flat->offset = 0u;
flat->definitions[0] = Definition(val);
bld.insert(std::move(flat));
LoadEmitInfo info = {Operand(as_vgpr(ctx, address)), dst, num_components, elem_size_bytes};
info.align_mul = align;
info.align_offset = 0;
- info.barrier = barrier_shared;
- info.can_reorder = false;
+ info.sync = memory_sync_info(storage_shared);
info.const_offset = base_offset;
emit_lds_load(ctx, bld, &info);
}
assert(inline_offset <= max_offset); /* offsets[i] shouldn't be large enough for this to happen */
+ Instruction *instr;
if (write2) {
Temp second_data = write_datas[second];
inline_offset /= data.bytes();
- bld.ds(op, address_offset, data, second_data, m, inline_offset, inline_offset + write2_off);
+ instr = bld.ds(op, address_offset, data, second_data, m, inline_offset, inline_offset + write2_off);
} else {
- bld.ds(op, address_offset, data, m, inline_offset);
+ instr = bld.ds(op, address_offset, data, m, inline_offset);
}
+ static_cast<DS_instruction *>(instr)->sync =
+ memory_sync_info(storage_shared);
}
}
/* idxen*/ false, /* addr64 */ false, /* disable_wqm */ false, /* glc */ true,
/* dlc*/ false, /* slc */ slc);
- static_cast<MUBUF_instruction *>(r.instr)->can_reorder = allow_reorder;
+ if (!allow_reorder)
+ static_cast<MUBUF_instruction *>(r.instr)->sync = memory_sync_info(storage_buffer, semantic_private);
}
void store_vmem_mubuf(isel_context *ctx, Temp src, Temp descriptor, Temp voffset, Temp soffset,
}
if (use_mubuf) {
- Instruction *mubuf = bld.mubuf(opcode,
- Definition(fetch_dst), list, fetch_index, soffset,
- fetch_offset, false, false, true).instr;
- static_cast<MUBUF_instruction*>(mubuf)->can_reorder = true;
+ bld.mubuf(opcode,
+           Definition(fetch_dst), list, fetch_index, soffset,
+           fetch_offset, false, false, true);
} else {
- Instruction *mtbuf = bld.mtbuf(opcode,
- Definition(fetch_dst), list, fetch_index, soffset,
- fetch_dfmt, nfmt, fetch_offset, false, true).instr;
- static_cast<MTBUF_instruction*>(mtbuf)->can_reorder = true;
+ bld.mtbuf(opcode,
+           Definition(fetch_dst), list, fetch_index, soffset,
+           fetch_dfmt, nfmt, fetch_offset, false, true);
}
emit_split_vector(ctx, fetch_dst, fetch_dst.size());
void load_buffer(isel_context *ctx, unsigned num_components, unsigned component_size,
Temp dst, Temp rsrc, Temp offset, unsigned align_mul, unsigned align_offset,
- bool glc=false, bool readonly=true, bool allow_smem=true)
+ bool glc=false, bool allow_smem=true, memory_sync_info sync=memory_sync_info())
{
Builder bld(ctx->program, ctx->block);
LoadEmitInfo info = {Operand(offset), dst, num_components, component_size, rsrc};
info.glc = glc;
- info.barrier = readonly ? barrier_none : barrier_buffer;
- info.can_reorder = readonly;
+ info.sync = sync;
info.align_mul = align_mul;
info.align_offset = align_offset;
if (use_smem)
load->unrm = true;
load->da = da;
load->dim = dim;
- load->can_reorder = true; /* fmask images shouldn't be modified */
ctx->block->instructions.emplace_back(std::move(load));
Operand sample_index4;
}
+memory_sync_info get_memory_sync_info(nir_intrinsic_instr *instr, storage_class storage, unsigned semantics)
+{
+ /* atomicrmw might not have NIR_INTRINSIC_ACCESS and there's nothing interesting there anyway */
+ if (semantics & semantic_atomicrmw)
+ return memory_sync_info(storage, semantics);
+
+ unsigned access = nir_intrinsic_access(instr);
+
+ if (access & ACCESS_VOLATILE)
+ semantics |= semantic_volatile;
+ if (access & ACCESS_CAN_REORDER)
+ semantics |= semantic_can_reorder | semantic_private;
+
+ return memory_sync_info(storage, semantics);
+}
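
A small self-contained sketch of the access-flag mapping above, with hypothetical stand-ins for the NIR access bits (the real ACCESS_VOLATILE/ACCESS_CAN_REORDER values come from NIR) and constants mirroring aco's memory_semantics: a volatile access keeps semantic_volatile, while ACCESS_CAN_REORDER yields semantic_can_reorder together with semantic_private:

#include <cassert>

enum { ACCESS_VOLATILE_BIT = 0x1, ACCESS_CAN_REORDER_BIT = 0x2 };       /* stand-ins, not NIR's values */
enum { SEM_VOLATILE = 0x4, SEM_PRIVATE = 0x8, SEM_CAN_REORDER = 0x10 }; /* mirror memory_semantics */

static unsigned access_to_semantics(unsigned access)
{
   unsigned semantics = 0;
   if (access & ACCESS_VOLATILE_BIT)
      semantics |= SEM_VOLATILE;
   if (access & ACCESS_CAN_REORDER_BIT)
      semantics |= SEM_CAN_REORDER | SEM_PRIVATE;
   return semantics;
}

int main()
{
   assert(access_to_semantics(ACCESS_VOLATILE_BIT) == SEM_VOLATILE);
   assert(access_to_semantics(ACCESS_CAN_REORDER_BIT) == (SEM_CAN_REORDER | SEM_PRIVATE));
   return 0;
}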
+
void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr)
{
Builder bld(ctx->program, ctx->block);
bool is_array = glsl_sampler_type_is_array(type);
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+ memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0);
+
if (dim == GLSL_SAMPLER_DIM_BUF) {
unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa);
unsigned num_channels = util_last_bit(mask);
load->idxen = true;
load->glc = var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT);
load->dlc = load->glc && ctx->options->chip_class >= GFX10;
- load->barrier = barrier_image;
+ load->sync = sync;
ctx->block->instructions.emplace_back(std::move(load));
expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, (1 << num_channels) - 1);
load->dmask = dmask;
load->unrm = true;
load->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
- load->barrier = barrier_image;
+ load->sync = sync;
ctx->block->instructions.emplace_back(std::move(load));
expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, dmask);
bool is_array = glsl_sampler_type_is_array(type);
Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
+ memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0);
bool glc = ctx->options->chip_class == GFX6 || var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE) ? 1 : 0;
if (dim == GLSL_SAMPLER_DIM_BUF) {
store->glc = glc;
store->dlc = false;
store->disable_wqm = true;
- store->barrier = barrier_image;
+ store->sync = sync;
ctx->program->needs_exact = true;
ctx->block->instructions.emplace_back(std::move(store));
return;
store->unrm = true;
store->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
store->disable_wqm = true;
- store->barrier = barrier_image;
+ store->sync = sync;
ctx->program->needs_exact = true;
ctx->block->instructions.emplace_back(std::move(store));
return;
}
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+ memory_sync_info sync = get_memory_sync_info(instr, storage_image, semantic_atomicrmw);
if (dim == GLSL_SAMPLER_DIM_BUF) {
Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
mubuf->glc = return_previous;
mubuf->dlc = false; /* Not needed for atomics */
mubuf->disable_wqm = true;
- mubuf->barrier = barrier_image;
+ mubuf->sync = sync;
ctx->program->needs_exact = true;
ctx->block->instructions.emplace_back(std::move(mubuf));
return;
mimg->unrm = true;
mimg->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
mimg->disable_wqm = true;
- mimg->barrier = barrier_image;
+ mimg->sync = sync;
ctx->program->needs_exact = true;
ctx->block->instructions.emplace_back(std::move(mimg));
return;
mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
mimg->dmask = (1 << instr->dest.ssa.num_components) - 1;
mimg->da = glsl_sampler_type_is_array(type);
- mimg->can_reorder = true;
Definition& def = mimg->definitions[0];
ctx->block->instructions.emplace_back(std::move(mimg));
allow_smem |= ((access & ACCESS_RESTRICT) && (access & ACCESS_NON_WRITEABLE)) || (access & ACCESS_CAN_REORDER);
load_buffer(ctx, num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),
- nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), glc, false, allow_smem);
+ nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), glc, allow_smem,
+ get_memory_sync_info(instr, storage_buffer, 0));
}
void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
+ memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0);
bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
uint32_t flags = get_all_buffer_resource_flags(ctx, instr->src[1].ssa, nir_intrinsic_access(instr));
/* GLC bypasses VMEM/SMEM caches, so GLC SMEM loads/stores are coherent with GLC VMEM loads/stores
store->glc = glc;
store->dlc = false;
store->disable_wqm = true;
- store->barrier = barrier_buffer;
+ store->sync = sync;
ctx->block->instructions.emplace_back(std::move(store));
ctx->program->wb_smem_l1_on_end = true;
if (op == aco_opcode::p_fs_buffer_store_smem) {
store->glc = glc;
store->dlc = false;
store->disable_wqm = true;
- store->barrier = barrier_buffer;
+ store->sync = sync;
ctx->program->needs_exact = true;
ctx->block->instructions.emplace_back(std::move(store));
}
mubuf->glc = return_previous;
mubuf->dlc = false; /* Not needed for atomics */
mubuf->disable_wqm = true;
- mubuf->barrier = barrier_buffer;
+ mubuf->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
ctx->program->needs_exact = true;
ctx->block->instructions.emplace_back(std::move(mubuf));
}
info.glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);
info.align_mul = nir_intrinsic_align_mul(instr);
info.align_offset = nir_intrinsic_align_offset(instr);
- info.barrier = barrier_buffer;
- info.can_reorder = false;
+ info.sync = get_memory_sync_info(instr, storage_buffer, 0);
/* VMEM stores don't update the SMEM cache and it's difficult to prove that
* it's safe to use SMEM */
bool can_use_smem = nir_intrinsic_access(instr) & ACCESS_NON_WRITEABLE;
Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
Temp addr = get_ssa_temp(ctx, instr->src[1].ssa);
+ memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0);
bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
if (ctx->options->chip_class >= GFX7)
flat->dlc = false;
flat->offset = offset;
flat->disable_wqm = true;
- flat->barrier = barrier_buffer;
+ flat->sync = sync;
ctx->program->needs_exact = true;
ctx->block->instructions.emplace_back(std::move(flat));
} else {
mubuf->offset = offsets[i];
mubuf->addr64 = addr.type() == RegType::vgpr;
mubuf->disable_wqm = true;
- mubuf->barrier = barrier_buffer;
+ mubuf->sync = sync;
ctx->program->needs_exact = true;
ctx->block->instructions.emplace_back(std::move(mubuf));
}
flat->dlc = false; /* Not needed for atomics */
flat->offset = 0;
flat->disable_wqm = true;
- flat->barrier = barrier_buffer;
+ flat->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
ctx->program->needs_exact = true;
ctx->block->instructions.emplace_back(std::move(flat));
} else {
mubuf->offset = 0;
mubuf->addr64 = addr.type() == RegType::vgpr;
mubuf->disable_wqm = true;
- mubuf->barrier = barrier_buffer;
+ mubuf->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
ctx->program->needs_exact = true;
ctx->block->instructions.emplace_back(std::move(mubuf));
}
void emit_memory_barrier(isel_context *ctx, nir_intrinsic_instr *instr) {
Builder bld(ctx->program, ctx->block);
+ storage_class all_mem = (storage_class)(storage_buffer | storage_image | storage_atomic_counter | storage_shared);
switch(instr->intrinsic) {
case nir_intrinsic_group_memory_barrier:
+ bld.barrier(aco_opcode::p_barrier,
+ memory_sync_info(all_mem, semantic_acqrel, scope_workgroup));
+ break;
case nir_intrinsic_memory_barrier:
- bld.barrier(aco_opcode::p_memory_barrier_common);
+ bld.barrier(aco_opcode::p_barrier,
+ memory_sync_info(all_mem, semantic_acqrel, scope_device));
break;
case nir_intrinsic_memory_barrier_buffer:
- bld.barrier(aco_opcode::p_memory_barrier_buffer);
- break;
case nir_intrinsic_memory_barrier_image:
- bld.barrier(aco_opcode::p_memory_barrier_image);
+ /* since NIR splits barriers, we have to unify buffer and image barriers
+ * for now so dEQP-VK.memory_model.message_passing.core11.u32.coherent.
+ * fence_fence.atomicwrite.device.payload_nonlocal.buffer.guard_nonlocal.image.comp
+ * passes
+ */
+ bld.barrier(aco_opcode::p_barrier,
+ memory_sync_info((storage_class)(storage_buffer | storage_image), semantic_acqrel, scope_device));
break;
case nir_intrinsic_memory_barrier_tcs_patch:
case nir_intrinsic_memory_barrier_shared:
- bld.barrier(aco_opcode::p_memory_barrier_shared);
+ bld.barrier(aco_opcode::p_barrier,
+ memory_sync_info(storage_shared, semantic_acqrel, scope_workgroup));
break;
default:
unreachable("Unimplemented memory barrier intrinsic");
ds->offset0 = offset;
if (return_previous)
ds->definitions[0] = Definition(get_ssa_temp(ctx, &instr->dest.ssa));
+ ds->sync = memory_sync_info(storage_shared, semantic_atomicrmw);
ctx->block->instructions.emplace_back(std::move(ds));
}
info.align_mul = nir_intrinsic_align_mul(instr);
info.align_offset = nir_intrinsic_align_offset(instr);
info.swizzle_component_size = ctx->program->chip_class <= GFX8 ? 4 : 0;
- info.can_reorder = false;
+ info.sync = memory_sync_info(storage_buffer, semantic_private);
info.soffset = ctx->program->scratch_offset;
emit_scratch_load(ctx, bld, &info);
}
for (unsigned i = 0; i < write_count; i++) {
aco_opcode op = get_buffer_store_op(false, write_datas[i].bytes());
- bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i], offsets[i], true, true);
+ Instruction *instr = bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i], offsets[i], true, true);
+ static_cast<MUBUF_instruction *>(instr)->sync = memory_sync_info(storage_buffer, semantic_private);
}
}
mtbuf->offset = const_offset;
mtbuf->glc = true;
mtbuf->slc = true;
- mtbuf->barrier = barrier_gs_data;
- mtbuf->can_reorder = true;
+ mtbuf->sync = memory_sync_info(storage_vmem_output, semantic_can_reorder);
bld.insert(std::move(mtbuf));
}
load->glc = false;
load->dlc = false;
load->disable_wqm = false;
- load->barrier = barrier_none;
- load->can_reorder = true;
ctx->block->instructions.emplace_back(std::move(load));
}
visit_get_buffer_size(ctx, instr);
break;
case nir_intrinsic_control_barrier: {
- if (ctx->program->chip_class == GFX6 && ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) {
- /* GFX6 only (thanks to a hw bug workaround):
- * The real barrier instruction isn’t needed, because an entire patch
- * always fits into a single wave.
- */
- break;
- }
-
- if (ctx->program->workgroup_size > ctx->program->wave_size)
- bld.sopp(aco_opcode::s_barrier);
-
+ bld.barrier(aco_opcode::p_barrier, memory_sync_info(0, 0, scope_invocation), scope_workgroup);
break;
}
case nir_intrinsic_memory_barrier_tcs_patch:
aco_opcode opcode =
nir_intrinsic_memory_scope(instr) == NIR_SCOPE_DEVICE ?
aco_opcode::s_memrealtime : aco_opcode::s_memtime;
- bld.smem(opcode, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), false);
+ bld.smem(opcode, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), memory_sync_info(0, semantic_volatile));
emit_split_vector(ctx, get_ssa_temp(ctx, &instr->dest.ssa), 2);
break;
}
tex->da = da;
tex->definitions[0] = Definition(tmp_dst);
tex->dim = dim;
- tex->can_reorder = true;
ctx->block->instructions.emplace_back(std::move(tex));
if (div_by_6) {
tex->da = da;
Temp size = bld.tmp(v2);
tex->definitions[0] = Definition(size);
- tex->can_reorder = true;
ctx->block->instructions.emplace_back(std::move(tex));
emit_split_vector(ctx, size, size.size());
mubuf->operands[2] = Operand((uint32_t) 0);
mubuf->definitions[0] = Definition(tmp_dst);
mubuf->idxen = true;
- mubuf->can_reorder = true;
ctx->block->instructions.emplace_back(std::move(mubuf));
expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, (1 << last_bit) - 1);
tex->unrm = true;
tex->da = da;
tex->definitions[0] = Definition(tmp_dst);
- tex->can_reorder = true;
ctx->block->instructions.emplace_back(std::move(tex));
if (instr->op == nir_texop_samples_identical) {
tex->dmask = dmask;
tex->da = da;
tex->definitions[0] = Definition(tmp_dst);
- tex->can_reorder = true;
ctx->block->instructions.emplace_back(std::move(tex));
if (tg4_integer_cube_workaround) {
create_null_export(ctx);
}
+static void create_workgroup_barrier(Builder& bld)
+{
+ bld.barrier(aco_opcode::p_barrier,
+ memory_sync_info(storage_shared, semantic_acqrel, scope_workgroup),
+ scope_workgroup);
+}
+
static void write_tcs_tess_factors(isel_context *ctx)
{
unsigned outer_comps;
Builder bld(ctx->program, ctx->block);
- bld.barrier(aco_opcode::p_memory_barrier_shared);
- if (unlikely(ctx->program->chip_class != GFX6 && ctx->program->workgroup_size > ctx->program->wave_size))
- bld.sopp(aco_opcode::s_barrier);
+ create_workgroup_barrier(bld);
Temp tcs_rel_ids = get_arg(ctx, ctx->args->ac.tcs_rel_ids);
Temp invocation_id = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), tcs_rel_ids, Operand(8u), Operand(5u));
store->glc = true;
store->dlc = false;
store->slc = true;
- store->can_reorder = true;
ctx->block->instructions.emplace_back(std::move(store));
}
}
if (ctx->stage == ngg_vertex_gs) {
/* Wait for GS threads to store primitive ID in LDS. */
- bld.barrier(aco_opcode::p_memory_barrier_shared);
- bld.sopp(aco_opcode::s_barrier);
+ create_workgroup_barrier(bld);
/* Calculate LDS address where the GS threads stored the primitive ID. */
Temp wave_id_in_tg = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
if (i) {
Builder bld(ctx.program, ctx.block);
- bld.barrier(aco_opcode::p_memory_barrier_shared);
- bld.sopp(aco_opcode::s_barrier);
+ create_workgroup_barrier(bld);
if (ctx.stage == vertex_geometry_gs || ctx.stage == tess_eval_geometry_gs) {
ctx.gs_wave_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, m0), bld.def(s1, scc), get_arg(&ctx, args->merged_wave_info), Operand((8u << 16) | 16u));
ngg_emit_nogs_output(&ctx);
} else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
Builder bld(ctx.program, ctx.block);
- bld.barrier(aco_opcode::p_memory_barrier_gs_data);
+ bld.barrier(aco_opcode::p_barrier,
+ memory_sync_info(storage_vmem_output, semantic_release, scope_device));
bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx.gs_wave_id), -1, sendmsg_gs_done(false, false, 0));
} else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
write_tcs_tess_factors(&ctx);
ctx.block->kind |= block_kind_uniform;
Builder bld(ctx.program, ctx.block);
if (ctx.program->wb_smem_l1_on_end)
- bld.smem(aco_opcode::s_dcache_wb, false);
+ bld.smem(aco_opcode::s_dcache_wb, memory_sync_info(storage_buffer, semantic_volatile));
bld.sopp(aco_opcode::s_endpgm);
cleanup_cfg(program);
mubuf->glc = true;
mubuf->slc = true;
mubuf->dlc = args->options->chip_class >= GFX10;
- mubuf->barrier = barrier_none;
- mubuf->can_reorder = true;
ctx.outputs.mask[i] |= 1 << j;
ctx.outputs.temps[i * 4u + j] = mubuf->definitions[0].getTemp();
program->next_fp_mode.round32 = fp_round_ne;
}
+memory_sync_info get_sync_info(const Instruction* instr)
+{
+ switch (instr->format) {
+ case Format::SMEM:
+ return static_cast<const SMEM_instruction*>(instr)->sync;
+ case Format::MUBUF:
+ return static_cast<const MUBUF_instruction*>(instr)->sync;
+ case Format::MIMG:
+ return static_cast<const MIMG_instruction*>(instr)->sync;
+ case Format::MTBUF:
+ return static_cast<const MTBUF_instruction*>(instr)->sync;
+ case Format::FLAT:
+ case Format::GLOBAL:
+ case Format::SCRATCH:
+ return static_cast<const FLAT_instruction*>(instr)->sync;
+ case Format::DS:
+ return static_cast<const DS_instruction*>(instr)->sync;
+ default:
+ return memory_sync_info();
+ }
+}
+
bool can_use_SDWA(chip_class chip, const aco_ptr<Instruction>& instr)
{
if (!instr->isVALU())
SDWA = 1 << 14,
};
-enum barrier_interaction : uint8_t {
- barrier_none = 0,
- barrier_buffer = 0x1,
- barrier_image = 0x2,
- barrier_atomic = 0x4,
- barrier_shared = 0x8,
- /* used for geometry shaders to ensure vertex data writes are before the
- * GS_DONE s_sendmsg. */
- barrier_gs_data = 0x10,
- /* used for geometry shaders to ensure s_sendmsg instructions are in-order. */
- barrier_gs_sendmsg = 0x20,
- /* used by barriers. created by s_barrier */
- barrier_barrier = 0x40,
- barrier_count = 7,
+enum storage_class : uint8_t {
+ storage_none = 0x0, /* no synchronization and can be reordered around aliasing stores */
+ storage_buffer = 0x1, /* SSBOs and global memory */
+ storage_atomic_counter = 0x2, /* not used for Vulkan */
+ storage_image = 0x4,
+ storage_shared = 0x8, /* or TCS output */
+ storage_vmem_output = 0x10, /* GS or TCS output stores using VMEM */
+ storage_scratch = 0x20,
+ storage_vgpr_spill = 0x40,
+ storage_count = 8,
};
+enum memory_semantics : uint8_t {
+ semantic_none = 0x0,
+ /* for loads: don't move any access after this load to before this load (even other loads)
+ * for barriers: don't move any access after the barrier to before any
+ * atomics/control_barriers/sendmsg_gs_done before the barrier */
+ semantic_acquire = 0x1,
+ /* for stores: don't move any access before this store to after this store
+ * for barriers: don't move any access before the barrier to after any
+ * atomics/control_barriers/sendmsg_gs_done after the barrier */
+ semantic_release = 0x2,
+
+ /* the rest are for load/stores/atomics only */
+ /* cannot be DCE'd or CSE'd */
+ semantic_volatile = 0x4,
+ /* does not interact with barriers and assumes this lane is the only lane
+ * accessing this memory */
+ semantic_private = 0x8,
+ /* this operation can be reordered around operations of the same storage. says nothing about barriers */
+ semantic_can_reorder = 0x10,
+ /* this is an atomic instruction (may only read or write memory) */
+ semantic_atomic = 0x20,
+ /* this instruction both reads and writes memory */
+ semantic_rmw = 0x40,
+
+ semantic_acqrel = semantic_acquire | semantic_release,
+ semantic_atomicrmw = semantic_volatile | semantic_atomic | semantic_rmw,
+};
+
+enum sync_scope : uint8_t {
+ scope_invocation = 0,
+ scope_subgroup = 1,
+ scope_workgroup = 2,
+ scope_queuefamily = 3,
+ scope_device = 4,
+};
+
+struct memory_sync_info {
+ memory_sync_info() : storage(storage_none), semantics(semantic_none), scope(scope_invocation) {}
+ memory_sync_info(int storage, int semantics=0, sync_scope scope=scope_invocation)
+ : storage((storage_class)storage), semantics((memory_semantics)semantics), scope(scope) {}
+
+ storage_class storage:8;
+ memory_semantics semantics:8;
+ sync_scope scope:8;
+
+ bool operator == (const memory_sync_info& rhs) const {
+ return storage == rhs.storage &&
+ semantics == rhs.semantics &&
+ scope == rhs.scope;
+ }
+
+ bool can_reorder() const {
+ if (semantics & semantic_acqrel)
+ return false;
+ /* Also check storage so that zero-initialized memory_sync_info can be
+ * reordered. */
+ return (!storage || (semantics & semantic_can_reorder)) && !(semantics & semantic_volatile);
+ }
+};
+static_assert(sizeof(memory_sync_info) == 3, "Unexpected padding");
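
To make can_reorder()'s intended behaviour concrete, here is a minimal standalone check; the STORAGE_*/SEM_* constants are local stand-ins mirroring the enum values above (an illustration, not the Mesa header):

#include <cassert>

enum { STORAGE_NONE = 0x0, STORAGE_BUFFER = 0x1 };
enum { SEM_NONE = 0x0, SEM_ACQUIRE = 0x1, SEM_RELEASE = 0x2,
       SEM_VOLATILE = 0x4, SEM_CAN_REORDER = 0x10,
       SEM_ACQREL = SEM_ACQUIRE | SEM_RELEASE };

static bool can_reorder(unsigned storage, unsigned semantics)
{
   if (semantics & SEM_ACQREL)
      return false;
   return (!storage || (semantics & SEM_CAN_REORDER)) && !(semantics & SEM_VOLATILE);
}

int main()
{
   assert(can_reorder(STORAGE_NONE, SEM_NONE));           /* zero-initialized sync info */
   assert(!can_reorder(STORAGE_BUFFER, SEM_NONE));        /* plain SSBO access */
   assert(can_reorder(STORAGE_BUFFER, SEM_CAN_REORDER));  /* ACCESS_CAN_REORDER load */
   assert(!can_reorder(STORAGE_BUFFER, SEM_CAN_REORDER | SEM_VOLATILE));
   assert(!can_reorder(STORAGE_NONE, SEM_ACQUIRE));       /* acquire/release always pin ordering */
   return 0;
}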
+
enum fp_round {
fp_round_ne = 0,
fp_round_pi = 1,
*
*/
struct SMEM_instruction : public Instruction {
- barrier_interaction barrier;
+ memory_sync_info sync;
bool glc : 1; /* VI+: globally coherent */
bool dlc : 1; /* NAVI: device level coherent */
bool nv : 1; /* VEGA only: Non-volatile */
- bool can_reorder : 1;
bool disable_wqm : 1;
bool prevent_overflow : 1; /* avoid overflow when combining additions */
- uint32_t padding: 18;
+ uint32_t padding: 3;
};
static_assert(sizeof(SMEM_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
*
*/
struct DS_instruction : public Instruction {
+ memory_sync_info sync;
+ bool gds;
int16_t offset0;
int8_t offset1;
- bool gds;
+ uint8_t padding;
};
-static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
+static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
/**
* Vector Memory Untyped-buffer Instructions
*
*/
struct MUBUF_instruction : public Instruction {
- uint16_t offset : 12; /* Unsigned byte offset - 12 bit */
+ memory_sync_info sync;
bool offen : 1; /* Supply an offset from VGPR (VADDR) */
bool idxen : 1; /* Supply an index from VGPR (VADDR) */
bool addr64 : 1; /* SI, CIK: Address size is 64-bit */
bool tfe : 1; /* texture fail enable */
bool lds : 1; /* Return read-data to LDS instead of VGPRs */
bool disable_wqm : 1; /* Require an exec mask without helper invocations */
- bool can_reorder : 1;
- bool swizzled:1;
- uint8_t padding : 1;
- barrier_interaction barrier;
+ uint16_t offset : 12; /* Unsigned byte offset - 12 bit */
+ bool swizzled : 1;
+ uint32_t padding1 : 18;
};
-static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
+static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
/**
* Vector Memory Typed-buffer Instructions
*
*/
struct MTBUF_instruction : public Instruction {
- uint16_t offset; /* Unsigned byte offset - 12 bit */
- barrier_interaction barrier;
+ memory_sync_info sync;
uint8_t dfmt : 4; /* Data Format of data in memory buffer */
uint8_t nfmt : 3; /* Numeric format of data in memory */
bool offen : 1; /* Supply an offset from VGPR (VADDR) */
bool slc : 1; /* system level coherent */
bool tfe : 1; /* texture fail enable */
bool disable_wqm : 1; /* Require an exec mask without helper invocations */
- bool can_reorder : 1;
- uint32_t padding : 25;
+ uint32_t padding : 10;
+ uint16_t offset; /* Unsigned byte offset - 12 bit */
};
static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
*
*/
struct MIMG_instruction : public Instruction {
+ memory_sync_info sync;
uint8_t dmask; /* Data VGPR enable mask */
uint8_t dim : 3; /* NAVI: dimensionality */
bool unrm : 1; /* Force address to be un-normalized */
bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */
bool d16 : 1; /* Convert 32-bit data to 16-bit data */
bool disable_wqm : 1; /* Require an exec mask without helper invocations */
- bool can_reorder : 1;
- uint8_t padding : 1;
- barrier_interaction barrier;
+ uint32_t padding : 18;
};
-static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
+static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
/**
* Flat/Scratch/Global Instructions
*
*/
struct FLAT_instruction : public Instruction {
- uint16_t offset; /* Vega/Navi only */
+ memory_sync_info sync;
bool slc : 1; /* system level coherent */
bool glc : 1; /* globally coherent */
bool dlc : 1; /* NAVI: device level coherent */
bool lds : 1;
bool nv : 1;
bool disable_wqm : 1; /* Require an exec mask without helper invocations */
- bool can_reorder : 1;
- uint8_t padding : 1;
- barrier_interaction barrier;
+ uint32_t padding0 : 2;
+ uint16_t offset; /* Vega/Navi only */
+ uint16_t padding1;
};
-static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
+static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
struct Export_instruction : public Instruction {
uint8_t enabled_mask;
static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
struct Pseudo_barrier_instruction : public Instruction {
+ memory_sync_info sync;
+ sync_scope exec_scope;
};
-static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
+static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
enum ReduceOp : uint16_t {
iadd8, iadd16, iadd32, iadd64,
return is_phi(instr.get());
}
-barrier_interaction get_barrier_interaction(const Instruction* instr);
+memory_sync_info get_sync_info(const Instruction* instr);
+
bool is_dead(const std::vector<uint16_t>& uses, Instruction *instr);
bool can_use_opsel(chip_class chip, aco_opcode op, int idx, bool high);
emit_gfx10_wave64_bpermute(program, instr, bld);
else
unreachable("Current hardware supports ds_bpermute, don't emit p_bpermute.");
+ break;
}
default:
break;
reduce->operands[2].physReg(), // vtmp
reduce->definitions[2].physReg(), // sitmp
reduce->operands[0], reduce->definitions[0]);
+ } else if (instr->format == Format::PSEUDO_BARRIER) {
+ Pseudo_barrier_instruction* barrier = static_cast<Pseudo_barrier_instruction*>(instr.get());
+
+ /* Anything larger than a workgroup isn't possible. Anything
+ * smaller requires no instructions and this pseudo instruction
+ * would only be included to control optimizations. */
+ bool emit_s_barrier = barrier->exec_scope == scope_workgroup &&
+ program->workgroup_size > program->wave_size;
+
+ bld.insert(std::move(instr));
+ if (emit_s_barrier)
+ bld.sopp(aco_opcode::s_barrier);
} else if (instr->opcode == aco_opcode::p_cvt_f16_f32_rtne) {
float_mode new_mode = block->fp_mode;
new_mode.round16_64 = fp_round_ne;
return [('uint32_t', 'block', '-1'),
('uint32_t', 'imm', '0')]
elif self == Format.SMEM:
- return [('bool', 'can_reorder', 'true'),
+ return [('memory_sync_info', 'sync', 'memory_sync_info()'),
('bool', 'glc', 'false'),
('bool', 'dlc', 'false'),
('bool', 'nv', 'false')]
elif self == Format.PSEUDO_REDUCTION:
return [('ReduceOp', 'op', None, 'reduce_op'),
('unsigned', 'cluster_size', '0')]
+ elif self == Format.PSEUDO_BARRIER:
+ return [('memory_sync_info', 'sync', None),
+ ('sync_scope', 'exec_scope', 'scope_invocation')]
elif self == Format.VINTRP:
return [('unsigned', 'attribute', None),
('unsigned', 'component', None)]
('bool', 'bound_ctrl', 'true')]
elif self in [Format.FLAT, Format.GLOBAL, Format.SCRATCH]:
return [('uint16_t', 'offset', 0),
- ('bool', 'can_reorder', 'true'),
+ ('memory_sync_info', 'sync', 'memory_sync_info()'),
('bool', 'glc', 'false'),
('bool', 'slc', 'false'),
('bool', 'lds', 'false'),
opcode("p_cbranch_z", format=Format.PSEUDO_BRANCH)
opcode("p_cbranch_nz", format=Format.PSEUDO_BRANCH)
-opcode("p_memory_barrier_common", format=Format.PSEUDO_BARRIER) # atomic, buffer, image and shared
-opcode("p_memory_barrier_atomic", format=Format.PSEUDO_BARRIER)
-opcode("p_memory_barrier_buffer", format=Format.PSEUDO_BARRIER)
-opcode("p_memory_barrier_image", format=Format.PSEUDO_BARRIER)
-opcode("p_memory_barrier_shared", format=Format.PSEUDO_BARRIER)
-opcode("p_memory_barrier_gs_data", format=Format.PSEUDO_BARRIER)
-opcode("p_memory_barrier_gs_sendmsg", format=Format.PSEUDO_BARRIER)
+opcode("p_barrier", format=Format.PSEUDO_BARRIER)
opcode("p_spill")
opcode("p_reload")
return aK->imm == bK->imm;
}
case Format::SMEM: {
+ if (!a->operands.empty() && a->operands[0].bytes() == 16)
+ return false;
SMEM_instruction* aS = static_cast<SMEM_instruction*>(a);
SMEM_instruction* bS = static_cast<SMEM_instruction*>(b);
/* isel shouldn't be creating situations where this assertion fails */
assert(aS->prevent_overflow == bS->prevent_overflow);
- return aS->can_reorder && bS->can_reorder &&
- aS->glc == bS->glc && aS->nv == bS->nv &&
+ return aS->sync.can_reorder() && bS->sync.can_reorder() &&
+ aS->sync == bS->sync && aS->glc == bS->glc && aS->dlc == bS->dlc &&
+ aS->nv == bS->nv && aS->disable_wqm == bS->disable_wqm &&
aS->prevent_overflow == bS->prevent_overflow;
}
case Format::VINTRP: {
case Format::MTBUF: {
MTBUF_instruction* aM = static_cast<MTBUF_instruction *>(a);
MTBUF_instruction* bM = static_cast<MTBUF_instruction *>(b);
- return aM->can_reorder && bM->can_reorder &&
- aM->barrier == bM->barrier &&
+ return aM->sync.can_reorder() && bM->sync.can_reorder() &&
+ aM->sync == bM->sync &&
aM->dfmt == bM->dfmt &&
aM->nfmt == bM->nfmt &&
aM->offset == bM->offset &&
case Format::MUBUF: {
MUBUF_instruction* aM = static_cast<MUBUF_instruction *>(a);
MUBUF_instruction* bM = static_cast<MUBUF_instruction *>(b);
- return aM->can_reorder && bM->can_reorder &&
- aM->barrier == bM->barrier &&
+ return aM->sync.can_reorder() && bM->sync.can_reorder() &&
+ aM->sync == bM->sync &&
aM->offset == bM->offset &&
aM->offen == bM->offen &&
aM->idxen == bM->idxen &&
return false;
DS_instruction* aD = static_cast<DS_instruction *>(a);
DS_instruction* bD = static_cast<DS_instruction *>(b);
- return aD->pass_flags == bD->pass_flags &&
+ return aD->sync.can_reorder() && bD->sync.can_reorder() &&
+ aD->sync == bD->sync &&
+ aD->pass_flags == bD->pass_flags &&
aD->gds == bD->gds &&
aD->offset0 == bD->offset0 &&
aD->offset1 == bD->offset1;
case Format::MIMG: {
MIMG_instruction* aM = static_cast<MIMG_instruction*>(a);
MIMG_instruction* bM = static_cast<MIMG_instruction*>(b);
- return aM->can_reorder && bM->can_reorder &&
- aM->barrier == bM->barrier &&
+ return aM->sync.can_reorder() && bM->sync.can_reorder() &&
+ aM->sync == bM->sync &&
aM->dmask == bM->dmask &&
aM->unrm == bM->unrm &&
aM->glc == bM->glc &&
new_instr->operands.back() = Operand(base);
if (!smem->definitions.empty())
new_instr->definitions[0] = smem->definitions[0];
- new_instr->can_reorder = smem->can_reorder;
- new_instr->barrier = smem->barrier;
+ new_instr->sync = smem->sync;
new_instr->glc = smem->glc;
new_instr->dlc = smem->dlc;
new_instr->nv = smem->nv;
print_physReg(definition->physReg(), definition->bytes(), output);
}
-static void print_barrier_reorder(bool can_reorder, barrier_interaction barrier, FILE *output)
+static void print_storage(storage_class storage, FILE *output)
{
- if (can_reorder)
- fprintf(output, " reorder");
+ fprintf(output, " storage:");
+ int printed = 0;
+ if (storage & storage_buffer)
+ printed += fprintf(output, "%sbuffer", printed ? "," : "");
+ if (storage & storage_atomic_counter)
+ printed += fprintf(output, "%satomic_counter", printed ? "," : "");
+ if (storage & storage_image)
+ printed += fprintf(output, "%simage", printed ? "," : "");
+ if (storage & storage_shared)
+ printed += fprintf(output, "%sshared", printed ? "," : "");
+ if (storage & storage_vmem_output)
+ printed += fprintf(output, "%svmem_output", printed ? "," : "");
+ if (storage & storage_scratch)
+ printed += fprintf(output, "%sscratch", printed ? "," : "");
+ if (storage & storage_vgpr_spill)
+ printed += fprintf(output, "%svgpr_spill", printed ? "," : "");
+}
+
+static void print_semantics(memory_semantics sem, FILE *output)
+{
+ fprintf(output, " semantics:");
+ int printed = 0;
+ if (sem & semantic_acquire)
+ printed += fprintf(output, "%sacquire", printed ? "," : "");
+ if (sem & semantic_release)
+ printed += fprintf(output, "%srelease", printed ? "," : "");
+ if (sem & semantic_volatile)
+ printed += fprintf(output, "%svolatile", printed ? "," : "");
+ if (sem & semantic_private)
+ printed += fprintf(output, "%sprivate", printed ? "," : "");
+ if (sem & semantic_can_reorder)
+ printed += fprintf(output, "%sreorder", printed ? "," : "");
+ if (sem & semantic_atomic)
+ printed += fprintf(output, "%satomic", printed ? "," : "");
+ if (sem & semantic_rmw)
+ printed += fprintf(output, "%srmw", printed ? "," : "");
+}
+
+static void print_scope(sync_scope scope, FILE *output, const char *prefix="scope")
+{
+ fprintf(output, " %s:", prefix);
+ switch (scope) {
+ case scope_invocation:
+ fprintf(output, "invocation");
+ break;
+ case scope_subgroup:
+ fprintf(output, "subgroup");
+ break;
+ case scope_workgroup:
+ fprintf(output, "workgroup");
+ break;
+ case scope_queuefamily:
+ fprintf(output, "queuefamily");
+ break;
+ case scope_device:
+ fprintf(output, "device");
+ break;
+ }
+}
- if (barrier & barrier_buffer)
- fprintf(output, " buffer");
- if (barrier & barrier_image)
- fprintf(output, " image");
- if (barrier & barrier_atomic)
- fprintf(output, " atomic");
- if (barrier & barrier_shared)
- fprintf(output, " shared");
- if (barrier & barrier_gs_data)
- fprintf(output, " gs_data");
- if (barrier & barrier_gs_sendmsg)
- fprintf(output, " gs_sendmsg");
+static void print_sync(memory_sync_info sync, FILE *output)
+{
+ print_storage(sync.storage, output);
+ print_semantics(sync.semantics, output);
+ print_scope(sync.scope, output);
}
static void print_instr_format_specific(const Instruction *instr, FILE *output)
fprintf(output, " dlc");
if (smem->nv)
fprintf(output, " nv");
- print_barrier_reorder(smem->can_reorder, smem->barrier, output);
+ print_sync(smem->sync, output);
break;
}
case Format::VINTRP: {
fprintf(output, " offset1:%u", ds->offset1);
if (ds->gds)
fprintf(output, " gds");
+ print_sync(ds->sync, output);
break;
}
case Format::MUBUF: {
fprintf(output, " lds");
if (mubuf->disable_wqm)
fprintf(output, " disable_wqm");
- print_barrier_reorder(mubuf->can_reorder, mubuf->barrier, output);
+ print_sync(mubuf->sync, output);
break;
}
case Format::MIMG: {
fprintf(output, " d16");
if (mimg->disable_wqm)
fprintf(output, " disable_wqm");
- print_barrier_reorder(mimg->can_reorder, mimg->barrier, output);
+ print_sync(mimg->sync, output);
break;
}
case Format::EXP: {
fprintf(output, " cluster_size:%u", reduce->cluster_size);
break;
}
+ case Format::PSEUDO_BARRIER: {
+ const Pseudo_barrier_instruction* barrier = static_cast<const Pseudo_barrier_instruction*>(instr);
+ print_sync(barrier->sync, output);
+ print_scope(barrier->exec_scope, output, "exec_scope");
+ break;
+ }
case Format::FLAT:
case Format::GLOBAL:
case Format::SCRATCH: {
fprintf(output, " nv");
if (flat->disable_wqm)
fprintf(output, " disable_wqm");
- print_barrier_reorder(flat->can_reorder, flat->barrier, output);
+ print_sync(flat->sync, output);
break;
}
case Format::MTBUF: {
fprintf(output, " tfe");
if (mtbuf->disable_wqm)
fprintf(output, " disable_wqm");
- print_barrier_reorder(mtbuf->can_reorder, mtbuf->barrier, output);
+ print_sync(mtbuf->sync, output);
break;
}
case Format::VOP3P: {
source_idx++;
}
-bool can_reorder(Instruction* candidate)
-{
- switch (candidate->format) {
- case Format::SMEM:
- return static_cast<SMEM_instruction*>(candidate)->can_reorder;
- case Format::MUBUF:
- return static_cast<MUBUF_instruction*>(candidate)->can_reorder;
- case Format::MIMG:
- return static_cast<MIMG_instruction*>(candidate)->can_reorder;
- case Format::MTBUF:
- return static_cast<MTBUF_instruction*>(candidate)->can_reorder;
- case Format::FLAT:
- case Format::GLOBAL:
- case Format::SCRATCH:
- return static_cast<FLAT_instruction*>(candidate)->can_reorder;
- default:
- return true;
- }
-}
-
bool is_gs_or_done_sendmsg(const Instruction *instr)
{
if (instr->opcode == aco_opcode::s_sendmsg) {
return false;
}
-barrier_interaction get_barrier_interaction(const Instruction* instr)
+memory_sync_info get_sync_info_with_hack(const Instruction* instr)
{
- switch (instr->format) {
- case Format::SMEM:
- return static_cast<const SMEM_instruction*>(instr)->barrier;
- case Format::MUBUF:
- return static_cast<const MUBUF_instruction*>(instr)->barrier;
- case Format::MIMG:
- return static_cast<const MIMG_instruction*>(instr)->barrier;
- case Format::MTBUF:
- return static_cast<const MTBUF_instruction*>(instr)->barrier;
- case Format::FLAT:
- case Format::GLOBAL:
- case Format::SCRATCH:
- return static_cast<const FLAT_instruction*>(instr)->barrier;
- case Format::DS:
- return barrier_shared;
- case Format::SOPP:
- if (is_done_sendmsg(instr))
- return (barrier_interaction)(barrier_gs_data | barrier_gs_sendmsg);
- else if (is_gs_or_done_sendmsg(instr))
- return barrier_gs_sendmsg;
- else
- return barrier_none;
- case Format::PSEUDO_BARRIER:
- return barrier_barrier;
- default:
- return barrier_none;
+ memory_sync_info sync = get_sync_info(instr);
+ if (instr->format == Format::SMEM && !instr->operands.empty() && instr->operands[0].bytes() == 16) {
+ // FIXME: currently, it doesn't seem beneficial to omit this due to how our scheduler works
+ sync.storage = (storage_class)(sync.storage | storage_buffer);
+ sync.semantics = (memory_semantics)(sync.semantics | semantic_private);
}
+ return sync;
}
-barrier_interaction parse_barrier(Instruction *instr)
-{
- if (instr->format == Format::PSEUDO_BARRIER) {
- switch (instr->opcode) {
- case aco_opcode::p_memory_barrier_atomic:
- return barrier_atomic;
- /* For now, buffer and image barriers are treated the same. this is because of
- * dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.device.payload_nonlocal.buffer.guard_nonlocal.image.comp
- * which seems to use an image load to determine if the result of a buffer load is valid. So the ordering of the two loads is important.
- * I /think/ we should probably eventually expand the meaning of a buffer barrier so that all buffer operations before it, must stay before it
- * and that both image and buffer operations after it, must stay after it. We should also do the same for image barriers.
- * Or perhaps the problem is that we don't have a combined barrier instruction for both buffers and images, but the CTS test expects us to?
- * Either way, this solution should work. */
- case aco_opcode::p_memory_barrier_buffer:
- case aco_opcode::p_memory_barrier_image:
- return (barrier_interaction)(barrier_image | barrier_buffer);
- case aco_opcode::p_memory_barrier_shared:
- return barrier_shared;
- case aco_opcode::p_memory_barrier_common:
- return (barrier_interaction)(barrier_image | barrier_buffer | barrier_shared | barrier_atomic);
- case aco_opcode::p_memory_barrier_gs_data:
- return barrier_gs_data;
- case aco_opcode::p_memory_barrier_gs_sendmsg:
- return barrier_gs_sendmsg;
- default:
- break;
- }
- } else if (instr->opcode == aco_opcode::s_barrier) {
- return (barrier_interaction)(barrier_barrier | barrier_image | barrier_buffer | barrier_shared | barrier_atomic);
- }
- return barrier_none;
-}
+struct memory_event_set {
+ bool has_control_barrier;
+
+ unsigned bar_acquire;
+ unsigned bar_release;
+ unsigned bar_classes;
+
+ unsigned access_acquire;
+ unsigned access_release;
+ unsigned access_relaxed;
+ unsigned access_atomic;
+};
struct hazard_query {
bool contains_spill;
- int barriers;
- int barrier_interaction;
- bool can_reorder_vmem;
- bool can_reorder_smem;
+ bool contains_sendmsg;
+ memory_event_set mem_events;
+ unsigned aliasing_storage; /* storage classes which are accessed (non-SMEM) */
+ unsigned aliasing_storage_smem; /* storage classes which are accessed (SMEM) */
};
void init_hazard_query(hazard_query *query) {
query->contains_spill = false;
- query->barriers = 0;
- query->barrier_interaction = 0;
- query->can_reorder_vmem = true;
- query->can_reorder_smem = true;
+ query->contains_sendmsg = false;
+ memset(&query->mem_events, 0, sizeof(query->mem_events));
+ query->aliasing_storage = 0;
+ query->aliasing_storage_smem = 0;
+}
+
+void add_memory_event(memory_event_set *set, Instruction *instr, memory_sync_info *sync)
+{
+ set->has_control_barrier |= is_done_sendmsg(instr);
+ if (instr->opcode == aco_opcode::p_barrier) {
+ Pseudo_barrier_instruction *bar = static_cast<Pseudo_barrier_instruction*>(instr);
+ if (bar->sync.semantics & semantic_acquire)
+ set->bar_acquire |= bar->sync.storage;
+ if (bar->sync.semantics & semantic_release)
+ set->bar_release |= bar->sync.storage;
+ set->bar_classes |= bar->sync.storage;
+
+ set->has_control_barrier |= bar->exec_scope > scope_invocation;
+ }
+
+ if (!sync->storage)
+ return;
+
+ if (sync->semantics & semantic_acquire)
+ set->access_acquire |= sync->storage;
+ if (sync->semantics & semantic_release)
+ set->access_release |= sync->storage;
+
+ if (!(sync->semantics & semantic_private)) {
+ if (sync->semantics & semantic_atomic)
+ set->access_atomic |= sync->storage;
+ else
+ set->access_relaxed |= sync->storage;
+ }
}
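
A standalone sketch of the semantic_private carve-out above, reduced to plain flags (event_set and add_access() are simplified stand-ins): private traffic such as scratch and VGPR spills never lands in access_relaxed/access_atomic, so it stays invisible to the barrier rules in perform_hazard_query below:

#include <cassert>

enum { STORAGE_BUFFER = 0x1, STORAGE_VGPR_SPILL = 0x40 };
enum { SEM_PRIVATE = 0x8, SEM_ATOMIC = 0x20 };

struct event_set { unsigned access_relaxed = 0, access_atomic = 0; };

static void add_access(event_set *set, unsigned storage, unsigned semantics)
{
   if (semantics & SEM_PRIVATE)
      return;                        /* private: does not interact with barriers */
   if (semantics & SEM_ATOMIC)
      set->access_atomic |= storage;
   else
      set->access_relaxed |= storage;
}

int main()
{
   event_set set;
   add_access(&set, STORAGE_VGPR_SPILL, SEM_PRIVATE); /* spill store */
   add_access(&set, STORAGE_BUFFER, 0);               /* ordinary SSBO load */
   assert(set.access_relaxed == STORAGE_BUFFER);
   assert(set.access_atomic == 0);
   return 0;
}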
void add_to_hazard_query(hazard_query *query, Instruction *instr)
{
- query->barriers |= parse_barrier(instr);
- query->barrier_interaction |= get_barrier_interaction(instr);
if (instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload)
query->contains_spill = true;
+ query->contains_sendmsg |= instr->opcode == aco_opcode::s_sendmsg;
+
+ memory_sync_info sync = get_sync_info_with_hack(instr);
- bool can_reorder_instr = can_reorder(instr);
- query->can_reorder_smem &= instr->format != Format::SMEM || can_reorder_instr;
- query->can_reorder_vmem &= !(instr->isVMEM() || instr->isFlatOrGlobal()) || can_reorder_instr;
+ add_memory_event(&query->mem_events, instr, &sync);
+
+ if (!(sync.semantics & semantic_can_reorder)) {
+ unsigned storage = sync.storage;
+ /* images and buffer/global memory can alias */ //TODO: more precisely, buffer images and buffer/global memory can alias
+ if (storage & (storage_buffer | storage_image))
+ storage |= storage_buffer | storage_image;
+ if (instr->format == Format::SMEM)
+ query->aliasing_storage_smem |= storage;
+ else
+ query->aliasing_storage |= storage;
+ }
}
enum HazardResult {
hazard_fail_unreorderable,
};
-HazardResult perform_hazard_query(hazard_query *query, Instruction *instr)
+HazardResult perform_hazard_query(hazard_query *query, Instruction *instr, bool upwards)
{
- bool can_reorder_candidate = can_reorder(instr);
-
if (instr->opcode == aco_opcode::p_exit_early_if)
return hazard_fail_exec;
for (const Definition& def : instr->definitions) {
instr->opcode == aco_opcode::s_setprio)
return hazard_fail_unreorderable;
- barrier_interaction bar = parse_barrier(instr);
- if (query->barrier_interaction && (query->barrier_interaction & bar))
+ memory_event_set instr_set;
+ memset(&instr_set, 0, sizeof(instr_set));
+ memory_sync_info sync = get_sync_info_with_hack(instr);
+ add_memory_event(&instr_set, instr, &sync);
+
+ memory_event_set *first = &instr_set;
+ memory_event_set *second = &query->mem_events;
+ if (upwards)
+ std::swap(first, second);
+
+ /* everything after barrier(acquire) happens after the atomics/control_barriers before
+ * everything after load(acquire) happens after the load
+ */
+ if ((first->has_control_barrier || first->access_atomic) && second->bar_acquire)
+ return hazard_fail_barrier;
+ if (((first->access_acquire || first->bar_acquire) && second->bar_classes) ||
+ ((first->access_acquire | first->bar_acquire) & (second->access_relaxed | second->access_atomic)))
+ return hazard_fail_barrier;
+
+ /* everything before barrier(release) happens before the atomics/control_barriers after
+ * everything before store(release) happens before the store
+ */
+ if (first->bar_release && (second->has_control_barrier || second->access_atomic))
return hazard_fail_barrier;
- if (bar && query->barriers && (query->barriers & ~bar))
+ if ((first->bar_classes && (second->bar_release || second->access_release)) ||
+ ((first->access_relaxed | first->access_atomic) & (second->bar_release | second->access_release)))
return hazard_fail_barrier;
- if (query->barriers && (query->barriers & get_barrier_interaction(instr)))
+
+ /* don't move memory barriers around other memory barriers */
+ if (first->bar_classes && second->bar_classes)
return hazard_fail_barrier;
- if (!query->can_reorder_smem && instr->format == Format::SMEM && !can_reorder_candidate)
- return hazard_fail_reorder_vmem_smem;
- if (!query->can_reorder_vmem && (instr->isVMEM() || instr->isFlatOrGlobal()) && !can_reorder_candidate)
+ /* Don't move memory loads/stores to before control barriers. This is to make
+ * memory barriers followed by control barriers work. */
+ if (first->has_control_barrier && (second->access_atomic | second->access_relaxed))
+ return hazard_fail_barrier;
+
+ /* don't move memory loads/stores past potentially aliasing loads/stores */
+ unsigned aliasing_storage = instr->format == Format::SMEM ?
+ query->aliasing_storage_smem :
+ query->aliasing_storage;
+ if ((sync.storage & aliasing_storage) && !(sync.semantics & semantic_can_reorder)) {
+ unsigned intersect = sync.storage & aliasing_storage;
+ if (intersect & storage_shared)
+ return hazard_fail_reorder_ds;
return hazard_fail_reorder_vmem_smem;
- if ((query->barrier_interaction & barrier_shared) && instr->format == Format::DS)
- return hazard_fail_reorder_ds;
- if (is_gs_or_done_sendmsg(instr) && (query->barrier_interaction & get_barrier_interaction(instr)))
- return hazard_fail_reorder_sendmsg;
+ }
if ((instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload) &&
query->contains_spill)
return hazard_fail_spill;
+ if (instr->opcode == aco_opcode::s_sendmsg && query->contains_sendmsg)
+ return hazard_fail_reorder_sendmsg;
+
return hazard_success;
}
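
The upwards flag lets one set of acquire/release rules serve both motion directions: the candidate's event set and the accumulated query set are simply swapped before the checks run. A minimal standalone sketch of that mechanism, reduced to the single atomic-versus-acquire-barrier rule (event_set and acquire_hazard() are simplified stand-ins; the real query applies several more rules):

#include <algorithm>
#include <cassert>

struct event_set { unsigned bar_acquire = 0, access_atomic = 0; };

/* one rule from above: an atomic must not end up on the far side of an acquire barrier */
static bool acquire_hazard(event_set instr, event_set query, bool upwards)
{
   event_set *first = &instr, *second = &query;
   if (upwards)
      std::swap(first, second);
   return first->access_atomic && second->bar_acquire;
}

int main()
{
   event_set atomic;  atomic.access_atomic = 0x1;  /* e.g. a buffer atomic */
   event_set barrier; barrier.bar_acquire = 0x1;   /* p_barrier with acquire over buffer */

   assert(acquire_hazard(atomic, barrier, false)); /* blocked when moving downwards */
   assert(acquire_hazard(barrier, atomic, true));  /* blocked when moving upwards */
   return 0;
}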
bool can_move_down = true;
- HazardResult haz = perform_hazard_query(&hq, candidate.get());
+ HazardResult haz = perform_hazard_query(&hq, candidate.get(), false);
if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill || haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier || haz == hazard_fail_export)
can_move_down = false;
else if (haz != hazard_success)
break;
if (found_dependency) {
- HazardResult haz = perform_hazard_query(&hq, candidate.get());
+ HazardResult haz = perform_hazard_query(&hq, candidate.get(), true);
if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill ||
haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier ||
haz == hazard_fail_export)
/* if current depends on candidate, add additional dependencies and continue */
bool can_move_down = !is_vmem || part_of_clause;
- HazardResult haz = perform_hazard_query(part_of_clause ? &clause_hq : &indep_hq, candidate.get());
+ HazardResult haz = perform_hazard_query(part_of_clause ? &clause_hq : &indep_hq, candidate.get(), false);
if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill ||
haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier ||
haz == hazard_fail_export)
/* check if candidate depends on current */
bool is_dependency = false;
if (found_dependency) {
- HazardResult haz = perform_hazard_query(&indep_hq, candidate.get());
+ HazardResult haz = perform_hazard_query(&indep_hq, candidate.get(), true);
if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill ||
haz == hazard_fail_reorder_vmem_smem || haz == hazard_fail_reorder_sendmsg ||
haz == hazard_fail_barrier || haz == hazard_fail_export)
if (candidate->isVMEM() || candidate->format == Format::SMEM || candidate->isFlatOrGlobal())
break;
- HazardResult haz = perform_hazard_query(&hq, candidate.get());
+ HazardResult haz = perform_hazard_query(&hq, candidate.get(), false);
if (haz == hazard_fail_exec || haz == hazard_fail_unreorderable)
break;
for (unsigned i = 0; i < temp.size(); i++)
split->definitions[i] = bld.def(v1);
bld.insert(split);
- for (unsigned i = 0; i < temp.size(); i++)
- bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset, split->definitions[i].getTemp(), offset + i * 4, false, true);
+ for (unsigned i = 0; i < temp.size(); i++) {
+ Instruction *instr = bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset, split->definitions[i].getTemp(), offset + i * 4, false, true);
+ static_cast<MUBUF_instruction *>(instr)->sync = memory_sync_info(storage_vgpr_spill, semantic_private);
+ }
} else {
- bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset, temp, offset, false, true);
+ Instruction *instr = bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset, temp, offset, false, true);
+ static_cast<MUBUF_instruction *>(instr)->sync = memory_sync_info(storage_vgpr_spill, semantic_private);
}
} else {
ctx.program->config->spilled_sgprs += (*it)->operands[0].size();
for (unsigned i = 0; i < def.size(); i++) {
Temp tmp = bld.tmp(v1);
vec->operands[i] = Operand(tmp);
- bld.mubuf(opcode, Definition(tmp), scratch_rsrc, Operand(v1), scratch_offset, offset + i * 4, false, true);
+ Instruction *instr = bld.mubuf(opcode, Definition(tmp), scratch_rsrc, Operand(v1), scratch_offset, offset + i * 4, false, true);
+ static_cast<MUBUF_instruction *>(instr)->sync = memory_sync_info(storage_vgpr_spill, semantic_private);
}
bld.insert(vec);
} else {
- bld.mubuf(opcode, def, scratch_rsrc, Operand(v1), scratch_offset, offset, false, true);
+ Instruction *instr = bld.mubuf(opcode, def, scratch_rsrc, Operand(v1), scratch_offset, offset, false, true);
+ static_cast<MUBUF_instruction *>(instr)->sync = memory_sync_info(storage_vgpr_spill, semantic_private);
}
} else {
uint32_t spill_slot = slots[spill_id];