assert(exp == unset_counter || exp <= 0x7);
switch (chip) {
case GFX10:
+ case GFX10_3:
assert(lgkm == unset_counter || lgkm <= 0x3f);
assert(vm == unset_counter || vm <= 0x3f);
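+ /* GFX10 splits vmcnt across the immediate: bits [3:0] stay at imm[3:0] and
+  * bits [5:4] move to imm[15:14]; lgkmcnt sits at imm[13:8], expcnt at imm[6:4] */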
imm = ((vm & 0x30) << 10) | ((lgkm & 0x3f) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
bool pending_flat_vm = false;
bool pending_s_buffer_store = false; /* GFX10 workaround */
- wait_imm barrier_imm[barrier_count];
- uint16_t barrier_events[barrier_count] = {}; /* use wait_event notion */
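+ /* per storage class: the wait needed to complete all outstanding accesses of
+  * that class, and which wait_events are still pending for it */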
+ wait_imm barrier_imm[storage_count];
+ uint16_t barrier_events[storage_count] = {}; /* use wait_event notion */
std::map<PhysReg,wait_entry> gpr_map;
max_exp_cnt(6),
max_lgkm_cnt(program_->chip_class >= GFX10 ? 62 : 14),
max_vs_cnt(program_->chip_class >= GFX10 ? 62 : 0),
- unordered_events(event_smem | (program_->chip_class < GFX10 ? event_flat : 0)) {}
+ unordered_events(event_smem | (program_->chip_class < GFX10 ? event_flat : 0)),
+ collect_statistics(program_->collect_statistics) {}
bool join(const wait_ctx* other, bool logical)
{
}
}
- for (unsigned i = 0; i < barrier_count; i++) {
+ for (unsigned i = 0; i < storage_count; i++) {
changed |= barrier_imm[i].combine(other->barrier_imm[i]);
changed |= other->barrier_events[i] & ~barrier_events[i];
barrier_events[i] |= other->barrier_events[i];
return wait_imm();
}
-wait_imm kill(Instruction* instr, wait_ctx& ctx)
+wait_imm perform_barrier(wait_ctx& ctx, memory_sync_info sync, unsigned semantics)
{
wait_imm imm;
+ sync_scope subgroup_scope = ctx.program->workgroup_size <= ctx.program->wave_size ? scope_workgroup : scope_subgroup;
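+ /* if the whole workgroup fits in one wave, workgroup-scope ordering comes for
+  * free, so treat such barriers like subgroup-scope ones and skip the waits */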
+ if ((sync.semantics & semantics) && sync.scope > subgroup_scope) {
+ unsigned storage = sync.storage;
+ while (storage) {
+ unsigned idx = u_bit_scan(&storage);
+
+ /* LDS is private to the workgroup */
+ sync_scope bar_scope_lds = MIN2(sync.scope, scope_workgroup);
+
+ uint16_t events = ctx.barrier_events[idx];
+ if (bar_scope_lds <= subgroup_scope)
+ events &= ~event_lds;
+
+ /* in non-WGP, the L1/L0 cache keeps all memory operations in-order for the same workgroup */
+ if (ctx.chip_class < GFX10 && sync.scope <= scope_workgroup)
+ events &= ~(event_vmem | event_vmem_store | event_smem);
+
+ if (events)
+ imm.combine(ctx.barrier_imm[idx]);
+ }
+ }
+
+ return imm;
+}
+
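+/* used for DEBUG_FORCE_WAITCNT: make the queued s_waitcnt wait for everything
+ * that is still outstanding */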
+void force_waitcnt(wait_ctx& ctx, wait_imm& imm)
+{
+ if (ctx.vm_cnt)
+ imm.vm = 0;
+ if (ctx.exp_cnt)
+ imm.exp = 0;
+ if (ctx.lgkm_cnt)
+ imm.lgkm = 0;
+
+ if (ctx.chip_class >= GFX10) {
+ if (ctx.vs_cnt)
+ imm.vs = 0;
+ }
+}
+
+wait_imm kill(Instruction* instr, wait_ctx& ctx, memory_sync_info sync_info)
+{
+ wait_imm imm;
+
+ if (debug_flags & DEBUG_FORCE_WAITCNT) {
+ /* Force emitting waitcnt states right after the instruction if there is
+ * something to wait for.
+ */
+ force_waitcnt(ctx, imm);
+ }
+
if (ctx.exp_cnt || ctx.vm_cnt || ctx.lgkm_cnt)
imm.combine(check_instr(instr, ctx));
imm.lgkm = 0;
}
- if (ctx.chip_class >= GFX10) {
+ if (ctx.chip_class >= GFX10 && instr->format == Format::SMEM) {
/* GFX10: A store followed by a load at the same address causes a problem because
* the load doesn't load the correct values unless we wait for the store first.
* This is NOT mitigated by an s_nop.
*/
SMEM_instruction *smem = static_cast<SMEM_instruction *>(instr);
if (ctx.pending_s_buffer_store &&
!smem->definitions.empty() &&
- !smem->can_reorder && smem->barrier == barrier_buffer) {
+ !smem->sync.can_reorder()) {
imm.lgkm = 0;
}
}
- if (instr->format == Format::PSEUDO_BARRIER) {
- switch (instr->opcode) {
- case aco_opcode::p_memory_barrier_common:
- imm.combine(ctx.barrier_imm[ffs(barrier_atomic) - 1]);
- imm.combine(ctx.barrier_imm[ffs(barrier_buffer) - 1]);
- imm.combine(ctx.barrier_imm[ffs(barrier_image) - 1]);
- if (ctx.program->workgroup_size > ctx.program->wave_size)
- imm.combine(ctx.barrier_imm[ffs(barrier_shared) - 1]);
- break;
- case aco_opcode::p_memory_barrier_atomic:
- imm.combine(ctx.barrier_imm[ffs(barrier_atomic) - 1]);
- break;
- /* see comment in aco_scheduler.cpp's can_move_instr() on why these barriers are merged */
- case aco_opcode::p_memory_barrier_buffer:
- case aco_opcode::p_memory_barrier_image:
- imm.combine(ctx.barrier_imm[ffs(barrier_buffer) - 1]);
- imm.combine(ctx.barrier_imm[ffs(barrier_image) - 1]);
- break;
- case aco_opcode::p_memory_barrier_shared:
- if (ctx.program->workgroup_size > ctx.program->wave_size)
- imm.combine(ctx.barrier_imm[ffs(barrier_shared) - 1]);
- break;
- case aco_opcode::p_memory_barrier_gs_data:
- imm.combine(ctx.barrier_imm[ffs(barrier_gs_data) - 1]);
- break;
- case aco_opcode::p_memory_barrier_gs_sendmsg:
- imm.combine(ctx.barrier_imm[ffs(barrier_gs_sendmsg) - 1]);
- break;
- default:
- assert(false);
- break;
- }
- }
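+ /* explicit p_barrier instructions synchronize both ways (acquire+release);
+  * any other instruction only releases its own accesses here, its acquire
+  * half is handled after it is emitted in handle_block() */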
+ if (instr->opcode == aco_opcode::p_barrier)
+ imm.combine(perform_barrier(ctx, static_cast<Pseudo_barrier_instruction *>(instr)->sync, semantic_acqrel));
+ else
+ imm.combine(perform_barrier(ctx, sync_info, semantic_release));
if (!imm.empty()) {
if (ctx.pending_flat_vm && imm.vm != wait_imm::unset_counter)
ctx.vs_cnt = std::min(ctx.vs_cnt, imm.vs);
/* update barrier wait imms */
- for (unsigned i = 0; i < barrier_count; i++) {
+ for (unsigned i = 0; i < storage_count; i++) {
wait_imm& bar = ctx.barrier_imm[i];
uint16_t& bar_ev = ctx.barrier_events[i];
if (bar.exp != wait_imm::unset_counter && imm.exp <= bar.exp) {
(*ctr)++;
}
-void update_barrier_imm(wait_ctx& ctx, uint8_t counters, wait_event event, barrier_interaction barrier)
+void update_barrier_imm(wait_ctx& ctx, uint8_t counters, wait_event event, memory_sync_info sync)
{
- for (unsigned i = 0; i < barrier_count; i++) {
+ for (unsigned i = 0; i < storage_count; i++) {
wait_imm& bar = ctx.barrier_imm[i];
uint16_t& bar_ev = ctx.barrier_events[i];
- if (barrier & (1 << i)) {
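+ /* private (scratch) accesses are never visible to other invocations, so
+  * they don't contribute to barrier waits */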
+ if (sync.storage & (1 << i) && !(sync.semantics & semantic_private)) {
bar_ev |= event;
if (counters & counter_lgkm)
bar.lgkm = 0;
}
}
-void update_counters(wait_ctx& ctx, wait_event event, barrier_interaction barrier=barrier_none)
+void update_counters(wait_ctx& ctx, wait_event event, memory_sync_info sync=memory_sync_info())
{
uint8_t counters = get_counters_for_event(event);
if (counters & counter_vs && ctx.vs_cnt <= ctx.max_vs_cnt)
ctx.vs_cnt++;
- update_barrier_imm(ctx, counters, event, barrier);
+ update_barrier_imm(ctx, counters, event, sync);
if (ctx.unordered_events & event)
return;
}
}
-void update_counters_for_flat_load(wait_ctx& ctx, barrier_interaction barrier=barrier_none)
+void update_counters_for_flat_load(wait_ctx& ctx, memory_sync_info sync=memory_sync_info())
{
assert(ctx.chip_class < GFX10);
if (ctx.vm_cnt <= ctx.max_vm_cnt)
ctx.vm_cnt++;
- update_barrier_imm(ctx, counter_vm | counter_lgkm, event_flat, barrier);
+ update_barrier_imm(ctx, counter_vm | counter_lgkm, event_flat, sync);
for (std::pair<PhysReg,wait_entry> e : ctx.gpr_map)
{
break;
}
case Format::FLAT: {
+ FLAT_instruction *flat = static_cast<FLAT_instruction*>(instr);
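+ /* pre-GFX10, a FLAT load can hit either LDS or global memory, so it bumps
+  * both vmcnt and lgkmcnt */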
if (ctx.chip_class < GFX10 && !instr->definitions.empty())
- update_counters_for_flat_load(ctx, barrier_buffer);
+ update_counters_for_flat_load(ctx, flat->sync);
else
- update_counters(ctx, event_flat, barrier_buffer);
+ update_counters(ctx, event_flat, flat->sync);
if (!instr->definitions.empty())
insert_wait_entry(ctx, instr->definitions[0], event_flat);
}
case Format::SMEM: {
SMEM_instruction *smem = static_cast<SMEM_instruction*>(instr);
- update_counters(ctx, event_smem, static_cast<SMEM_instruction*>(instr)->barrier);
+ update_counters(ctx, event_smem, smem->sync);
if (!instr->definitions.empty())
insert_wait_entry(ctx, instr->definitions[0], event_smem);
else if (ctx.chip_class >= GFX10 &&
- !smem->can_reorder &&
- smem->barrier == barrier_buffer)
+ !smem->sync.can_reorder())
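+ /* remembered so kill() can apply the GFX10 SMEM store->load hazard workaround */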
ctx.pending_s_buffer_store = true;
break;
}
case Format::DS: {
- bool gds = static_cast<DS_instruction*>(instr)->gds;
- update_counters(ctx, gds ? event_gds : event_lds, gds ? barrier_none : barrier_shared);
- if (gds)
+ DS_instruction *ds = static_cast<DS_instruction*>(instr);
+ update_counters(ctx, ds->gds ? event_gds : event_lds, ds->sync);
+ if (ds->gds)
update_counters(ctx, event_gds_gpr_lock);
if (!instr->definitions.empty())
- insert_wait_entry(ctx, instr->definitions[0], gds ? event_gds : event_lds);
+ insert_wait_entry(ctx, instr->definitions[0], ds->gds ? event_gds : event_lds);
- if (gds) {
+ if (ds->gds) {
for (const Operand& op : instr->operands)
insert_wait_entry(ctx, op, event_gds_gpr_lock);
insert_wait_entry(ctx, exec, s2, event_gds_gpr_lock, false);
case Format::MIMG:
case Format::GLOBAL: {
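+ /* GFX10 counts VMEM stores separately with vscnt, so definition-less
+  * instructions use event_vmem_store there */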
wait_event ev = !instr->definitions.empty() || ctx.chip_class < GFX10 ? event_vmem : event_vmem_store;
- update_counters(ctx, ev, get_barrier_interaction(instr));
+ update_counters(ctx, ev, get_sync_info(instr));
bool has_sampler = instr->format == Format::MIMG && !instr->operands[1].isUndefined() && instr->operands[1].regClass() == s4;
case Format::SOPP: {
if (instr->opcode == aco_opcode::s_sendmsg ||
instr->opcode == aco_opcode::s_sendmsghalt)
- update_counters(ctx, event_sendmsg, get_barrier_interaction(instr));
+ update_counters(ctx, event_sendmsg);
}
default:
break;
wait_imm queued_imm;
- ctx.collect_statistics = program->collect_statistics;
-
for (aco_ptr<Instruction>& instr : block.instructions) {
bool is_wait = !parse_wait_instr(ctx, instr.get()).empty();
- queued_imm.combine(kill(instr.get(), ctx));
+ memory_sync_info sync_info = get_sync_info(instr.get());
+ queued_imm.combine(kill(instr.get(), ctx, sync_info));
ctx.gen_instr = instr.get();
gen(instr.get(), ctx);
}
new_instructions.emplace_back(std::move(instr));
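+ /* the acquire half of this instruction's sync must wait after the
+  * instruction itself, so it is queued only once the instruction is emitted */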
+ queued_imm.combine(perform_barrier(ctx, sync_info, semantic_acquire));
+
if (ctx.collect_statistics)
ctx.advance_unwaited_instrs();
}