assert(exp == unset_counter || exp <= 0x7);
switch (chip) {
case GFX10:
+ case GFX10_3:
assert(lgkm == unset_counter || lgkm <= 0x3f);
assert(vm == unset_counter || vm <= 0x3f);
imm = ((vm & 0x30) << 10) | ((lgkm & 0x3f) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
{
wait_imm imm;
sync_scope subgroup_scope = ctx.program->workgroup_size <= ctx.program->wave_size ? scope_workgroup : scope_subgroup;
- if (sync.semantics & semantics) {
+ if ((sync.semantics & semantics) && sync.scope > subgroup_scope) {
unsigned storage = sync.storage;
while (storage) {
unsigned idx = u_bit_scan(&storage);
if (bar_scope_lds <= subgroup_scope)
events &= ~event_lds;
+ /* in non-WGP, the L1/L0 cache keeps all memory operations in-order for the same workgroup */
+ if (ctx.chip_class < GFX10 && sync.scope <= scope_workgroup)
+ events &= ~(event_vmem | event_vmem_store | event_smem);
+
if (events)
imm.combine(ctx.barrier_imm[idx]);
}
return imm;
}
+/* Force a wait for all currently outstanding operations: for every counter
+ * class that has work in flight, request zero outstanding operations so that
+ * a s_waitcnt is emitted immediately after the instruction. Only used for
+ * the DEBUG_FORCE_WAITCNT debugging path (see kill()).
+ */
+void force_waitcnt(wait_ctx& ctx, wait_imm& imm)
+{
+ if (ctx.vm_cnt)
+ imm.vm = 0;
+ if (ctx.exp_cnt)
+ imm.exp = 0;
+ if (ctx.lgkm_cnt)
+ imm.lgkm = 0;
+
+ /* The separate store counter (vs/vscnt) only exists on GFX10+, hence the
+  * chip_class guard. */
+ if (ctx.chip_class >= GFX10) {
+ if (ctx.vs_cnt)
+ imm.vs = 0;
+ }
+}
+
wait_imm kill(Instruction* instr, wait_ctx& ctx, memory_sync_info sync_info)
{
wait_imm imm;
+
+ if (debug_flags & DEBUG_FORCE_WAITCNT) {
+ /* Force emitting waitcnt states right after the instruction if there is
+ * something to wait for.
+ */
+ force_waitcnt(ctx, imm);
+ }
+
if (ctx.exp_cnt || ctx.vm_cnt || ctx.lgkm_cnt)
imm.combine(check_instr(instr, ctx));