assert(exp == unset_counter || exp <= 0x7);
switch (chip) {
case GFX10:
+ case GFX10_3:
assert(lgkm == unset_counter || lgkm <= 0x3f);
assert(vm == unset_counter || vm <= 0x3f);
imm = ((vm & 0x30) << 10) | ((lgkm & 0x3f) << 8) | ((exp & 0x7) << 4) | (vm & 0xf);
{
wait_imm imm;
sync_scope subgroup_scope = ctx.program->workgroup_size <= ctx.program->wave_size ? scope_workgroup : scope_subgroup;
- if (sync.semantics & semantics) {
+ if ((sync.semantics & semantics) && sync.scope > subgroup_scope) {
unsigned storage = sync.storage;
while (storage) {
unsigned idx = u_bit_scan(&storage);
if (bar_scope_lds <= subgroup_scope)
events &= ~event_lds;
+ /* in non-WGP, the L1/L0 cache keeps all memory operations in-order for the same workgroup */
+ if (ctx.chip_class < GFX10 && sync.scope <= scope_workgroup)
+ events &= ~(event_vmem | event_vmem_store | event_smem);
+
if (events)
imm.combine(ctx.barrier_imm[idx]);
}
return imm;
}
+/* Force a wait for all currently outstanding operations: for every counter
+ * class that has work in flight, request zero outstanding operations so that
+ * a s_waitcnt is emitted immediately after the instruction. Only used for
+ * the DEBUG_FORCE_WAITCNT debugging path (see kill()).
+ */
+void force_waitcnt(wait_ctx& ctx, wait_imm& imm)
+{
+ if (ctx.vm_cnt)
+ imm.vm = 0;
+ if (ctx.exp_cnt)
+ imm.exp = 0;
+ if (ctx.lgkm_cnt)
+ imm.lgkm = 0;
+
+ /* The separate store counter (vs/vscnt) only exists on GFX10+, hence the
+  * chip_class guard. */
+ if (ctx.chip_class >= GFX10) {
+ if (ctx.vs_cnt)
+ imm.vs = 0;
+ }
+}
+
wait_imm kill(Instruction* instr, wait_ctx& ctx, memory_sync_info sync_info)
{
wait_imm imm;
+
+ if (debug_flags & DEBUG_FORCE_WAITCNT) {
+ /* Force emitting waitcnt states right after the instruction if there is
+ * something to wait for.
+ */
+ force_waitcnt(ctx, imm);
+ }
+
if (ctx.exp_cnt || ctx.vm_cnt || ctx.lgkm_cnt)
imm.combine(check_instr(instr, ctx));