From d1f992f3c2d138faa0c89a2486c4252a06886106 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Fri, 26 Jun 2020 15:54:22 +0100 Subject: [PATCH] aco: rework barriers and replace can_reorder MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit fossil-db (Navi): Totals from 273 (0.21% of 132058) affected shaders: CodeSize: 937472 -> 936556 (-0.10%) Instrs: 158874 -> 158648 (-0.14%) Cycles: 13563516 -> 13562612 (-0.01%) VMEM: 85246 -> 85244 (-0.00%) SMEM: 21407 -> 21310 (-0.45%); split: +0.05%, -0.50% VClause: 9321 -> 9317 (-0.04%) Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_dead_code_analysis.cpp | 2 +- src/amd/compiler/aco_insert_waitcnt.cpp | 116 +++++---- .../compiler/aco_instruction_selection.cpp | 182 +++++++------- src/amd/compiler/aco_ir.cpp | 22 ++ src/amd/compiler/aco_ir.h | 138 ++++++++--- src/amd/compiler/aco_lower_to_hw_instr.cpp | 13 + src/amd/compiler/aco_opcodes.py | 15 +- src/amd/compiler/aco_opt_value_numbering.cpp | 23 +- src/amd/compiler/aco_optimizer.cpp | 3 +- src/amd/compiler/aco_print_ir.cpp | 97 ++++++-- src/amd/compiler/aco_scheduler.cpp | 234 +++++++++--------- src/amd/compiler/aco_spill.cpp | 15 +- 12 files changed, 520 insertions(+), 340 deletions(-) diff --git a/src/amd/compiler/aco_dead_code_analysis.cpp b/src/amd/compiler/aco_dead_code_analysis.cpp index 87b3112dc97..f43d784c55f 100644 --- a/src/amd/compiler/aco_dead_code_analysis.cpp +++ b/src/amd/compiler/aco_dead_code_analysis.cpp @@ -84,7 +84,7 @@ bool is_dead(const std::vector& uses, Instruction *instr) if (std::any_of(instr->definitions.begin(), instr->definitions.end(), [&uses] (const Definition& def) { return uses[def.tempId()];})) return false; - return !instr_info.is_atomic[(int)instr->opcode]; + return !(get_sync_info(instr).semantics & (semantic_volatile | semantic_acqrel)); } std::vector dead_code_analysis(Program *program) { diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp index aedcad1b573..fa2b5234d2e 100644 --- a/src/amd/compiler/aco_insert_waitcnt.cpp +++ b/src/amd/compiler/aco_insert_waitcnt.cpp @@ -273,8 +273,8 @@ struct wait_ctx { bool pending_flat_vm = false; bool pending_s_buffer_store = false; /* GFX10 workaround */ - wait_imm barrier_imm[barrier_count]; - uint16_t barrier_events[barrier_count] = {}; /* use wait_event notion */ + wait_imm barrier_imm[storage_count]; + uint16_t barrier_events[storage_count] = {}; /* use wait_event notion */ std::map gpr_map; @@ -327,7 +327,7 @@ struct wait_ctx { } } - for (unsigned i = 0; i < barrier_count; i++) { + for (unsigned i = 0; i < storage_count; i++) { changed |= barrier_imm[i].combine(other->barrier_imm[i]); changed |= other->barrier_events[i] & ~barrier_events[i]; barrier_events[i] |= other->barrier_events[i]; @@ -444,7 +444,31 @@ wait_imm parse_wait_instr(wait_ctx& ctx, Instruction *instr) return wait_imm(); } -wait_imm kill(Instruction* instr, wait_ctx& ctx) +wait_imm perform_barrier(wait_ctx& ctx, memory_sync_info sync, unsigned semantics) +{ + wait_imm imm; + sync_scope subgroup_scope = ctx.program->workgroup_size <= ctx.program->wave_size ? 
scope_workgroup : scope_subgroup; + if (sync.semantics & semantics) { + unsigned storage = sync.storage; + while (storage) { + unsigned idx = u_bit_scan(&storage); + + /* LDS is private to the workgroup */ + sync_scope bar_scope_lds = MIN2(sync.scope, scope_workgroup); + + uint16_t events = ctx.barrier_events[idx]; + if (bar_scope_lds <= subgroup_scope) + events &= ~event_lds; + + if (events) + imm.combine(ctx.barrier_imm[idx]); + } + } + + return imm; +} + +wait_imm kill(Instruction* instr, wait_ctx& ctx, memory_sync_info sync_info) { wait_imm imm; if (ctx.exp_cnt || ctx.vm_cnt || ctx.lgkm_cnt) @@ -471,44 +495,15 @@ wait_imm kill(Instruction* instr, wait_ctx& ctx) SMEM_instruction *smem = static_cast(instr); if (ctx.pending_s_buffer_store && !smem->definitions.empty() && - !smem->can_reorder && smem->barrier == barrier_buffer) { + !smem->sync.can_reorder()) { imm.lgkm = 0; } } - if (instr->format == Format::PSEUDO_BARRIER) { - switch (instr->opcode) { - case aco_opcode::p_memory_barrier_common: - imm.combine(ctx.barrier_imm[ffs(barrier_atomic) - 1]); - imm.combine(ctx.barrier_imm[ffs(barrier_buffer) - 1]); - imm.combine(ctx.barrier_imm[ffs(barrier_image) - 1]); - if (ctx.program->workgroup_size > ctx.program->wave_size) - imm.combine(ctx.barrier_imm[ffs(barrier_shared) - 1]); - break; - case aco_opcode::p_memory_barrier_atomic: - imm.combine(ctx.barrier_imm[ffs(barrier_atomic) - 1]); - break; - /* see comment in aco_scheduler.cpp's can_move_instr() on why these barriers are merged */ - case aco_opcode::p_memory_barrier_buffer: - case aco_opcode::p_memory_barrier_image: - imm.combine(ctx.barrier_imm[ffs(barrier_buffer) - 1]); - imm.combine(ctx.barrier_imm[ffs(barrier_image) - 1]); - break; - case aco_opcode::p_memory_barrier_shared: - if (ctx.program->workgroup_size > ctx.program->wave_size) - imm.combine(ctx.barrier_imm[ffs(barrier_shared) - 1]); - break; - case aco_opcode::p_memory_barrier_gs_data: - imm.combine(ctx.barrier_imm[ffs(barrier_gs_data) - 1]); - break; - case aco_opcode::p_memory_barrier_gs_sendmsg: - imm.combine(ctx.barrier_imm[ffs(barrier_gs_sendmsg) - 1]); - break; - default: - assert(false); - break; - } - } + if (instr->opcode == aco_opcode::p_barrier) + imm.combine(perform_barrier(ctx, static_cast(instr)->sync, semantic_acqrel)); + else + imm.combine(perform_barrier(ctx, sync_info, semantic_release)); if (!imm.empty()) { if (ctx.pending_flat_vm && imm.vm != wait_imm::unset_counter) @@ -523,7 +518,7 @@ wait_imm kill(Instruction* instr, wait_ctx& ctx) ctx.vs_cnt = std::min(ctx.vs_cnt, imm.vs); /* update barrier wait imms */ - for (unsigned i = 0; i < barrier_count; i++) { + for (unsigned i = 0; i < storage_count; i++) { wait_imm& bar = ctx.barrier_imm[i]; uint16_t& bar_ev = ctx.barrier_events[i]; if (bar.exp != wait_imm::unset_counter && imm.exp <= bar.exp) { @@ -581,12 +576,12 @@ void update_barrier_counter(uint8_t *ctr, unsigned max) (*ctr)++; } -void update_barrier_imm(wait_ctx& ctx, uint8_t counters, wait_event event, barrier_interaction barrier) +void update_barrier_imm(wait_ctx& ctx, uint8_t counters, wait_event event, memory_sync_info sync) { - for (unsigned i = 0; i < barrier_count; i++) { + for (unsigned i = 0; i < storage_count; i++) { wait_imm& bar = ctx.barrier_imm[i]; uint16_t& bar_ev = ctx.barrier_events[i]; - if (barrier & (1 << i)) { + if (sync.storage & (1 << i) && !(sync.semantics & semantic_private)) { bar_ev |= event; if (counters & counter_lgkm) bar.lgkm = 0; @@ -609,7 +604,7 @@ void update_barrier_imm(wait_ctx& ctx, uint8_t counters, wait_event 
event, barri } } -void update_counters(wait_ctx& ctx, wait_event event, barrier_interaction barrier=barrier_none) +void update_counters(wait_ctx& ctx, wait_event event, memory_sync_info sync=memory_sync_info()) { uint8_t counters = get_counters_for_event(event); @@ -622,7 +617,7 @@ void update_counters(wait_ctx& ctx, wait_event event, barrier_interaction barrie if (counters & counter_vs && ctx.vs_cnt <= ctx.max_vs_cnt) ctx.vs_cnt++; - update_barrier_imm(ctx, counters, event, barrier); + update_barrier_imm(ctx, counters, event, sync); if (ctx.unordered_events & event) return; @@ -651,7 +646,7 @@ void update_counters(wait_ctx& ctx, wait_event event, barrier_interaction barrie } } -void update_counters_for_flat_load(wait_ctx& ctx, barrier_interaction barrier=barrier_none) +void update_counters_for_flat_load(wait_ctx& ctx, memory_sync_info sync=memory_sync_info()) { assert(ctx.chip_class < GFX10); @@ -660,7 +655,7 @@ void update_counters_for_flat_load(wait_ctx& ctx, barrier_interaction barrier=ba if (ctx.vm_cnt <= ctx.max_vm_cnt) ctx.vm_cnt++; - update_barrier_imm(ctx, counter_vm | counter_lgkm, event_flat, barrier); + update_barrier_imm(ctx, counter_vm | counter_lgkm, event_flat, sync); for (std::pair e : ctx.gpr_map) { @@ -748,10 +743,11 @@ void gen(Instruction* instr, wait_ctx& ctx) break; } case Format::FLAT: { + FLAT_instruction *flat = static_cast(instr); if (ctx.chip_class < GFX10 && !instr->definitions.empty()) - update_counters_for_flat_load(ctx, barrier_buffer); + update_counters_for_flat_load(ctx, flat->sync); else - update_counters(ctx, event_flat, barrier_buffer); + update_counters(ctx, event_flat, flat->sync); if (!instr->definitions.empty()) insert_wait_entry(ctx, instr->definitions[0], event_flat); @@ -759,27 +755,26 @@ void gen(Instruction* instr, wait_ctx& ctx) } case Format::SMEM: { SMEM_instruction *smem = static_cast(instr); - update_counters(ctx, event_smem, static_cast(instr)->barrier); + update_counters(ctx, event_smem, smem->sync); if (!instr->definitions.empty()) insert_wait_entry(ctx, instr->definitions[0], event_smem); else if (ctx.chip_class >= GFX10 && - !smem->can_reorder && - smem->barrier == barrier_buffer) + !smem->sync.can_reorder()) ctx.pending_s_buffer_store = true; break; } case Format::DS: { - bool gds = static_cast(instr)->gds; - update_counters(ctx, gds ? event_gds : event_lds, gds ? barrier_none : barrier_shared); - if (gds) + DS_instruction *ds = static_cast(instr); + update_counters(ctx, ds->gds ? event_gds : event_lds, ds->sync); + if (ds->gds) update_counters(ctx, event_gds_gpr_lock); if (!instr->definitions.empty()) - insert_wait_entry(ctx, instr->definitions[0], gds ? event_gds : event_lds); + insert_wait_entry(ctx, instr->definitions[0], ds->gds ? event_gds : event_lds); - if (gds) { + if (ds->gds) { for (const Operand& op : instr->operands) insert_wait_entry(ctx, op, event_gds_gpr_lock); insert_wait_entry(ctx, exec, s2, event_gds_gpr_lock, false); @@ -791,7 +786,7 @@ void gen(Instruction* instr, wait_ctx& ctx) case Format::MIMG: case Format::GLOBAL: { wait_event ev = !instr->definitions.empty() || ctx.chip_class < GFX10 ? 
event_vmem : event_vmem_store; - update_counters(ctx, ev, get_barrier_interaction(instr)); + update_counters(ctx, ev, get_sync_info(instr)); bool has_sampler = instr->format == Format::MIMG && !instr->operands[1].isUndefined() && instr->operands[1].regClass() == s4; @@ -817,7 +812,7 @@ void gen(Instruction* instr, wait_ctx& ctx) case Format::SOPP: { if (instr->opcode == aco_opcode::s_sendmsg || instr->opcode == aco_opcode::s_sendmsghalt) - update_counters(ctx, event_sendmsg, get_barrier_interaction(instr)); + update_counters(ctx, event_sendmsg); } default: break; @@ -851,7 +846,8 @@ void handle_block(Program *program, Block& block, wait_ctx& ctx) for (aco_ptr& instr : block.instructions) { bool is_wait = !parse_wait_instr(ctx, instr.get()).empty(); - queued_imm.combine(kill(instr.get(), ctx)); + memory_sync_info sync_info = get_sync_info(instr.get()); + queued_imm.combine(kill(instr.get(), ctx, sync_info)); ctx.gen_instr = instr.get(); gen(instr.get(), ctx); @@ -863,6 +859,8 @@ void handle_block(Program *program, Block& block, wait_ctx& ctx) } new_instructions.emplace_back(std::move(instr)); + queued_imm.combine(perform_barrier(ctx, sync_info, semantic_acquire)); + if (ctx.collect_statistics) ctx.advance_unwaited_instrs(); } diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 80ea1e133d6..81c50ebbc9c 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -3137,8 +3137,7 @@ struct LoadEmitInfo { bool glc = false; unsigned swizzle_component_size = 0; - barrier_interaction barrier = barrier_none; - bool can_reorder = true; + memory_sync_info sync; Temp soffset = Temp(0, s1); }; @@ -3441,10 +3440,12 @@ Temp lds_load_callback(Builder& bld, const LoadEmitInfo *info, RegClass rc = RegClass(RegType::vgpr, DIV_ROUND_UP(size, 4)); Temp val = rc == info->dst.regClass() && dst_hint.id() ? 
dst_hint : bld.tmp(rc); + Instruction *instr; if (read2) - bld.ds(op, Definition(val), offset, m, const_offset, const_offset + 1); + instr = bld.ds(op, Definition(val), offset, m, const_offset, const_offset + 1); else - bld.ds(op, Definition(val), offset, m, const_offset); + instr = bld.ds(op, Definition(val), offset, m, const_offset); + static_cast(instr)->sync = info->sync; if (size < 4) val = bld.pseudo(aco_opcode::p_extract_vector, bld.def(RegClass::get(RegType::vgpr, size)), val, Operand(0u)); @@ -3490,8 +3491,7 @@ Temp smem_load_callback(Builder& bld, const LoadEmitInfo *info, load->definitions[0] = Definition(val); load->glc = info->glc; load->dlc = info->glc && bld.program->chip_class >= GFX10; - load->barrier = info->barrier; - load->can_reorder = false; // FIXME: currently, it doesn't seem beneficial due to how our scheduler works + load->sync = info->sync; bld.insert(std::move(load)); return val; } @@ -3540,8 +3540,7 @@ Temp mubuf_load_callback(Builder& bld, const LoadEmitInfo *info, mubuf->offen = (offset.type() == RegType::vgpr); mubuf->glc = info->glc; mubuf->dlc = info->glc && bld.program->chip_class >= GFX10; - mubuf->barrier = info->barrier; - mubuf->can_reorder = info->can_reorder; + mubuf->sync = info->sync; mubuf->offset = const_offset; mubuf->swizzled = info->swizzle_component_size != 0; RegClass rc = RegClass::get(RegType::vgpr, bytes_size); @@ -3605,7 +3604,7 @@ Temp global_load_callback(Builder& bld, const LoadEmitInfo *info, mubuf->offset = 0; mubuf->addr64 = offset.type() == RegType::vgpr; mubuf->disable_wqm = false; - mubuf->barrier = info->barrier; + mubuf->sync = info->sync; mubuf->definitions[0] = Definition(val); bld.insert(std::move(mubuf)); } else { @@ -3616,7 +3615,7 @@ Temp global_load_callback(Builder& bld, const LoadEmitInfo *info, flat->operands[1] = Operand(s1); flat->glc = info->glc; flat->dlc = info->glc && bld.program->chip_class >= GFX10; - flat->barrier = info->barrier; + flat->sync = info->sync; flat->offset = 0u; flat->definitions[0] = Definition(val); bld.insert(std::move(flat)); @@ -3638,8 +3637,7 @@ Temp load_lds(isel_context *ctx, unsigned elem_size_bytes, Temp dst, LoadEmitInfo info = {Operand(as_vgpr(ctx, address)), dst, num_components, elem_size_bytes}; info.align_mul = align; info.align_offset = 0; - info.barrier = barrier_shared; - info.can_reorder = false; + info.sync = memory_sync_info(storage_shared); info.const_offset = base_offset; emit_lds_load(ctx, bld, &info); @@ -3848,13 +3846,16 @@ void store_lds(isel_context *ctx, unsigned elem_size_bytes, Temp data, uint32_t } assert(inline_offset <= max_offset); /* offsets[i] shouldn't be large enough for this to happen */ + Instruction *instr; if (write2) { Temp second_data = write_datas[second]; inline_offset /= data.bytes(); - bld.ds(op, address_offset, data, second_data, m, inline_offset, inline_offset + write2_off); + instr = bld.ds(op, address_offset, data, second_data, m, inline_offset, inline_offset + write2_off); } else { - bld.ds(op, address_offset, data, m, inline_offset); + instr = bld.ds(op, address_offset, data, m, inline_offset); } + static_cast(instr)->sync = + memory_sync_info(storage_shared); } } @@ -4017,7 +4018,8 @@ void emit_single_mubuf_store(isel_context *ctx, Temp descriptor, Temp voffset, T /* idxen*/ false, /* addr64 */ false, /* disable_wqm */ false, /* glc */ true, /* dlc*/ false, /* slc */ slc); - static_cast(r.instr)->can_reorder = allow_reorder; + if (!allow_reorder) + static_cast(r.instr)->sync = memory_sync_info(storage_buffer, semantic_private); } void 
store_vmem_mubuf(isel_context *ctx, Temp src, Temp descriptor, Temp voffset, Temp soffset, @@ -4847,15 +4849,13 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) } if (use_mubuf) { - Instruction *mubuf = bld.mubuf(opcode, - Definition(fetch_dst), list, fetch_index, soffset, - fetch_offset, false, false, true).instr; - static_cast(mubuf)->can_reorder = true; + bld.mubuf(opcode, + Definition(fetch_dst), list, fetch_index, soffset, + fetch_offset, false, false, true).instr; } else { - Instruction *mtbuf = bld.mtbuf(opcode, - Definition(fetch_dst), list, fetch_index, soffset, - fetch_dfmt, nfmt, fetch_offset, false, true).instr; - static_cast(mtbuf)->can_reorder = true; + bld.mtbuf(opcode, + Definition(fetch_dst), list, fetch_index, soffset, + fetch_dfmt, nfmt, fetch_offset, false, true).instr; } emit_split_vector(ctx, fetch_dst, fetch_dst.size()); @@ -5208,7 +5208,7 @@ void visit_load_resource(isel_context *ctx, nir_intrinsic_instr *instr) void load_buffer(isel_context *ctx, unsigned num_components, unsigned component_size, Temp dst, Temp rsrc, Temp offset, unsigned align_mul, unsigned align_offset, - bool glc=false, bool readonly=true, bool allow_smem=true) + bool glc=false, bool allow_smem=true, memory_sync_info sync=memory_sync_info()) { Builder bld(ctx->program, ctx->block); @@ -5218,8 +5218,7 @@ void load_buffer(isel_context *ctx, unsigned num_components, unsigned component_ LoadEmitInfo info = {Operand(offset), dst, num_components, component_size, rsrc}; info.glc = glc; - info.barrier = readonly ? barrier_none : barrier_buffer; - info.can_reorder = readonly; + info.sync = sync; info.align_mul = align_mul; info.align_offset = align_offset; if (use_smem) @@ -5737,7 +5736,6 @@ static Temp adjust_sample_index_using_fmask(isel_context *ctx, bool da, std::vec load->unrm = true; load->da = da; load->dim = dim; - load->can_reorder = true; /* fmask images shouldn't be modified */ ctx->block->instructions.emplace_back(std::move(load)); Operand sample_index4; @@ -5837,6 +5835,22 @@ static Temp get_image_coords(isel_context *ctx, const nir_intrinsic_instr *instr } +memory_sync_info get_memory_sync_info(nir_intrinsic_instr *instr, storage_class storage, unsigned semantics) +{ + /* atomicrmw might not have NIR_INTRINSIC_ACCESS and there's nothing interesting there anyway */ + if (semantics & semantic_atomicrmw) + return memory_sync_info(storage, semantics); + + unsigned access = nir_intrinsic_access(instr); + + if (access & ACCESS_VOLATILE) + semantics |= semantic_volatile; + if (access & ACCESS_CAN_REORDER) + semantics |= semantic_can_reorder | semantic_private; + + return memory_sync_info(storage, semantics); +} + void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr) { Builder bld(ctx->program, ctx->block); @@ -5846,6 +5860,8 @@ void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr) bool is_array = glsl_sampler_type_is_array(type); Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0); + if (dim == GLSL_SAMPLER_DIM_BUF) { unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa); unsigned num_channels = util_last_bit(mask); @@ -5882,7 +5898,7 @@ void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr) load->idxen = true; load->glc = var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT); load->dlc = load->glc && ctx->options->chip_class >= GFX10; - load->barrier = barrier_image; + load->sync = sync; ctx->block->instructions.emplace_back(std::move(load)); 
expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, (1 << num_channels) - 1); @@ -5914,7 +5930,7 @@ void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr) load->dmask = dmask; load->unrm = true; load->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type)); - load->barrier = barrier_image; + load->sync = sync; ctx->block->instructions.emplace_back(std::move(load)); expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, dmask); @@ -5929,6 +5945,7 @@ void visit_image_store(isel_context *ctx, nir_intrinsic_instr *instr) bool is_array = glsl_sampler_type_is_array(type); Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa)); + memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0); bool glc = ctx->options->chip_class == GFX6 || var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE) ? 1 : 0; if (dim == GLSL_SAMPLER_DIM_BUF) { @@ -5960,7 +5977,7 @@ void visit_image_store(isel_context *ctx, nir_intrinsic_instr *instr) store->glc = glc; store->dlc = false; store->disable_wqm = true; - store->barrier = barrier_image; + store->sync = sync; ctx->program->needs_exact = true; ctx->block->instructions.emplace_back(std::move(store)); return; @@ -5984,7 +6001,7 @@ void visit_image_store(isel_context *ctx, nir_intrinsic_instr *instr) store->unrm = true; store->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type)); store->disable_wqm = true; - store->barrier = barrier_image; + store->sync = sync; ctx->program->needs_exact = true; ctx->block->instructions.emplace_back(std::move(store)); return; @@ -6062,6 +6079,7 @@ void visit_image_atomic(isel_context *ctx, nir_intrinsic_instr *instr) } Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + memory_sync_info sync = get_memory_sync_info(instr, storage_image, semantic_atomicrmw); if (dim == GLSL_SAMPLER_DIM_BUF) { Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1); @@ -6079,7 +6097,7 @@ void visit_image_atomic(isel_context *ctx, nir_intrinsic_instr *instr) mubuf->glc = return_previous; mubuf->dlc = false; /* Not needed for atomics */ mubuf->disable_wqm = true; - mubuf->barrier = barrier_image; + mubuf->sync = sync; ctx->program->needs_exact = true; ctx->block->instructions.emplace_back(std::move(mubuf)); return; @@ -6100,7 +6118,7 @@ void visit_image_atomic(isel_context *ctx, nir_intrinsic_instr *instr) mimg->unrm = true; mimg->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type)); mimg->disable_wqm = true; - mimg->barrier = barrier_image; + mimg->sync = sync; ctx->program->needs_exact = true; ctx->block->instructions.emplace_back(std::move(mimg)); return; @@ -6164,7 +6182,6 @@ void visit_image_size(isel_context *ctx, nir_intrinsic_instr *instr) mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array); mimg->dmask = (1 << instr->dest.ssa.num_components) - 1; mimg->da = glsl_sampler_type_is_array(type); - mimg->can_reorder = true; Definition& def = mimg->definitions[0]; ctx->block->instructions.emplace_back(std::move(mimg)); @@ -6219,7 +6236,8 @@ void visit_load_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) allow_smem |= ((access & ACCESS_RESTRICT) && (access & ACCESS_NON_WRITEABLE)) || (access & ACCESS_CAN_REORDER); load_buffer(ctx, num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa), - nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), glc, false, allow_smem); + nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), glc, allow_smem, + 
get_memory_sync_info(instr, storage_buffer, 0)); } void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) @@ -6233,6 +6251,7 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[1].ssa)); rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u)); + memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0); bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE); uint32_t flags = get_all_buffer_resource_flags(ctx, instr->src[1].ssa, nir_intrinsic_access(instr)); /* GLC bypasses VMEM/SMEM caches, so GLC SMEM loads/stores are coherent with GLC VMEM loads/stores @@ -6275,7 +6294,7 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) store->glc = glc; store->dlc = false; store->disable_wqm = true; - store->barrier = barrier_buffer; + store->sync = sync; ctx->block->instructions.emplace_back(std::move(store)); ctx->program->wb_smem_l1_on_end = true; if (op == aco_opcode::p_fs_buffer_store_smem) { @@ -6293,7 +6312,7 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) store->glc = glc; store->dlc = false; store->disable_wqm = true; - store->barrier = barrier_buffer; + store->sync = sync; ctx->program->needs_exact = true; ctx->block->instructions.emplace_back(std::move(store)); } @@ -6384,7 +6403,7 @@ void visit_atomic_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) mubuf->glc = return_previous; mubuf->dlc = false; /* Not needed for atomics */ mubuf->disable_wqm = true; - mubuf->barrier = barrier_buffer; + mubuf->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw); ctx->program->needs_exact = true; ctx->block->instructions.emplace_back(std::move(mubuf)); } @@ -6409,8 +6428,7 @@ void visit_load_global(isel_context *ctx, nir_intrinsic_instr *instr) info.glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT); info.align_mul = nir_intrinsic_align_mul(instr); info.align_offset = nir_intrinsic_align_offset(instr); - info.barrier = barrier_buffer; - info.can_reorder = false; + info.sync = get_memory_sync_info(instr, storage_buffer, 0); /* VMEM stores don't update the SMEM cache and it's difficult to prove that * it's safe to use SMEM */ bool can_use_smem = nir_intrinsic_access(instr) & ACCESS_NON_WRITEABLE; @@ -6430,6 +6448,7 @@ void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr) Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); Temp addr = get_ssa_temp(ctx, instr->src[1].ssa); + memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0); bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE); if (ctx->options->chip_class >= GFX7) @@ -6495,7 +6514,7 @@ void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr) flat->dlc = false; flat->offset = offset; flat->disable_wqm = true; - flat->barrier = barrier_buffer; + flat->sync = sync; ctx->program->needs_exact = true; ctx->block->instructions.emplace_back(std::move(flat)); } else { @@ -6515,7 +6534,7 @@ void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr) mubuf->offset = offsets[i]; mubuf->addr64 = addr.type() == RegType::vgpr; mubuf->disable_wqm = true; - mubuf->barrier = barrier_buffer; + mubuf->sync = sync; ctx->program->needs_exact = true; ctx->block->instructions.emplace_back(std::move(mubuf)); } @@ -6608,7 +6627,7 @@ void visit_global_atomic(isel_context *ctx, nir_intrinsic_instr 
*instr) flat->dlc = false; /* Not needed for atomics */ flat->offset = 0; flat->disable_wqm = true; - flat->barrier = barrier_buffer; + flat->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw); ctx->program->needs_exact = true; ctx->block->instructions.emplace_back(std::move(flat)); } else { @@ -6675,7 +6694,7 @@ void visit_global_atomic(isel_context *ctx, nir_intrinsic_instr *instr) mubuf->offset = 0; mubuf->addr64 = addr.type() == RegType::vgpr; mubuf->disable_wqm = true; - mubuf->barrier = barrier_buffer; + mubuf->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw); ctx->program->needs_exact = true; ctx->block->instructions.emplace_back(std::move(mubuf)); } @@ -6683,20 +6702,30 @@ void visit_global_atomic(isel_context *ctx, nir_intrinsic_instr *instr) void emit_memory_barrier(isel_context *ctx, nir_intrinsic_instr *instr) { Builder bld(ctx->program, ctx->block); + storage_class all_mem = (storage_class)(storage_buffer | storage_image | storage_atomic_counter | storage_shared); switch(instr->intrinsic) { case nir_intrinsic_group_memory_barrier: + bld.barrier(aco_opcode::p_barrier, + memory_sync_info(all_mem, semantic_acqrel, scope_workgroup)); + break; case nir_intrinsic_memory_barrier: - bld.barrier(aco_opcode::p_memory_barrier_common); + bld.barrier(aco_opcode::p_barrier, + memory_sync_info(all_mem, semantic_acqrel, scope_device)); break; case nir_intrinsic_memory_barrier_buffer: - bld.barrier(aco_opcode::p_memory_barrier_buffer); - break; case nir_intrinsic_memory_barrier_image: - bld.barrier(aco_opcode::p_memory_barrier_image); + /* since NIR splits barriers, we have to unify buffer and image barriers + * for now so dEQP-VK.memory_model.message_passing.core11.u32.coherent. + * fence_fence.atomicwrite.device.payload_nonlocal.buffer.guard_nonlocal.image.comp + * passes + */ + bld.barrier(aco_opcode::p_barrier, + memory_sync_info((storage_class)(storage_buffer | storage_image), semantic_acqrel, scope_device)); break; case nir_intrinsic_memory_barrier_tcs_patch: case nir_intrinsic_memory_barrier_shared: - bld.barrier(aco_opcode::p_memory_barrier_shared); + bld.barrier(aco_opcode::p_barrier, + memory_sync_info(storage_shared, semantic_acqrel, scope_workgroup)); break; default: unreachable("Unimplemented memory barrier intrinsic"); @@ -6844,6 +6873,7 @@ void visit_shared_atomic(isel_context *ctx, nir_intrinsic_instr *instr) ds->offset0 = offset; if (return_previous) ds->definitions[0] = Definition(get_ssa_temp(ctx, &instr->dest.ssa)); + ds->sync = memory_sync_info(storage_shared, semantic_atomicrmw); ctx->block->instructions.emplace_back(std::move(ds)); } @@ -6884,7 +6914,7 @@ void visit_load_scratch(isel_context *ctx, nir_intrinsic_instr *instr) { info.align_mul = nir_intrinsic_align_mul(instr); info.align_offset = nir_intrinsic_align_offset(instr); info.swizzle_component_size = ctx->program->chip_class <= GFX8 ? 
4 : 0; - info.can_reorder = false; + info.sync = memory_sync_info(storage_buffer, semantic_private); info.soffset = ctx->program->scratch_offset; emit_scratch_load(ctx, bld, &info); } @@ -6907,7 +6937,8 @@ void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) { for (unsigned i = 0; i < write_count; i++) { aco_opcode op = get_buffer_store_op(false, write_datas[i].bytes()); - bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i], offsets[i], true, true); + Instruction *instr = bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i], offsets[i], true, true); + static_cast(instr)->sync = memory_sync_info(storage_buffer, semantic_private); } } @@ -7021,8 +7052,7 @@ void visit_emit_vertex_with_counter(isel_context *ctx, nir_intrinsic_instr *inst mtbuf->offset = const_offset; mtbuf->glc = true; mtbuf->slc = true; - mtbuf->barrier = barrier_gs_data; - mtbuf->can_reorder = true; + mtbuf->sync = memory_sync_info(storage_vmem_output, semantic_can_reorder); bld.insert(std::move(mtbuf)); } @@ -7347,8 +7377,6 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) load->glc = false; load->dlc = false; load->disable_wqm = false; - load->barrier = barrier_none; - load->can_reorder = true; ctx->block->instructions.emplace_back(std::move(load)); } @@ -7531,17 +7559,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) visit_get_buffer_size(ctx, instr); break; case nir_intrinsic_control_barrier: { - if (ctx->program->chip_class == GFX6 && ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) { - /* GFX6 only (thanks to a hw bug workaround): - * The real barrier instruction isn’t needed, because an entire patch - * always fits into a single wave. - */ - break; - } - - if (ctx->program->workgroup_size > ctx->program->wave_size) - bld.sopp(aco_opcode::s_barrier); - + bld.barrier(aco_opcode::p_barrier, memory_sync_info(0, 0, scope_invocation), scope_workgroup); break; } case nir_intrinsic_memory_barrier_tcs_patch: @@ -8093,7 +8111,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) aco_opcode opcode = nir_intrinsic_memory_scope(instr) == NIR_SCOPE_DEVICE ? 
aco_opcode::s_memrealtime : aco_opcode::s_memtime; - bld.smem(opcode, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), false); + bld.smem(opcode, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), memory_sync_info(0, semantic_volatile)); emit_split_vector(ctx, get_ssa_temp(ctx, &instr->dest.ssa), 2); break; } @@ -8674,7 +8692,6 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) tex->da = da; tex->definitions[0] = Definition(tmp_dst); tex->dim = dim; - tex->can_reorder = true; ctx->block->instructions.emplace_back(std::move(tex)); if (div_by_6) { @@ -8707,7 +8724,6 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) tex->da = da; Temp size = bld.tmp(v2); tex->definitions[0] = Definition(size); - tex->can_reorder = true; ctx->block->instructions.emplace_back(std::move(tex)); emit_split_vector(ctx, size, size.size()); @@ -8809,7 +8825,6 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) mubuf->operands[2] = Operand((uint32_t) 0); mubuf->definitions[0] = Definition(tmp_dst); mubuf->idxen = true; - mubuf->can_reorder = true; ctx->block->instructions.emplace_back(std::move(mubuf)); expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, (1 << last_bit) - 1); @@ -8858,7 +8873,6 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) tex->unrm = true; tex->da = da; tex->definitions[0] = Definition(tmp_dst); - tex->can_reorder = true; ctx->block->instructions.emplace_back(std::move(tex)); if (instr->op == nir_texop_samples_identical) { @@ -9002,7 +9016,6 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) tex->dmask = dmask; tex->da = da; tex->definitions[0] = Definition(tmp_dst); - tex->can_reorder = true; ctx->block->instructions.emplace_back(std::move(tex)); if (tg4_integer_cube_workaround) { @@ -10285,6 +10298,13 @@ static void create_fs_exports(isel_context *ctx) create_null_export(ctx); } +static void create_workgroup_barrier(Builder& bld) +{ + bld.barrier(aco_opcode::p_barrier, + memory_sync_info(storage_shared, semantic_acqrel, scope_workgroup), + scope_workgroup); +} + static void write_tcs_tess_factors(isel_context *ctx) { unsigned outer_comps; @@ -10309,9 +10329,7 @@ static void write_tcs_tess_factors(isel_context *ctx) Builder bld(ctx->program, ctx->block); - bld.barrier(aco_opcode::p_memory_barrier_shared); - if (unlikely(ctx->program->chip_class != GFX6 && ctx->program->workgroup_size > ctx->program->wave_size)) - bld.sopp(aco_opcode::s_barrier); + create_workgroup_barrier(bld); Temp tcs_rel_ids = get_arg(ctx, ctx->args->ac.tcs_rel_ids); Temp invocation_id = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), tcs_rel_ids, Operand(8u), Operand(5u)); @@ -10470,7 +10488,6 @@ static void emit_stream_output(isel_context *ctx, store->glc = true; store->dlc = false; store->slc = true; - store->can_reorder = true; ctx->block->instructions.emplace_back(std::move(store)); } } @@ -10890,8 +10907,7 @@ void ngg_emit_nogs_output(isel_context *ctx) if (ctx->stage == ngg_vertex_gs) { /* Wait for GS threads to store primitive ID in LDS. */ - bld.barrier(aco_opcode::p_memory_barrier_shared); - bld.sopp(aco_opcode::s_barrier); + create_workgroup_barrier(bld); /* Calculate LDS address where the GS threads stored the primitive ID. 
*/ Temp wave_id_in_tg = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), @@ -10975,8 +10991,7 @@ void select_program(Program *program, if (i) { Builder bld(ctx.program, ctx.block); - bld.barrier(aco_opcode::p_memory_barrier_shared); - bld.sopp(aco_opcode::s_barrier); + create_workgroup_barrier(bld); if (ctx.stage == vertex_geometry_gs || ctx.stage == tess_eval_geometry_gs) { ctx.gs_wave_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, m0), bld.def(s1, scc), get_arg(&ctx, args->merged_wave_info), Operand((8u << 16) | 16u)); @@ -10999,7 +11014,8 @@ void select_program(Program *program, ngg_emit_nogs_output(&ctx); } else if (nir->info.stage == MESA_SHADER_GEOMETRY) { Builder bld(ctx.program, ctx.block); - bld.barrier(aco_opcode::p_memory_barrier_gs_data); + bld.barrier(aco_opcode::p_barrier, + memory_sync_info(storage_vmem_output, semantic_release, scope_device)); bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx.gs_wave_id), -1, sendmsg_gs_done(false, false, 0)); } else if (nir->info.stage == MESA_SHADER_TESS_CTRL) { write_tcs_tess_factors(&ctx); @@ -11031,7 +11047,7 @@ void select_program(Program *program, ctx.block->kind |= block_kind_uniform; Builder bld(ctx.program, ctx.block); if (ctx.program->wb_smem_l1_on_end) - bld.smem(aco_opcode::s_dcache_wb, false); + bld.smem(aco_opcode::s_dcache_wb, memory_sync_info(storage_buffer, semantic_volatile)); bld.sopp(aco_opcode::s_endpgm); cleanup_cfg(program); @@ -11116,8 +11132,6 @@ void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader, mubuf->glc = true; mubuf->slc = true; mubuf->dlc = args->options->chip_class >= GFX10; - mubuf->barrier = barrier_none; - mubuf->can_reorder = true; ctx.outputs.mask[i] |= 1 << j; ctx.outputs.temps[i * 4u + j] = mubuf->definitions[0].getTemp(); diff --git a/src/amd/compiler/aco_ir.cpp b/src/amd/compiler/aco_ir.cpp index 92c50cdd94e..75ca80cf2ac 100644 --- a/src/amd/compiler/aco_ir.cpp +++ b/src/amd/compiler/aco_ir.cpp @@ -127,6 +127,28 @@ void init_program(Program *program, Stage stage, struct radv_shader_info *info, program->next_fp_mode.round32 = fp_round_ne; } +memory_sync_info get_sync_info(const Instruction* instr) +{ + switch (instr->format) { + case Format::SMEM: + return static_cast(instr)->sync; + case Format::MUBUF: + return static_cast(instr)->sync; + case Format::MIMG: + return static_cast(instr)->sync; + case Format::MTBUF: + return static_cast(instr)->sync; + case Format::FLAT: + case Format::GLOBAL: + case Format::SCRATCH: + return static_cast(instr)->sync; + case Format::DS: + return static_cast(instr)->sync; + default: + return memory_sync_info(); + } +} + bool can_use_SDWA(chip_class chip, const aco_ptr& instr) { if (!instr->isVALU()) diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index 6f2a4a46b91..9dda1ffb723 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -103,22 +103,79 @@ enum class Format : std::uint16_t { SDWA = 1 << 14, }; -enum barrier_interaction : uint8_t { - barrier_none = 0, - barrier_buffer = 0x1, - barrier_image = 0x2, - barrier_atomic = 0x4, - barrier_shared = 0x8, - /* used for geometry shaders to ensure vertex data writes are before the - * GS_DONE s_sendmsg. */ - barrier_gs_data = 0x10, - /* used for geometry shaders to ensure s_sendmsg instructions are in-order. */ - barrier_gs_sendmsg = 0x20, - /* used by barriers. 
created by s_barrier */ - barrier_barrier = 0x40, - barrier_count = 7, +enum storage_class : uint8_t { + storage_none = 0x0, /* no synchronization and can be reordered around aliasing stores */ + storage_buffer = 0x1, /* SSBOs and global memory */ + storage_atomic_counter = 0x2, /* not used for Vulkan */ + storage_image = 0x4, + storage_shared = 0x8, /* or TCS output */ + storage_vmem_output = 0x10, /* GS or TCS output stores using VMEM */ + storage_scratch = 0x20, + storage_vgpr_spill = 0x40, + storage_count = 8, }; +enum memory_semantics : uint8_t { + semantic_none = 0x0, + /* for loads: don't move any access after this load to before this load (even other loads) + * for barriers: don't move any access after the barrier to before any + * atomics/control_barriers/sendmsg_gs_done before the barrier */ + semantic_acquire = 0x1, + /* for stores: don't move any access before this store to after this store + * for barriers: don't move any access before the barrier to after any + * atomics/control_barriers/sendmsg_gs_done after the barrier */ + semantic_release = 0x2, + + /* the rest are for load/stores/atomics only */ + /* cannot be DCE'd or CSE'd */ + semantic_volatile = 0x4, + /* does not interact with barriers and assumes this lane is the only lane + * accessing this memory */ + semantic_private = 0x8, + /* this operation can be reordered around operations of the same storage. says nothing about barriers */ + semantic_can_reorder = 0x10, + /* this is a atomic instruction (may only read or write memory) */ + semantic_atomic = 0x20, + /* this is instruction both reads and writes memory */ + semantic_rmw = 0x40, + + semantic_acqrel = semantic_acquire | semantic_release, + semantic_atomicrmw = semantic_volatile | semantic_atomic | semantic_rmw, +}; + +enum sync_scope : uint8_t { + scope_invocation = 0, + scope_subgroup = 1, + scope_workgroup = 2, + scope_queuefamily = 3, + scope_device = 4, +}; + +struct memory_sync_info { + memory_sync_info() : storage(storage_none), semantics(semantic_none), scope(scope_invocation) {} + memory_sync_info(int storage, int semantics=0, sync_scope scope=scope_invocation) + : storage((storage_class)storage), semantics((memory_semantics)semantics), scope(scope) {} + + storage_class storage:8; + memory_semantics semantics:8; + sync_scope scope:8; + + bool operator == (const memory_sync_info& rhs) const { + return storage == rhs.storage && + semantics == rhs.semantics && + scope == rhs.scope; + } + + bool can_reorder() const { + if (semantics & semantic_acqrel) + return false; + /* Also check storage so that zero-initialized memory_sync_info can be + * reordered. 
*/ + return (!storage || (semantics & semantic_can_reorder)) && !(semantics & semantic_volatile); + } +}; +static_assert(sizeof(memory_sync_info) == 3); + enum fp_round { fp_round_ne = 0, fp_round_pi = 1, @@ -931,14 +988,13 @@ static_assert(sizeof(SOP2_instruction) == sizeof(Instruction) + 0, "Unexpected p * */ struct SMEM_instruction : public Instruction { - barrier_interaction barrier; + memory_sync_info sync; bool glc : 1; /* VI+: globally coherent */ bool dlc : 1; /* NAVI: device level coherent */ bool nv : 1; /* VEGA only: Non-volatile */ - bool can_reorder : 1; bool disable_wqm : 1; bool prevent_overflow : 1; /* avoid overflow when combining additions */ - uint32_t padding: 18; + uint32_t padding: 3; }; static_assert(sizeof(SMEM_instruction) == sizeof(Instruction) + 4, "Unexpected padding"); @@ -1066,11 +1122,13 @@ static_assert(sizeof(Interp_instruction) == sizeof(Instruction) + 4, "Unexpected * */ struct DS_instruction : public Instruction { + memory_sync_info sync; + bool gds; int16_t offset0; int8_t offset1; - bool gds; + uint8_t padding; }; -static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 4, "Unexpected padding"); +static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 8, "Unexpected padding"); /** * Vector Memory Untyped-buffer Instructions @@ -1081,7 +1139,7 @@ static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 4, "Unexpected pad * */ struct MUBUF_instruction : public Instruction { - uint16_t offset : 12; /* Unsigned byte offset - 12 bit */ + memory_sync_info sync; bool offen : 1; /* Supply an offset from VGPR (VADDR) */ bool idxen : 1; /* Supply an index from VGPR (VADDR) */ bool addr64 : 1; /* SI, CIK: Address size is 64-bit */ @@ -1091,12 +1149,11 @@ struct MUBUF_instruction : public Instruction { bool tfe : 1; /* texture fail enable */ bool lds : 1; /* Return read-data to LDS instead of VGPRs */ bool disable_wqm : 1; /* Require an exec mask without helper invocations */ - bool can_reorder : 1; - bool swizzled:1; - uint8_t padding : 1; - barrier_interaction barrier; + uint16_t offset : 12; /* Unsigned byte offset - 12 bit */ + bool swizzled : 1; + uint32_t padding1 : 18; }; -static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 4, "Unexpected padding"); +static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding"); /** * Vector Memory Typed-buffer Instructions @@ -1107,8 +1164,7 @@ static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 4, "Unexpected * */ struct MTBUF_instruction : public Instruction { - uint16_t offset; /* Unsigned byte offset - 12 bit */ - barrier_interaction barrier; + memory_sync_info sync; uint8_t dfmt : 4; /* Data Format of data in memory buffer */ uint8_t nfmt : 3; /* Numeric format of data in memory */ bool offen : 1; /* Supply an offset from VGPR (VADDR) */ @@ -1118,8 +1174,8 @@ struct MTBUF_instruction : public Instruction { bool slc : 1; /* system level coherent */ bool tfe : 1; /* texture fail enable */ bool disable_wqm : 1; /* Require an exec mask without helper invocations */ - bool can_reorder : 1; - uint32_t padding : 25; + uint32_t padding : 10; + uint16_t offset; /* Unsigned byte offset - 12 bit */ }; static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding"); @@ -1133,6 +1189,7 @@ static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8, "Unexpected * */ struct MIMG_instruction : public Instruction { + memory_sync_info sync; uint8_t dmask; /* Data VGPR enable mask */ uint8_t dim : 3; /* NAVI: dimensionality */ bool 
unrm : 1; /* Force address to be un-normalized */ @@ -1146,11 +1203,9 @@ struct MIMG_instruction : public Instruction { bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */ bool d16 : 1; /* Convert 32-bit data to 16-bit data */ bool disable_wqm : 1; /* Require an exec mask without helper invocations */ - bool can_reorder : 1; - uint8_t padding : 1; - barrier_interaction barrier; + uint32_t padding : 18; }; -static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 4, "Unexpected padding"); +static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 8, "Unexpected padding"); /** * Flat/Scratch/Global Instructions @@ -1160,18 +1215,18 @@ static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 4, "Unexpected p * */ struct FLAT_instruction : public Instruction { - uint16_t offset; /* Vega/Navi only */ + memory_sync_info sync; bool slc : 1; /* system level coherent */ bool glc : 1; /* globally coherent */ bool dlc : 1; /* NAVI: device level coherent */ bool lds : 1; bool nv : 1; bool disable_wqm : 1; /* Require an exec mask without helper invocations */ - bool can_reorder : 1; - uint8_t padding : 1; - barrier_interaction barrier; + uint32_t padding0 : 2; + uint16_t offset; /* Vega/Navi only */ + uint16_t padding1; }; -static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 4, "Unexpected padding"); +static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 8, "Unexpected padding"); struct Export_instruction : public Instruction { uint8_t enabled_mask; @@ -1200,8 +1255,10 @@ struct Pseudo_branch_instruction : public Instruction { static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 8, "Unexpected padding"); struct Pseudo_barrier_instruction : public Instruction { + memory_sync_info sync; + sync_scope exec_scope; }; -static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 0, "Unexpected padding"); +static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 4, "Unexpected padding"); enum ReduceOp : uint16_t { iadd8, iadd16, iadd32, iadd64, @@ -1298,7 +1355,8 @@ static inline bool is_phi(aco_ptr& instr) return is_phi(instr.get()); } -barrier_interaction get_barrier_interaction(const Instruction* instr); +memory_sync_info get_sync_info(const Instruction* instr); + bool is_dead(const std::vector& uses, Instruction *instr); bool can_use_opsel(chip_class chip, aco_opcode op, int idx, bool high); diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp index 9af52aad905..c7e4acd542e 100644 --- a/src/amd/compiler/aco_lower_to_hw_instr.cpp +++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -1854,6 +1854,7 @@ void lower_to_hw_instr(Program* program) emit_gfx10_wave64_bpermute(program, instr, bld); else unreachable("Current hardware supports ds_bpermute, don't emit p_bpermute."); + break; } default: break; @@ -1908,6 +1909,18 @@ void lower_to_hw_instr(Program* program) reduce->operands[2].physReg(), // vtmp reduce->definitions[2].physReg(), // sitmp reduce->operands[0], reduce->definitions[0]); + } else if (instr->format == Format::PSEUDO_BARRIER) { + Pseudo_barrier_instruction* barrier = static_cast(instr.get()); + + /* Anything larger than a workgroup isn't possible. Anything + * smaller requires no instructions and this pseudo instruction + * would only be included to control optimizations. 
*/ + bool emit_s_barrier = barrier->exec_scope == scope_workgroup && + program->workgroup_size > program->wave_size; + + bld.insert(std::move(instr)); + if (emit_s_barrier) + bld.sopp(aco_opcode::s_barrier); } else if (instr->opcode == aco_opcode::p_cvt_f16_f32_rtne) { float_mode new_mode = block->fp_mode; new_mode.round16_64 = fp_round_ne; diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py index 46246cbea01..019ad9c0e0d 100644 --- a/src/amd/compiler/aco_opcodes.py +++ b/src/amd/compiler/aco_opcodes.py @@ -66,7 +66,7 @@ class Format(Enum): return [('uint32_t', 'block', '-1'), ('uint32_t', 'imm', '0')] elif self == Format.SMEM: - return [('bool', 'can_reorder', 'true'), + return [('memory_sync_info', 'sync', 'memory_sync_info()'), ('bool', 'glc', 'false'), ('bool', 'dlc', 'false'), ('bool', 'nv', 'false')] @@ -123,6 +123,9 @@ class Format(Enum): elif self == Format.PSEUDO_REDUCTION: return [('ReduceOp', 'op', None, 'reduce_op'), ('unsigned', 'cluster_size', '0')] + elif self == Format.PSEUDO_BARRIER: + return [('memory_sync_info', 'sync', None), + ('sync_scope', 'exec_scope', 'scope_invocation')] elif self == Format.VINTRP: return [('unsigned', 'attribute', None), ('unsigned', 'component', None)] @@ -133,7 +136,7 @@ class Format(Enum): ('bool', 'bound_ctrl', 'true')] elif self in [Format.FLAT, Format.GLOBAL, Format.SCRATCH]: return [('uint16_t', 'offset', 0), - ('bool', 'can_reorder', 'true'), + ('memory_sync_info', 'sync', 'memory_sync_info()'), ('bool', 'glc', 'false'), ('bool', 'slc', 'false'), ('bool', 'lds', 'false'), @@ -265,13 +268,7 @@ opcode("p_cbranch", format=Format.PSEUDO_BRANCH) opcode("p_cbranch_z", format=Format.PSEUDO_BRANCH) opcode("p_cbranch_nz", format=Format.PSEUDO_BRANCH) -opcode("p_memory_barrier_common", format=Format.PSEUDO_BARRIER) # atomic, buffer, image and shared -opcode("p_memory_barrier_atomic", format=Format.PSEUDO_BARRIER) -opcode("p_memory_barrier_buffer", format=Format.PSEUDO_BARRIER) -opcode("p_memory_barrier_image", format=Format.PSEUDO_BARRIER) -opcode("p_memory_barrier_shared", format=Format.PSEUDO_BARRIER) -opcode("p_memory_barrier_gs_data", format=Format.PSEUDO_BARRIER) -opcode("p_memory_barrier_gs_sendmsg", format=Format.PSEUDO_BARRIER) +opcode("p_barrier", format=Format.PSEUDO_BARRIER) opcode("p_spill") opcode("p_reload") diff --git a/src/amd/compiler/aco_opt_value_numbering.cpp b/src/amd/compiler/aco_opt_value_numbering.cpp index 2fdbfaabd4a..de3d894bf54 100644 --- a/src/amd/compiler/aco_opt_value_numbering.cpp +++ b/src/amd/compiler/aco_opt_value_numbering.cpp @@ -224,12 +224,15 @@ struct InstrPred { return aK->imm == bK->imm; } case Format::SMEM: { + if (!a->operands.empty() && a->operands[0].bytes() == 16) + return false; SMEM_instruction* aS = static_cast(a); SMEM_instruction* bS = static_cast(b); /* isel shouldn't be creating situations where this assertion fails */ assert(aS->prevent_overflow == bS->prevent_overflow); - return aS->can_reorder && bS->can_reorder && - aS->glc == bS->glc && aS->nv == bS->nv && + return aS->sync.can_reorder() && bS->sync.can_reorder() && + aS->sync == bS->sync && aS->glc == bS->glc && aS->dlc == bS->dlc && + aS->nv == bS->nv && aS->disable_wqm == bS->disable_wqm && aS->prevent_overflow == bS->prevent_overflow; } case Format::VINTRP: { @@ -251,8 +254,8 @@ struct InstrPred { case Format::MTBUF: { MTBUF_instruction* aM = static_cast(a); MTBUF_instruction* bM = static_cast(b); - return aM->can_reorder && bM->can_reorder && - aM->barrier == bM->barrier && + return aM->sync.can_reorder() && 
bM->sync.can_reorder() && + aM->sync == bM->sync && aM->dfmt == bM->dfmt && aM->nfmt == bM->nfmt && aM->offset == bM->offset && @@ -267,8 +270,8 @@ struct InstrPred { case Format::MUBUF: { MUBUF_instruction* aM = static_cast(a); MUBUF_instruction* bM = static_cast(b); - return aM->can_reorder && bM->can_reorder && - aM->barrier == bM->barrier && + return aM->sync.can_reorder() && bM->sync.can_reorder() && + aM->sync == bM->sync && aM->offset == bM->offset && aM->offen == bM->offen && aM->idxen == bM->idxen && @@ -295,7 +298,9 @@ struct InstrPred { return false; DS_instruction* aD = static_cast(a); DS_instruction* bD = static_cast(b); - return aD->pass_flags == bD->pass_flags && + return aD->sync.can_reorder() && bD->sync.can_reorder() && + aD->sync == bD->sync && + aD->pass_flags == bD->pass_flags && aD->gds == bD->gds && aD->offset0 == bD->offset0 && aD->offset1 == bD->offset1; @@ -303,8 +308,8 @@ struct InstrPred { case Format::MIMG: { MIMG_instruction* aM = static_cast(a); MIMG_instruction* bM = static_cast(b); - return aM->can_reorder && bM->can_reorder && - aM->barrier == bM->barrier && + return aM->sync.can_reorder() && bM->sync.can_reorder() && + aM->sync == bM->sync && aM->dmask == bM->dmask && aM->unrm == bM->unrm && aM->glc == bM->glc && diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index e00c8c1fcaf..c1f0bc20f9e 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -1026,8 +1026,7 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) new_instr->operands.back() = Operand(base); if (!smem->definitions.empty()) new_instr->definitions[0] = smem->definitions[0]; - new_instr->can_reorder = smem->can_reorder; - new_instr->barrier = smem->barrier; + new_instr->sync = smem->sync; new_instr->glc = smem->glc; new_instr->dlc = smem->dlc; new_instr->nv = smem->nv; diff --git a/src/amd/compiler/aco_print_ir.cpp b/src/amd/compiler/aco_print_ir.cpp index 9172fc1ac60..8b8b5d0f306 100644 --- a/src/amd/compiler/aco_print_ir.cpp +++ b/src/amd/compiler/aco_print_ir.cpp @@ -189,23 +189,73 @@ static void print_definition(const Definition *definition, FILE *output) print_physReg(definition->physReg(), definition->bytes(), output); } -static void print_barrier_reorder(bool can_reorder, barrier_interaction barrier, FILE *output) +static void print_storage(storage_class storage, FILE *output) { - if (can_reorder) - fprintf(output, " reorder"); + fprintf(output, " storage:"); + int printed = 0; + if (storage & storage_buffer) + printed += fprintf(output, "%sbuffer", printed ? "," : ""); + if (storage & storage_atomic_counter) + printed += fprintf(output, "%satomic_counter", printed ? "," : ""); + if (storage & storage_image) + printed += fprintf(output, "%simage", printed ? "," : ""); + if (storage & storage_shared) + printed += fprintf(output, "%sshared", printed ? "," : ""); + if (storage & storage_vmem_output) + printed += fprintf(output, "%svmem_output", printed ? "," : ""); + if (storage & storage_scratch) + printed += fprintf(output, "%sscratch", printed ? "," : ""); + if (storage & storage_vgpr_spill) + printed += fprintf(output, "%svgpr_spill", printed ? "," : ""); +} + +static void print_semantics(memory_semantics sem, FILE *output) +{ + fprintf(output, " semantics:"); + int printed = 0; + if (sem & semantic_acquire) + printed += fprintf(output, "%sacquire", printed ? "," : ""); + if (sem & semantic_release) + printed += fprintf(output, "%srelease", printed ? 
"," : ""); + if (sem & semantic_volatile) + printed += fprintf(output, "%svolatile", printed ? "," : ""); + if (sem & semantic_private) + printed += fprintf(output, "%sprivate", printed ? "," : ""); + if (sem & semantic_can_reorder) + printed += fprintf(output, "%sreorder", printed ? "," : ""); + if (sem & semantic_atomic) + printed += fprintf(output, "%satomic", printed ? "," : ""); + if (sem & semantic_rmw) + printed += fprintf(output, "%srmw", printed ? "," : ""); +} + +static void print_scope(sync_scope scope, FILE *output, const char *prefix="scope") +{ + fprintf(output, " %s:", prefix); + switch (scope) { + case scope_invocation: + fprintf(output, "invocation"); + break; + case scope_subgroup: + fprintf(output, "subgroup"); + break; + case scope_workgroup: + fprintf(output, "workgroup"); + break; + case scope_queuefamily: + fprintf(output, "queuefamily"); + break; + case scope_device: + fprintf(output, "device"); + break; + } +} - if (barrier & barrier_buffer) - fprintf(output, " buffer"); - if (barrier & barrier_image) - fprintf(output, " image"); - if (barrier & barrier_atomic) - fprintf(output, " atomic"); - if (barrier & barrier_shared) - fprintf(output, " shared"); - if (barrier & barrier_gs_data) - fprintf(output, " gs_data"); - if (barrier & barrier_gs_sendmsg) - fprintf(output, " gs_sendmsg"); +static void print_sync(memory_sync_info sync, FILE *output) +{ + print_storage(sync.storage, output); + print_semantics(sync.semantics, output); + print_scope(sync.scope, output); } static void print_instr_format_specific(const Instruction *instr, FILE *output) @@ -292,7 +342,7 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output) fprintf(output, " dlc"); if (smem->nv) fprintf(output, " nv"); - print_barrier_reorder(smem->can_reorder, smem->barrier, output); + print_sync(smem->sync, output); break; } case Format::VINTRP: { @@ -308,6 +358,7 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output) fprintf(output, " offset1:%u", ds->offset1); if (ds->gds) fprintf(output, " gds"); + print_sync(ds->sync, output); break; } case Format::MUBUF: { @@ -332,7 +383,7 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output) fprintf(output, " lds"); if (mubuf->disable_wqm) fprintf(output, " disable_wqm"); - print_barrier_reorder(mubuf->can_reorder, mubuf->barrier, output); + print_sync(mubuf->sync, output); break; } case Format::MIMG: { @@ -392,7 +443,7 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output) fprintf(output, " d16"); if (mimg->disable_wqm) fprintf(output, " disable_wqm"); - print_barrier_reorder(mimg->can_reorder, mimg->barrier, output); + print_sync(mimg->sync, output); break; } case Format::EXP: { @@ -439,6 +490,12 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output) fprintf(output, " cluster_size:%u", reduce->cluster_size); break; } + case Format::PSEUDO_BARRIER: { + const Pseudo_barrier_instruction* barrier = static_cast(instr); + print_sync(barrier->sync, output); + print_scope(barrier->exec_scope, output, "exec_scope"); + break; + } case Format::FLAT: case Format::GLOBAL: case Format::SCRATCH: { @@ -457,7 +514,7 @@ static void print_instr_format_specific(const Instruction *instr, FILE *output) fprintf(output, " nv"); if (flat->disable_wqm) fprintf(output, " disable_wqm"); - print_barrier_reorder(flat->can_reorder, flat->barrier, output); + print_sync(flat->sync, output); break; } case Format::MTBUF: { @@ -507,7 +564,7 @@ static void 
print_instr_format_specific(const Instruction *instr, FILE *output) fprintf(output, " tfe"); if (mtbuf->disable_wqm) fprintf(output, " disable_wqm"); - print_barrier_reorder(mtbuf->can_reorder, mtbuf->barrier, output); + print_sync(mtbuf->sync, output); break; } case Format::VOP3P: { diff --git a/src/amd/compiler/aco_scheduler.cpp b/src/amd/compiler/aco_scheduler.cpp index d837059cfd4..261d58ceb03 100644 --- a/src/amd/compiler/aco_scheduler.cpp +++ b/src/amd/compiler/aco_scheduler.cpp @@ -318,26 +318,6 @@ void MoveState::upwards_skip() source_idx++; } -bool can_reorder(Instruction* candidate) -{ - switch (candidate->format) { - case Format::SMEM: - return static_cast(candidate)->can_reorder; - case Format::MUBUF: - return static_cast(candidate)->can_reorder; - case Format::MIMG: - return static_cast(candidate)->can_reorder; - case Format::MTBUF: - return static_cast(candidate)->can_reorder; - case Format::FLAT: - case Format::GLOBAL: - case Format::SCRATCH: - return static_cast(candidate)->can_reorder; - default: - return true; - } -} - bool is_gs_or_done_sendmsg(const Instruction *instr) { if (instr->opcode == aco_opcode::s_sendmsg) { @@ -357,96 +337,96 @@ bool is_done_sendmsg(const Instruction *instr) return false; } -barrier_interaction get_barrier_interaction(const Instruction* instr) +memory_sync_info get_sync_info_with_hack(const Instruction* instr) { - switch (instr->format) { - case Format::SMEM: - return static_cast(instr)->barrier; - case Format::MUBUF: - return static_cast(instr)->barrier; - case Format::MIMG: - return static_cast(instr)->barrier; - case Format::MTBUF: - return static_cast(instr)->barrier; - case Format::FLAT: - case Format::GLOBAL: - case Format::SCRATCH: - return static_cast(instr)->barrier; - case Format::DS: - return barrier_shared; - case Format::SOPP: - if (is_done_sendmsg(instr)) - return (barrier_interaction)(barrier_gs_data | barrier_gs_sendmsg); - else if (is_gs_or_done_sendmsg(instr)) - return barrier_gs_sendmsg; - else - return barrier_none; - case Format::PSEUDO_BARRIER: - return barrier_barrier; - default: - return barrier_none; + memory_sync_info sync = get_sync_info(instr); + if (instr->format == Format::SMEM && !instr->operands.empty() && instr->operands[0].bytes() == 16) { + // FIXME: currently, it doesn't seem beneficial to omit this due to how our scheduler works + sync.storage = (storage_class)(sync.storage | storage_buffer); + sync.semantics = (memory_semantics)(sync.semantics | semantic_private); } + return sync; } -barrier_interaction parse_barrier(Instruction *instr) -{ - if (instr->format == Format::PSEUDO_BARRIER) { - switch (instr->opcode) { - case aco_opcode::p_memory_barrier_atomic: - return barrier_atomic; - /* For now, buffer and image barriers are treated the same. this is because of - * dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.device.payload_nonlocal.buffer.guard_nonlocal.image.comp - * which seems to use an image load to determine if the result of a buffer load is valid. So the ordering of the two loads is important. - * I /think/ we should probably eventually expand the meaning of a buffer barrier so that all buffer operations before it, must stay before it - * and that both image and buffer operations after it, must stay after it. We should also do the same for image barriers. - * Or perhaps the problem is that we don't have a combined barrier instruction for both buffers and images, but the CTS test expects us to? - * Either way, this solution should work. 
*/ - case aco_opcode::p_memory_barrier_buffer: - case aco_opcode::p_memory_barrier_image: - return (barrier_interaction)(barrier_image | barrier_buffer); - case aco_opcode::p_memory_barrier_shared: - return barrier_shared; - case aco_opcode::p_memory_barrier_common: - return (barrier_interaction)(barrier_image | barrier_buffer | barrier_shared | barrier_atomic); - case aco_opcode::p_memory_barrier_gs_data: - return barrier_gs_data; - case aco_opcode::p_memory_barrier_gs_sendmsg: - return barrier_gs_sendmsg; - default: - break; - } - } else if (instr->opcode == aco_opcode::s_barrier) { - return (barrier_interaction)(barrier_barrier | barrier_image | barrier_buffer | barrier_shared | barrier_atomic); - } - return barrier_none; -} +struct memory_event_set { + bool has_control_barrier; + + unsigned bar_acquire; + unsigned bar_release; + unsigned bar_classes; + + unsigned access_acquire; + unsigned access_release; + unsigned access_relaxed; + unsigned access_atomic; +}; struct hazard_query { bool contains_spill; - int barriers; - int barrier_interaction; - bool can_reorder_vmem; - bool can_reorder_smem; + bool contains_sendmsg; + memory_event_set mem_events; + unsigned aliasing_storage; /* storage classes which are accessed (non-SMEM) */ + unsigned aliasing_storage_smem; /* storage classes which are accessed (SMEM) */ }; void init_hazard_query(hazard_query *query) { query->contains_spill = false; - query->barriers = 0; - query->barrier_interaction = 0; - query->can_reorder_vmem = true; - query->can_reorder_smem = true; + query->contains_sendmsg = false; + memset(&query->mem_events, 0, sizeof(query->mem_events)); + query->aliasing_storage = 0; + query->aliasing_storage_smem = 0; +} + +void add_memory_event(memory_event_set *set, Instruction *instr, memory_sync_info *sync) +{ + set->has_control_barrier |= is_done_sendmsg(instr); + if (instr->opcode == aco_opcode::p_barrier) { + Pseudo_barrier_instruction *bar = static_cast(instr); + if (bar->sync.semantics & semantic_acquire) + set->bar_acquire |= bar->sync.storage; + if (bar->sync.semantics & semantic_release) + set->bar_release |= bar->sync.storage; + set->bar_classes |= bar->sync.storage; + + set->has_control_barrier |= bar->exec_scope > scope_invocation; + } + + if (!sync->storage) + return; + + if (sync->semantics & semantic_acquire) + set->access_acquire |= sync->storage; + if (sync->semantics & semantic_release) + set->access_release |= sync->storage; + + if (!(sync->semantics & semantic_private)) { + if (sync->semantics & semantic_atomic) + set->access_atomic |= sync->storage; + else + set->access_relaxed |= sync->storage; + } } void add_to_hazard_query(hazard_query *query, Instruction *instr) { - query->barriers |= parse_barrier(instr); - query->barrier_interaction |= get_barrier_interaction(instr); if (instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload) query->contains_spill = true; + query->contains_sendmsg |= instr->opcode == aco_opcode::s_sendmsg; + + memory_sync_info sync = get_sync_info_with_hack(instr); - bool can_reorder_instr = can_reorder(instr); - query->can_reorder_smem &= instr->format != Format::SMEM || can_reorder_instr; - query->can_reorder_vmem &= !(instr->isVMEM() || instr->isFlatOrGlobal()) || can_reorder_instr; + add_memory_event(&query->mem_events, instr, &sync); + + if (!(sync.semantics & semantic_can_reorder)) { + unsigned storage = sync.storage; + /* images and buffer/global memory can alias */ //TODO: more precisely, buffer images and buffer/global memory can alias + if (storage & 
(storage_buffer | storage_image)) + storage |= storage_buffer | storage_image; + if (instr->format == Format::SMEM) + query->aliasing_storage_smem |= storage; + else + query->aliasing_storage |= storage; + } } enum HazardResult { @@ -463,10 +443,8 @@ enum HazardResult { hazard_fail_unreorderable, }; -HazardResult perform_hazard_query(hazard_query *query, Instruction *instr) +HazardResult perform_hazard_query(hazard_query *query, Instruction *instr, bool upwards) { - bool can_reorder_candidate = can_reorder(instr); - if (instr->opcode == aco_opcode::p_exit_early_if) return hazard_fail_exec; for (const Definition& def : instr->definitions) { @@ -484,27 +462,61 @@ HazardResult perform_hazard_query(hazard_query *query, Instruction *instr) instr->opcode == aco_opcode::s_setprio) return hazard_fail_unreorderable; - barrier_interaction bar = parse_barrier(instr); - if (query->barrier_interaction && (query->barrier_interaction & bar)) + memory_event_set instr_set; + memset(&instr_set, 0, sizeof(instr_set)); + memory_sync_info sync = get_sync_info_with_hack(instr); + add_memory_event(&instr_set, instr, &sync); + + memory_event_set *first = &instr_set; + memory_event_set *second = &query->mem_events; + if (upwards) + std::swap(first, second); + + /* everything after barrier(acquire) happens after the atomics/control_barriers before + * everything after load(acquire) happens after the load + */ + if ((first->has_control_barrier || first->access_atomic) && second->bar_acquire) + return hazard_fail_barrier; + if (((first->access_acquire || first->bar_acquire) && second->bar_classes) || + ((first->access_acquire | first->bar_acquire) & (second->access_relaxed | second->access_atomic))) + return hazard_fail_barrier; + + /* everything before barrier(release) happens before the atomics/control_barriers after * + * everything before store(release) happens before the store + */ + if (first->bar_release && (second->has_control_barrier || second->access_atomic)) return hazard_fail_barrier; - if (bar && query->barriers && (query->barriers & ~bar)) + if ((first->bar_classes && (second->bar_release || second->access_release)) || + ((first->access_relaxed | first->access_atomic) & (second->bar_release | second->access_release))) return hazard_fail_barrier; - if (query->barriers && (query->barriers & get_barrier_interaction(instr))) + + /* don't move memory barriers around other memory barriers */ + if (first->bar_classes && second->bar_classes) return hazard_fail_barrier; - if (!query->can_reorder_smem && instr->format == Format::SMEM && !can_reorder_candidate) - return hazard_fail_reorder_vmem_smem; - if (!query->can_reorder_vmem && (instr->isVMEM() || instr->isFlatOrGlobal()) && !can_reorder_candidate) + /* Don't move memory loads/stores to before control barriers. This is to make + * memory barriers followed by control barriers work. */ + if (first->has_control_barrier && (second->access_atomic | second->access_relaxed)) + return hazard_fail_barrier; + + /* don't move memory loads/stores past potentially aliasing loads/stores */ + unsigned aliasing_storage = instr->format == Format::SMEM ? 
+ query->aliasing_storage_smem : + query->aliasing_storage; + if ((sync.storage & aliasing_storage) && !(sync.semantics & semantic_can_reorder)) { + unsigned intersect = sync.storage & aliasing_storage; + if (intersect & storage_shared) + return hazard_fail_reorder_ds; return hazard_fail_reorder_vmem_smem; - if ((query->barrier_interaction & barrier_shared) && instr->format == Format::DS) - return hazard_fail_reorder_ds; - if (is_gs_or_done_sendmsg(instr) && (query->barrier_interaction & get_barrier_interaction(instr))) - return hazard_fail_reorder_sendmsg; + } if ((instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload) && query->contains_spill) return hazard_fail_spill; + if (instr->opcode == aco_opcode::s_sendmsg && query->contains_sendmsg) + return hazard_fail_reorder_sendmsg; + return hazard_success; } @@ -546,7 +558,7 @@ void schedule_SMEM(sched_ctx& ctx, Block* block, bool can_move_down = true; - HazardResult haz = perform_hazard_query(&hq, candidate.get()); + HazardResult haz = perform_hazard_query(&hq, candidate.get(), false); if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill || haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier || haz == hazard_fail_export) can_move_down = false; else if (haz != hazard_success) @@ -594,7 +606,7 @@ void schedule_SMEM(sched_ctx& ctx, Block* block, break; if (found_dependency) { - HazardResult haz = perform_hazard_query(&hq, candidate.get()); + HazardResult haz = perform_hazard_query(&hq, candidate.get(), true); if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill || haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier || haz == hazard_fail_export) @@ -686,7 +698,7 @@ void schedule_VMEM(sched_ctx& ctx, Block* block, /* if current depends on candidate, add additional dependencies and continue */ bool can_move_down = !is_vmem || part_of_clause; - HazardResult haz = perform_hazard_query(part_of_clause ? &clause_hq : &indep_hq, candidate.get()); + HazardResult haz = perform_hazard_query(part_of_clause ? 
&clause_hq : &indep_hq, candidate.get(), false); if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill || haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier || haz == hazard_fail_export) @@ -735,7 +747,7 @@ void schedule_VMEM(sched_ctx& ctx, Block* block, /* check if candidate depends on current */ bool is_dependency = false; if (found_dependency) { - HazardResult haz = perform_hazard_query(&indep_hq, candidate.get()); + HazardResult haz = perform_hazard_query(&indep_hq, candidate.get(), true); if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill || haz == hazard_fail_reorder_vmem_smem || haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier || haz == hazard_fail_export) @@ -802,7 +814,7 @@ void schedule_position_export(sched_ctx& ctx, Block* block, if (candidate->isVMEM() || candidate->format == Format::SMEM || candidate->isFlatOrGlobal()) break; - HazardResult haz = perform_hazard_query(&hq, candidate.get()); + HazardResult haz = perform_hazard_query(&hq, candidate.get(), false); if (haz == hazard_fail_exec || haz == hazard_fail_unreorderable) break; diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp index d472f63ec57..69d5cb23b23 100644 --- a/src/amd/compiler/aco_spill.cpp +++ b/src/amd/compiler/aco_spill.cpp @@ -1565,10 +1565,13 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) { for (unsigned i = 0; i < temp.size(); i++) split->definitions[i] = bld.def(v1); bld.insert(split); - for (unsigned i = 0; i < temp.size(); i++) - bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset, split->definitions[i].getTemp(), offset + i * 4, false, true); + for (unsigned i = 0; i < temp.size(); i++) { + Instruction *instr = bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset, split->definitions[i].getTemp(), offset + i * 4, false, true); + static_cast(instr)->sync = memory_sync_info(storage_vgpr_spill, semantic_private); + } } else { - bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset, temp, offset, false, true); + Instruction *instr = bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset, temp, offset, false, true); + static_cast(instr)->sync = memory_sync_info(storage_vgpr_spill, semantic_private); } } else { ctx.program->config->spilled_sgprs += (*it)->operands[0].size(); @@ -1632,11 +1635,13 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) { for (unsigned i = 0; i < def.size(); i++) { Temp tmp = bld.tmp(v1); vec->operands[i] = Operand(tmp); - bld.mubuf(opcode, Definition(tmp), scratch_rsrc, Operand(v1), scratch_offset, offset + i * 4, false, true); + Instruction *instr = bld.mubuf(opcode, Definition(tmp), scratch_rsrc, Operand(v1), scratch_offset, offset + i * 4, false, true); + static_cast(instr)->sync = memory_sync_info(storage_vgpr_spill, semantic_private); } bld.insert(vec); } else { - bld.mubuf(opcode, def, scratch_rsrc, Operand(v1), scratch_offset, offset, false, true); + Instruction *instr = bld.mubuf(opcode, def, scratch_rsrc, Operand(v1), scratch_offset, offset, false, true); + static_cast(instr)->sync = memory_sync_info(storage_vgpr_spill, semantic_private); } } else { uint32_t spill_slot = slots[spill_id]; -- 2.30.2
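For reference, the new synchronization model bundles storage classes, memory semantics and a synchronization scope into one memory_sync_info value. Below is a minimal standalone sketch of that model and of the kind of output print_sync() emits, assuming simplified stand-in bit values; the enum member names follow the patch, but can_reorder(), print_sync_sketch() and main() are illustrative, not the actual aco_ir.h/aco_print_ir.cpp code.

// Standalone sketch of the memory_sync_info model introduced above.
#include <cstdint>
#include <cstdio>

enum storage_class : uint16_t {
   storage_none           = 0,
   storage_buffer         = 1 << 0,
   storage_atomic_counter = 1 << 1,
   storage_image          = 1 << 2,
   storage_shared         = 1 << 3, /* LDS */
   storage_vmem_output    = 1 << 4,
   storage_scratch        = 1 << 5,
   storage_vgpr_spill     = 1 << 6,
};

enum memory_semantics : uint16_t {
   semantic_none        = 0,
   semantic_acquire     = 1 << 0,
   semantic_release     = 1 << 1,
   semantic_volatile    = 1 << 2,
   semantic_private     = 1 << 3, /* not observable by other invocations */
   semantic_can_reorder = 1 << 4,
   semantic_atomic      = 1 << 5,
   semantic_rmw         = 1 << 6,
};

enum sync_scope : uint16_t {
   scope_invocation  = 0,
   scope_subgroup    = 1,
   scope_workgroup   = 2,
   scope_queuefamily = 3,
   scope_device      = 4,
};

struct memory_sync_info {
   uint16_t storage   = storage_none;
   uint16_t semantics = semantic_none;
   sync_scope scope   = scope_invocation;

   /* replaces the old per-format can_reorder flag (illustrative body) */
   bool can_reorder() const {
      return (semantics & semantic_can_reorder) && !(semantics & semantic_volatile);
   }
};

/* prints a summary in the same spirit as print_sync() in aco_print_ir.cpp */
static void print_sync_sketch(const memory_sync_info &sync, FILE *out)
{
   static const char *storage_names[] = {"buffer", "atomic_counter", "image",
                                         "shared", "vmem_output", "scratch",
                                         "vgpr_spill"};
   static const char *semantic_names[] = {"acquire", "release", "volatile",
                                          "private", "reorder", "atomic", "rmw"};
   static const char *scope_names[] = {"invocation", "subgroup", "workgroup",
                                       "queuefamily", "device"};
   int printed = 0;
   fprintf(out, " storage:");
   for (unsigned i = 0; i < 7; i++) {
      if (sync.storage & (1u << i))
         printed += fprintf(out, "%s%s", printed ? "," : "", storage_names[i]);
   }
   printed = 0;
   fprintf(out, " semantics:");
   for (unsigned i = 0; i < 7; i++) {
      if (sync.semantics & (1u << i))
         printed += fprintf(out, "%s%s", printed ? "," : "", semantic_names[i]);
   }
   fprintf(out, " scope:%s", scope_names[sync.scope]);
}

int main()
{
   /* e.g. a device-coherent image atomic */
   memory_sync_info sync;
   sync.storage = storage_image;
   sync.semantics = semantic_atomic | semantic_acquire | semantic_release;
   sync.scope = scope_device;
   print_sync_sketch(sync, stdout);
   fprintf(stdout, "\n");
   return 0;
}

Keeping storage, semantics and scope in one value lets every pass (dead code analysis, waitcnt insertion, value numbering, scheduling, printing) query the same information instead of a scattered can_reorder/barrier pair.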
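The scheduler now summarizes the memory behaviour of the instructions it has scanned in a memory_event_set and rejects moves that would break acquire/release ordering, rather than comparing barrier_interaction masks. The sketch below shows that ordering check in isolation, assuming plain unsigned masks of storage-class bits; the memory_event_set field names follow the patch, while ordering_hazard() and main() are illustrative and omit the aliasing, spill and sendmsg checks of perform_hazard_query().

#include <cstdio>

/* Bitmask summary of the memory events seen on one side of a candidate move.
 * The masks hold storage-class bits (buffer, image, shared, ...). */
struct memory_event_set {
   bool has_control_barrier = false;
   unsigned bar_acquire = 0, bar_release = 0, bar_classes = 0;
   unsigned access_acquire = 0, access_release = 0;
   unsigned access_relaxed = 0, access_atomic = 0;
};

/* Returns true if the events in 'first' must stay ordered before the events
 * in 'second', i.e. the candidate move would break acquire/release ordering. */
static bool ordering_hazard(const memory_event_set &first, const memory_event_set &second)
{
   /* everything after barrier(acquire) happens after the atomics/control
    * barriers before it; everything after load(acquire) happens after the load */
   if ((first.has_control_barrier || first.access_atomic) && second.bar_acquire)
      return true;
   if (((first.access_acquire || first.bar_acquire) && second.bar_classes) ||
       ((first.access_acquire | first.bar_acquire) &
        (second.access_relaxed | second.access_atomic)))
      return true;

   /* everything before barrier(release) happens before the atomics/control
    * barriers after it; everything before store(release) happens before the store */
   if (first.bar_release && (second.has_control_barrier || second.access_atomic))
      return true;
   if ((first.bar_classes && (second.bar_release || second.access_release)) ||
       ((first.access_relaxed | first.access_atomic) &
        (second.bar_release | second.access_release)))
      return true;

   /* memory barriers never move across each other, and loads/stores never move
    * to before a control barrier */
   if (first.bar_classes && second.bar_classes)
      return true;
   if (first.has_control_barrier && (second.access_atomic | second.access_relaxed))
      return true;

   return false;
}

int main()
{
   /* candidate: a relaxed buffer access; already-scanned code: a release
    * barrier on buffer storage */
   memory_event_set candidate, scanned;
   candidate.access_relaxed = 1 /* buffer bit */;
   scanned.bar_release = scanned.bar_classes = 1;

   /* candidate as 'first', scanned set as 'second', as for a downwards query */
   printf("ordering hazard: %d\n", ordering_hazard(candidate, scanned) ? 1 : 0);
   return 0;
}

For an upwards move the two event sets are simply swapped before the check, which is what the new 'upwards' parameter of perform_hazard_query() selects.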
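add_to_hazard_query() also records which storage classes an instruction may alias, tracked separately for SMEM and non-SMEM accesses, and widens buffer and image accesses to cover each other because buffer images and buffer/global memory can alias. A minimal sketch of that widening, assuming stand-in bit values; widen_aliasing_storage() and main() are illustrative.

#include <cstdio>

/* two of the storage-class bits from the patch; the values are stand-ins */
enum : unsigned { storage_buffer = 1u << 0, storage_image = 1u << 2 };

/* Mirrors the merging done in add_to_hazard_query(): an access to buffer or
 * image storage is treated as potentially aliasing both classes. */
static unsigned widen_aliasing_storage(unsigned storage)
{
   if (storage & (storage_buffer | storage_image))
      storage |= storage_buffer | storage_image;
   return storage;
}

int main()
{
   unsigned image_access = storage_image;   /* candidate instruction */
   unsigned seen_storage = storage_buffer;  /* storage already recorded by the query */
   bool may_alias = (widen_aliasing_storage(image_access) & seen_storage) != 0;
   printf("image access may alias buffer access: %d\n", may_alias ? 1 : 0);
   return 0;
}

Splitting the recorded storage into aliasing_storage and aliasing_storage_smem keeps scalar and vector memory clauses from blocking each other unnecessarily while still catching real aliasing.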