if (std::any_of(instr->definitions.begin(), instr->definitions.end(),
[&uses] (const Definition& def) { return uses[def.tempId()];}))
return false;
- return !instr_info.is_atomic[(int)instr->opcode];
+ return !(get_sync_info(instr).semantics & (semantic_volatile | semantic_acqrel));
}
std::vector<uint16_t> dead_code_analysis(Program *program) {
bool pending_flat_vm = false;
bool pending_s_buffer_store = false; /* GFX10 workaround */
- wait_imm barrier_imm[barrier_count];
- uint16_t barrier_events[barrier_count] = {}; /* use wait_event notion */
+ wait_imm barrier_imm[storage_count];
+ uint16_t barrier_events[storage_count] = {}; /* use wait_event notion */
std::map<PhysReg,wait_entry> gpr_map;
}
}
- for (unsigned i = 0; i < barrier_count; i++) {
+ for (unsigned i = 0; i < storage_count; i++) {
changed |= barrier_imm[i].combine(other->barrier_imm[i]);
changed |= other->barrier_events[i] & ~barrier_events[i];
barrier_events[i] |= other->barrier_events[i];
return wait_imm();
}
-wait_imm kill(Instruction* instr, wait_ctx& ctx)
+wait_imm perform_barrier(wait_ctx& ctx, memory_sync_info sync, unsigned semantics)
+{
+ wait_imm imm;
+ sync_scope subgroup_scope = ctx.program->workgroup_size <= ctx.program->wave_size ? scope_workgroup : scope_subgroup;
+ if (sync.semantics & semantics) {
+ unsigned storage = sync.storage;
+ while (storage) {
+ unsigned idx = u_bit_scan(&storage);
+
+ /* LDS is private to the workgroup */
+ sync_scope bar_scope_lds = MIN2(sync.scope, scope_workgroup);
+
+ uint16_t events = ctx.barrier_events[idx];
+ if (bar_scope_lds <= subgroup_scope)
+ events &= ~event_lds;
+
+ if (events)
+ imm.combine(ctx.barrier_imm[idx]);
+ }
+ }
+
+ return imm;
+}
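
As an aside, the storage walk above peels off one set storage-class bit per iteration with u_bit_scan(), so every storage class selects exactly one barrier_imm/barrier_events slot. A minimal standalone sketch of that walk, where bit_scan() is a hypothetical local stand-in for Mesa's u_bit_scan() (index of the lowest set bit, which is then cleared):

#include <cstdio>

/* stand-in for u_bit_scan(): return the index of the lowest set bit and clear it */
static unsigned bit_scan(unsigned *mask)
{
   unsigned idx = __builtin_ctz(*mask);
   *mask &= *mask - 1;
   return idx;
}

int main()
{
   unsigned storage = 0x1 | 0x8; /* e.g. storage_buffer | storage_shared */
   while (storage) {
      unsigned idx = bit_scan(&storage);
      printf("combine barrier_imm[%u]\n", idx); /* prints 0, then 3 */
   }
   return 0;
}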
+
+wait_imm kill(Instruction* instr, wait_ctx& ctx, memory_sync_info sync_info)
{
wait_imm imm;
if (ctx.exp_cnt || ctx.vm_cnt || ctx.lgkm_cnt)
SMEM_instruction *smem = static_cast<SMEM_instruction *>(instr);
if (ctx.pending_s_buffer_store &&
!smem->definitions.empty() &&
- !smem->can_reorder && smem->barrier == barrier_buffer) {
+ !smem->sync.can_reorder()) {
imm.lgkm = 0;
}
}
- if (instr->format == Format::PSEUDO_BARRIER) {
- switch (instr->opcode) {
- case aco_opcode::p_memory_barrier_common:
- imm.combine(ctx.barrier_imm[ffs(barrier_atomic) - 1]);
- imm.combine(ctx.barrier_imm[ffs(barrier_buffer) - 1]);
- imm.combine(ctx.barrier_imm[ffs(barrier_image) - 1]);
- if (ctx.program->workgroup_size > ctx.program->wave_size)
- imm.combine(ctx.barrier_imm[ffs(barrier_shared) - 1]);
- break;
- case aco_opcode::p_memory_barrier_atomic:
- imm.combine(ctx.barrier_imm[ffs(barrier_atomic) - 1]);
- break;
- /* see comment in aco_scheduler.cpp's can_move_instr() on why these barriers are merged */
- case aco_opcode::p_memory_barrier_buffer:
- case aco_opcode::p_memory_barrier_image:
- imm.combine(ctx.barrier_imm[ffs(barrier_buffer) - 1]);
- imm.combine(ctx.barrier_imm[ffs(barrier_image) - 1]);
- break;
- case aco_opcode::p_memory_barrier_shared:
- if (ctx.program->workgroup_size > ctx.program->wave_size)
- imm.combine(ctx.barrier_imm[ffs(barrier_shared) - 1]);
- break;
- case aco_opcode::p_memory_barrier_gs_data:
- imm.combine(ctx.barrier_imm[ffs(barrier_gs_data) - 1]);
- break;
- case aco_opcode::p_memory_barrier_gs_sendmsg:
- imm.combine(ctx.barrier_imm[ffs(barrier_gs_sendmsg) - 1]);
- break;
- default:
- assert(false);
- break;
- }
- }
+ if (instr->opcode == aco_opcode::p_barrier)
+ imm.combine(perform_barrier(ctx, static_cast<Pseudo_barrier_instruction *>(instr)->sync, semantic_acqrel));
+ else
+ imm.combine(perform_barrier(ctx, sync_info, semantic_release));
if (!imm.empty()) {
if (ctx.pending_flat_vm && imm.vm != wait_imm::unset_counter)
ctx.vs_cnt = std::min(ctx.vs_cnt, imm.vs);
/* update barrier wait imms */
- for (unsigned i = 0; i < barrier_count; i++) {
+ for (unsigned i = 0; i < storage_count; i++) {
wait_imm& bar = ctx.barrier_imm[i];
uint16_t& bar_ev = ctx.barrier_events[i];
if (bar.exp != wait_imm::unset_counter && imm.exp <= bar.exp) {
(*ctr)++;
}
-void update_barrier_imm(wait_ctx& ctx, uint8_t counters, wait_event event, barrier_interaction barrier)
+void update_barrier_imm(wait_ctx& ctx, uint8_t counters, wait_event event, memory_sync_info sync)
{
- for (unsigned i = 0; i < barrier_count; i++) {
+ for (unsigned i = 0; i < storage_count; i++) {
wait_imm& bar = ctx.barrier_imm[i];
uint16_t& bar_ev = ctx.barrier_events[i];
- if (barrier & (1 << i)) {
+ if (sync.storage & (1 << i) && !(sync.semantics & semantic_private)) {
bar_ev |= event;
if (counters & counter_lgkm)
bar.lgkm = 0;
}
}
-void update_counters(wait_ctx& ctx, wait_event event, barrier_interaction barrier=barrier_none)
+void update_counters(wait_ctx& ctx, wait_event event, memory_sync_info sync=memory_sync_info())
{
uint8_t counters = get_counters_for_event(event);
if (counters & counter_vs && ctx.vs_cnt <= ctx.max_vs_cnt)
ctx.vs_cnt++;
- update_barrier_imm(ctx, counters, event, barrier);
+ update_barrier_imm(ctx, counters, event, sync);
if (ctx.unordered_events & event)
return;
}
}
-void update_counters_for_flat_load(wait_ctx& ctx, barrier_interaction barrier=barrier_none)
+void update_counters_for_flat_load(wait_ctx& ctx, memory_sync_info sync=memory_sync_info())
{
assert(ctx.chip_class < GFX10);
if (ctx.vm_cnt <= ctx.max_vm_cnt)
ctx.vm_cnt++;
- update_barrier_imm(ctx, counter_vm | counter_lgkm, event_flat, barrier);
+ update_barrier_imm(ctx, counter_vm | counter_lgkm, event_flat, sync);
for (std::pair<PhysReg,wait_entry> e : ctx.gpr_map)
{
break;
}
case Format::FLAT: {
+ FLAT_instruction *flat = static_cast<FLAT_instruction*>(instr);
if (ctx.chip_class < GFX10 && !instr->definitions.empty())
- update_counters_for_flat_load(ctx, barrier_buffer);
+ update_counters_for_flat_load(ctx, flat->sync);
else
- update_counters(ctx, event_flat, barrier_buffer);
+ update_counters(ctx, event_flat, flat->sync);
if (!instr->definitions.empty())
insert_wait_entry(ctx, instr->definitions[0], event_flat);
}
case Format::SMEM: {
SMEM_instruction *smem = static_cast<SMEM_instruction*>(instr);
- update_counters(ctx, event_smem, static_cast<SMEM_instruction*>(instr)->barrier);
+ update_counters(ctx, event_smem, smem->sync);
if (!instr->definitions.empty())
insert_wait_entry(ctx, instr->definitions[0], event_smem);
else if (ctx.chip_class >= GFX10 &&
- !smem->can_reorder &&
- smem->barrier == barrier_buffer)
+ !smem->sync.can_reorder())
ctx.pending_s_buffer_store = true;
break;
}
case Format::DS: {
- bool gds = static_cast<DS_instruction*>(instr)->gds;
- update_counters(ctx, gds ? event_gds : event_lds, gds ? barrier_none : barrier_shared);
- if (gds)
+ DS_instruction *ds = static_cast<DS_instruction*>(instr);
+ update_counters(ctx, ds->gds ? event_gds : event_lds, ds->sync);
+ if (ds->gds)
update_counters(ctx, event_gds_gpr_lock);
if (!instr->definitions.empty())
- insert_wait_entry(ctx, instr->definitions[0], gds ? event_gds : event_lds);
+ insert_wait_entry(ctx, instr->definitions[0], ds->gds ? event_gds : event_lds);
- if (gds) {
+ if (ds->gds) {
for (const Operand& op : instr->operands)
insert_wait_entry(ctx, op, event_gds_gpr_lock);
insert_wait_entry(ctx, exec, s2, event_gds_gpr_lock, false);
case Format::MIMG:
case Format::GLOBAL: {
wait_event ev = !instr->definitions.empty() || ctx.chip_class < GFX10 ? event_vmem : event_vmem_store;
- update_counters(ctx, ev, get_barrier_interaction(instr));
+ update_counters(ctx, ev, get_sync_info(instr));
bool has_sampler = instr->format == Format::MIMG && !instr->operands[1].isUndefined() && instr->operands[1].regClass() == s4;
case Format::SOPP: {
if (instr->opcode == aco_opcode::s_sendmsg ||
instr->opcode == aco_opcode::s_sendmsghalt)
- update_counters(ctx, event_sendmsg, get_barrier_interaction(instr));
+ update_counters(ctx, event_sendmsg);
}
default:
break;
for (aco_ptr<Instruction>& instr : block.instructions) {
bool is_wait = !parse_wait_instr(ctx, instr.get()).empty();
- queued_imm.combine(kill(instr.get(), ctx));
+ memory_sync_info sync_info = get_sync_info(instr.get());
+ queued_imm.combine(kill(instr.get(), ctx, sync_info));
ctx.gen_instr = instr.get();
gen(instr.get(), ctx);
}
new_instructions.emplace_back(std::move(instr));
+ queued_imm.combine(perform_barrier(ctx, sync_info, semantic_acquire));
+
if (ctx.collect_statistics)
ctx.advance_unwaited_instrs();
}
bool glc = false;
unsigned swizzle_component_size = 0;
- barrier_interaction barrier = barrier_none;
- bool can_reorder = true;
+ memory_sync_info sync;
Temp soffset = Temp(0, s1);
};
RegClass rc = RegClass(RegType::vgpr, DIV_ROUND_UP(size, 4));
Temp val = rc == info->dst.regClass() && dst_hint.id() ? dst_hint : bld.tmp(rc);
+ Instruction *instr;
if (read2)
- bld.ds(op, Definition(val), offset, m, const_offset, const_offset + 1);
+ instr = bld.ds(op, Definition(val), offset, m, const_offset, const_offset + 1);
else
- bld.ds(op, Definition(val), offset, m, const_offset);
+ instr = bld.ds(op, Definition(val), offset, m, const_offset);
+ static_cast<DS_instruction *>(instr)->sync = info->sync;
if (size < 4)
val = bld.pseudo(aco_opcode::p_extract_vector, bld.def(RegClass::get(RegType::vgpr, size)), val, Operand(0u));
load->definitions[0] = Definition(val);
load->glc = info->glc;
load->dlc = info->glc && bld.program->chip_class >= GFX10;
- load->barrier = info->barrier;
- load->can_reorder = false; // FIXME: currently, it doesn't seem beneficial due to how our scheduler works
+ load->sync = info->sync;
bld.insert(std::move(load));
return val;
}
mubuf->offen = (offset.type() == RegType::vgpr);
mubuf->glc = info->glc;
mubuf->dlc = info->glc && bld.program->chip_class >= GFX10;
- mubuf->barrier = info->barrier;
- mubuf->can_reorder = info->can_reorder;
+ mubuf->sync = info->sync;
mubuf->offset = const_offset;
mubuf->swizzled = info->swizzle_component_size != 0;
RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
mubuf->offset = 0;
mubuf->addr64 = offset.type() == RegType::vgpr;
mubuf->disable_wqm = false;
- mubuf->barrier = info->barrier;
+ mubuf->sync = info->sync;
mubuf->definitions[0] = Definition(val);
bld.insert(std::move(mubuf));
} else {
flat->operands[1] = Operand(s1);
flat->glc = info->glc;
flat->dlc = info->glc && bld.program->chip_class >= GFX10;
- flat->barrier = info->barrier;
+ flat->sync = info->sync;
flat->offset = 0u;
flat->definitions[0] = Definition(val);
bld.insert(std::move(flat));
LoadEmitInfo info = {Operand(as_vgpr(ctx, address)), dst, num_components, elem_size_bytes};
info.align_mul = align;
info.align_offset = 0;
- info.barrier = barrier_shared;
- info.can_reorder = false;
+ info.sync = memory_sync_info(storage_shared);
info.const_offset = base_offset;
emit_lds_load(ctx, bld, &info);
}
assert(inline_offset <= max_offset); /* offsets[i] shouldn't be large enough for this to happen */
+ Instruction *instr;
if (write2) {
Temp second_data = write_datas[second];
inline_offset /= data.bytes();
- bld.ds(op, address_offset, data, second_data, m, inline_offset, inline_offset + write2_off);
+ instr = bld.ds(op, address_offset, data, second_data, m, inline_offset, inline_offset + write2_off);
} else {
- bld.ds(op, address_offset, data, m, inline_offset);
+ instr = bld.ds(op, address_offset, data, m, inline_offset);
}
+ static_cast<DS_instruction *>(instr)->sync =
+ memory_sync_info(storage_shared);
}
}
/* idxen*/ false, /* addr64 */ false, /* disable_wqm */ false, /* glc */ true,
/* dlc*/ false, /* slc */ slc);
- static_cast<MUBUF_instruction *>(r.instr)->can_reorder = allow_reorder;
+ if (!allow_reorder)
+ static_cast<MUBUF_instruction *>(r.instr)->sync = memory_sync_info(storage_buffer, semantic_private);
}
void store_vmem_mubuf(isel_context *ctx, Temp src, Temp descriptor, Temp voffset, Temp soffset,
}
if (use_mubuf) {
- Instruction *mubuf = bld.mubuf(opcode,
- Definition(fetch_dst), list, fetch_index, soffset,
- fetch_offset, false, false, true).instr;
- static_cast<MUBUF_instruction*>(mubuf)->can_reorder = true;
+ bld.mubuf(opcode,
+           Definition(fetch_dst), list, fetch_index, soffset,
+           fetch_offset, false, false, true);
} else {
- Instruction *mtbuf = bld.mtbuf(opcode,
- Definition(fetch_dst), list, fetch_index, soffset,
- fetch_dfmt, nfmt, fetch_offset, false, true).instr;
- static_cast<MTBUF_instruction*>(mtbuf)->can_reorder = true;
+ bld.mtbuf(opcode,
+           Definition(fetch_dst), list, fetch_index, soffset,
+           fetch_dfmt, nfmt, fetch_offset, false, true);
}
emit_split_vector(ctx, fetch_dst, fetch_dst.size());
void load_buffer(isel_context *ctx, unsigned num_components, unsigned component_size,
Temp dst, Temp rsrc, Temp offset, unsigned align_mul, unsigned align_offset,
- bool glc=false, bool readonly=true, bool allow_smem=true)
+ bool glc=false, bool allow_smem=true, memory_sync_info sync=memory_sync_info())
{
Builder bld(ctx->program, ctx->block);
LoadEmitInfo info = {Operand(offset), dst, num_components, component_size, rsrc};
info.glc = glc;
- info.barrier = readonly ? barrier_none : barrier_buffer;
- info.can_reorder = readonly;
+ info.sync = sync;
info.align_mul = align_mul;
info.align_offset = align_offset;
if (use_smem)
load->unrm = true;
load->da = da;
load->dim = dim;
- load->can_reorder = true; /* fmask images shouldn't be modified */
ctx->block->instructions.emplace_back(std::move(load));
Operand sample_index4;
}
+memory_sync_info get_memory_sync_info(nir_intrinsic_instr *instr, storage_class storage, unsigned semantics)
+{
+ /* atomicrmw might not have NIR_INTRINSIC_ACCESS and there's nothing interesting there anyway */
+ if (semantics & semantic_atomicrmw)
+ return memory_sync_info(storage, semantics);
+
+ unsigned access = nir_intrinsic_access(instr);
+
+ if (access & ACCESS_VOLATILE)
+ semantics |= semantic_volatile;
+ if (access & ACCESS_CAN_REORDER)
+ semantics |= semantic_can_reorder | semantic_private;
+
+ return memory_sync_info(storage, semantics);
+}
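
A small self-contained sketch of the access-flag mapping above, with hypothetical stand-ins for the NIR access bits (the real ACCESS_VOLATILE/ACCESS_CAN_REORDER values come from NIR) and constants mirroring aco's memory_semantics: a volatile access keeps semantic_volatile, while ACCESS_CAN_REORDER yields semantic_can_reorder together with semantic_private:

#include <cassert>

enum { ACCESS_VOLATILE_BIT = 0x1, ACCESS_CAN_REORDER_BIT = 0x2 };       /* stand-ins, not NIR's values */
enum { SEM_VOLATILE = 0x4, SEM_PRIVATE = 0x8, SEM_CAN_REORDER = 0x10 }; /* mirror memory_semantics */

static unsigned access_to_semantics(unsigned access)
{
   unsigned semantics = 0;
   if (access & ACCESS_VOLATILE_BIT)
      semantics |= SEM_VOLATILE;
   if (access & ACCESS_CAN_REORDER_BIT)
      semantics |= SEM_CAN_REORDER | SEM_PRIVATE;
   return semantics;
}

int main()
{
   assert(access_to_semantics(ACCESS_VOLATILE_BIT) == SEM_VOLATILE);
   assert(access_to_semantics(ACCESS_CAN_REORDER_BIT) == (SEM_CAN_REORDER | SEM_PRIVATE));
   return 0;
}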
+
void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr)
{
Builder bld(ctx->program, ctx->block);
bool is_array = glsl_sampler_type_is_array(type);
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+ memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0);
+
if (dim == GLSL_SAMPLER_DIM_BUF) {
unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa);
unsigned num_channels = util_last_bit(mask);
load->idxen = true;
load->glc = var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT);
load->dlc = load->glc && ctx->options->chip_class >= GFX10;
- load->barrier = barrier_image;
+ load->sync = sync;
ctx->block->instructions.emplace_back(std::move(load));
expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, (1 << num_channels) - 1);
load->dmask = dmask;
load->unrm = true;
load->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
- load->barrier = barrier_image;
+ load->sync = sync;
ctx->block->instructions.emplace_back(std::move(load));
expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, dmask);
bool is_array = glsl_sampler_type_is_array(type);
Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
+ memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0);
bool glc = ctx->options->chip_class == GFX6 || var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE) ? 1 : 0;
if (dim == GLSL_SAMPLER_DIM_BUF) {
store->glc = glc;
store->dlc = false;
store->disable_wqm = true;
- store->barrier = barrier_image;
+ store->sync = sync;
ctx->program->needs_exact = true;
ctx->block->instructions.emplace_back(std::move(store));
return;
store->unrm = true;
store->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
store->disable_wqm = true;
- store->barrier = barrier_image;
+ store->sync = sync;
ctx->program->needs_exact = true;
ctx->block->instructions.emplace_back(std::move(store));
return;
}
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+ memory_sync_info sync = get_memory_sync_info(instr, storage_image, semantic_atomicrmw);
if (dim == GLSL_SAMPLER_DIM_BUF) {
Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
mubuf->glc = return_previous;
mubuf->dlc = false; /* Not needed for atomics */
mubuf->disable_wqm = true;
- mubuf->barrier = barrier_image;
+ mubuf->sync = sync;
ctx->program->needs_exact = true;
ctx->block->instructions.emplace_back(std::move(mubuf));
return;
mimg->unrm = true;
mimg->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
mimg->disable_wqm = true;
- mimg->barrier = barrier_image;
+ mimg->sync = sync;
ctx->program->needs_exact = true;
ctx->block->instructions.emplace_back(std::move(mimg));
return;
mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
mimg->dmask = (1 << instr->dest.ssa.num_components) - 1;
mimg->da = glsl_sampler_type_is_array(type);
- mimg->can_reorder = true;
Definition& def = mimg->definitions[0];
ctx->block->instructions.emplace_back(std::move(mimg));
allow_smem |= ((access & ACCESS_RESTRICT) && (access & ACCESS_NON_WRITEABLE)) || (access & ACCESS_CAN_REORDER);
load_buffer(ctx, num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),
- nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), glc, false, allow_smem);
+ nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), glc, allow_smem,
+ get_memory_sync_info(instr, storage_buffer, 0));
}
void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
+ memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0);
bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
uint32_t flags = get_all_buffer_resource_flags(ctx, instr->src[1].ssa, nir_intrinsic_access(instr));
/* GLC bypasses VMEM/SMEM caches, so GLC SMEM loads/stores are coherent with GLC VMEM loads/stores
store->glc = glc;
store->dlc = false;
store->disable_wqm = true;
- store->barrier = barrier_buffer;
+ store->sync = sync;
ctx->block->instructions.emplace_back(std::move(store));
ctx->program->wb_smem_l1_on_end = true;
if (op == aco_opcode::p_fs_buffer_store_smem) {
store->glc = glc;
store->dlc = false;
store->disable_wqm = true;
- store->barrier = barrier_buffer;
+ store->sync = sync;
ctx->program->needs_exact = true;
ctx->block->instructions.emplace_back(std::move(store));
}
mubuf->glc = return_previous;
mubuf->dlc = false; /* Not needed for atomics */
mubuf->disable_wqm = true;
- mubuf->barrier = barrier_buffer;
+ mubuf->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
ctx->program->needs_exact = true;
ctx->block->instructions.emplace_back(std::move(mubuf));
}
info.glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);
info.align_mul = nir_intrinsic_align_mul(instr);
info.align_offset = nir_intrinsic_align_offset(instr);
- info.barrier = barrier_buffer;
- info.can_reorder = false;
+ info.sync = get_memory_sync_info(instr, storage_buffer, 0);
/* VMEM stores don't update the SMEM cache and it's difficult to prove that
* it's safe to use SMEM */
bool can_use_smem = nir_intrinsic_access(instr) & ACCESS_NON_WRITEABLE;
Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
Temp addr = get_ssa_temp(ctx, instr->src[1].ssa);
+ memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0);
bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
if (ctx->options->chip_class >= GFX7)
flat->dlc = false;
flat->offset = offset;
flat->disable_wqm = true;
- flat->barrier = barrier_buffer;
+ flat->sync = sync;
ctx->program->needs_exact = true;
ctx->block->instructions.emplace_back(std::move(flat));
} else {
mubuf->offset = offsets[i];
mubuf->addr64 = addr.type() == RegType::vgpr;
mubuf->disable_wqm = true;
- mubuf->barrier = barrier_buffer;
+ mubuf->sync = sync;
ctx->program->needs_exact = true;
ctx->block->instructions.emplace_back(std::move(mubuf));
}
flat->dlc = false; /* Not needed for atomics */
flat->offset = 0;
flat->disable_wqm = true;
- flat->barrier = barrier_buffer;
+ flat->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
ctx->program->needs_exact = true;
ctx->block->instructions.emplace_back(std::move(flat));
} else {
mubuf->offset = 0;
mubuf->addr64 = addr.type() == RegType::vgpr;
mubuf->disable_wqm = true;
- mubuf->barrier = barrier_buffer;
+ mubuf->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
ctx->program->needs_exact = true;
ctx->block->instructions.emplace_back(std::move(mubuf));
}
void emit_memory_barrier(isel_context *ctx, nir_intrinsic_instr *instr) {
Builder bld(ctx->program, ctx->block);
+ storage_class all_mem = (storage_class)(storage_buffer | storage_image | storage_atomic_counter | storage_shared);
switch(instr->intrinsic) {
case nir_intrinsic_group_memory_barrier:
+ bld.barrier(aco_opcode::p_barrier,
+ memory_sync_info(all_mem, semantic_acqrel, scope_workgroup));
+ break;
case nir_intrinsic_memory_barrier:
- bld.barrier(aco_opcode::p_memory_barrier_common);
+ bld.barrier(aco_opcode::p_barrier,
+ memory_sync_info(all_mem, semantic_acqrel, scope_device));
break;
case nir_intrinsic_memory_barrier_buffer:
- bld.barrier(aco_opcode::p_memory_barrier_buffer);
- break;
case nir_intrinsic_memory_barrier_image:
- bld.barrier(aco_opcode::p_memory_barrier_image);
+ /* since NIR splits barriers, we have to unify buffer and image barriers
+ * for now so dEQP-VK.memory_model.message_passing.core11.u32.coherent.
+ * fence_fence.atomicwrite.device.payload_nonlocal.buffer.guard_nonlocal.image.comp
+ * passes
+ */
+ bld.barrier(aco_opcode::p_barrier,
+ memory_sync_info((storage_class)(storage_buffer | storage_image), semantic_acqrel, scope_device));
break;
case nir_intrinsic_memory_barrier_tcs_patch:
case nir_intrinsic_memory_barrier_shared:
- bld.barrier(aco_opcode::p_memory_barrier_shared);
+ bld.barrier(aco_opcode::p_barrier,
+ memory_sync_info(storage_shared, semantic_acqrel, scope_workgroup));
break;
default:
unreachable("Unimplemented memory barrier intrinsic");
ds->offset0 = offset;
if (return_previous)
ds->definitions[0] = Definition(get_ssa_temp(ctx, &instr->dest.ssa));
+ ds->sync = memory_sync_info(storage_shared, semantic_atomicrmw);
ctx->block->instructions.emplace_back(std::move(ds));
}
info.align_mul = nir_intrinsic_align_mul(instr);
info.align_offset = nir_intrinsic_align_offset(instr);
info.swizzle_component_size = ctx->program->chip_class <= GFX8 ? 4 : 0;
- info.can_reorder = false;
+ info.sync = memory_sync_info(storage_buffer, semantic_private);
info.soffset = ctx->program->scratch_offset;
emit_scratch_load(ctx, bld, &info);
}
for (unsigned i = 0; i < write_count; i++) {
aco_opcode op = get_buffer_store_op(false, write_datas[i].bytes());
- bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i], offsets[i], true, true);
+ Instruction *instr = bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i], offsets[i], true, true);
+ static_cast<MUBUF_instruction *>(instr)->sync = memory_sync_info(storage_buffer, semantic_private);
}
}
mtbuf->offset = const_offset;
mtbuf->glc = true;
mtbuf->slc = true;
- mtbuf->barrier = barrier_gs_data;
- mtbuf->can_reorder = true;
+ mtbuf->sync = memory_sync_info(storage_vmem_output, semantic_can_reorder);
bld.insert(std::move(mtbuf));
}
load->glc = false;
load->dlc = false;
load->disable_wqm = false;
- load->barrier = barrier_none;
- load->can_reorder = true;
ctx->block->instructions.emplace_back(std::move(load));
}
visit_get_buffer_size(ctx, instr);
break;
case nir_intrinsic_control_barrier: {
- if (ctx->program->chip_class == GFX6 && ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) {
- /* GFX6 only (thanks to a hw bug workaround):
- * The real barrier instruction isn’t needed, because an entire patch
- * always fits into a single wave.
- */
- break;
- }
-
- if (ctx->program->workgroup_size > ctx->program->wave_size)
- bld.sopp(aco_opcode::s_barrier);
-
+ bld.barrier(aco_opcode::p_barrier, memory_sync_info(0, 0, scope_invocation), scope_workgroup);
break;
}
case nir_intrinsic_memory_barrier_tcs_patch:
aco_opcode opcode =
nir_intrinsic_memory_scope(instr) == NIR_SCOPE_DEVICE ?
aco_opcode::s_memrealtime : aco_opcode::s_memtime;
- bld.smem(opcode, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), false);
+ bld.smem(opcode, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), memory_sync_info(0, semantic_volatile));
emit_split_vector(ctx, get_ssa_temp(ctx, &instr->dest.ssa), 2);
break;
}
tex->da = da;
tex->definitions[0] = Definition(tmp_dst);
tex->dim = dim;
- tex->can_reorder = true;
ctx->block->instructions.emplace_back(std::move(tex));
if (div_by_6) {
tex->da = da;
Temp size = bld.tmp(v2);
tex->definitions[0] = Definition(size);
- tex->can_reorder = true;
ctx->block->instructions.emplace_back(std::move(tex));
emit_split_vector(ctx, size, size.size());
mubuf->operands[2] = Operand((uint32_t) 0);
mubuf->definitions[0] = Definition(tmp_dst);
mubuf->idxen = true;
- mubuf->can_reorder = true;
ctx->block->instructions.emplace_back(std::move(mubuf));
expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, (1 << last_bit) - 1);
tex->unrm = true;
tex->da = da;
tex->definitions[0] = Definition(tmp_dst);
- tex->can_reorder = true;
ctx->block->instructions.emplace_back(std::move(tex));
if (instr->op == nir_texop_samples_identical) {
tex->dmask = dmask;
tex->da = da;
tex->definitions[0] = Definition(tmp_dst);
- tex->can_reorder = true;
ctx->block->instructions.emplace_back(std::move(tex));
if (tg4_integer_cube_workaround) {
create_null_export(ctx);
}
+static void create_workgroup_barrier(Builder& bld)
+{
+ bld.barrier(aco_opcode::p_barrier,
+ memory_sync_info(storage_shared, semantic_acqrel, scope_workgroup),
+ scope_workgroup);
+}
+
static void write_tcs_tess_factors(isel_context *ctx)
{
unsigned outer_comps;
Builder bld(ctx->program, ctx->block);
- bld.barrier(aco_opcode::p_memory_barrier_shared);
- if (unlikely(ctx->program->chip_class != GFX6 && ctx->program->workgroup_size > ctx->program->wave_size))
- bld.sopp(aco_opcode::s_barrier);
+ create_workgroup_barrier(bld);
Temp tcs_rel_ids = get_arg(ctx, ctx->args->ac.tcs_rel_ids);
Temp invocation_id = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), tcs_rel_ids, Operand(8u), Operand(5u));
store->glc = true;
store->dlc = false;
store->slc = true;
- store->can_reorder = true;
ctx->block->instructions.emplace_back(std::move(store));
}
}
if (ctx->stage == ngg_vertex_gs) {
/* Wait for GS threads to store primitive ID in LDS. */
- bld.barrier(aco_opcode::p_memory_barrier_shared);
- bld.sopp(aco_opcode::s_barrier);
+ create_workgroup_barrier(bld);
/* Calculate LDS address where the GS threads stored the primitive ID. */
Temp wave_id_in_tg = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
if (i) {
Builder bld(ctx.program, ctx.block);
- bld.barrier(aco_opcode::p_memory_barrier_shared);
- bld.sopp(aco_opcode::s_barrier);
+ create_workgroup_barrier(bld);
if (ctx.stage == vertex_geometry_gs || ctx.stage == tess_eval_geometry_gs) {
ctx.gs_wave_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, m0), bld.def(s1, scc), get_arg(&ctx, args->merged_wave_info), Operand((8u << 16) | 16u));
ngg_emit_nogs_output(&ctx);
} else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
Builder bld(ctx.program, ctx.block);
- bld.barrier(aco_opcode::p_memory_barrier_gs_data);
+ bld.barrier(aco_opcode::p_barrier,
+ memory_sync_info(storage_vmem_output, semantic_release, scope_device));
bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx.gs_wave_id), -1, sendmsg_gs_done(false, false, 0));
} else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
write_tcs_tess_factors(&ctx);
ctx.block->kind |= block_kind_uniform;
Builder bld(ctx.program, ctx.block);
if (ctx.program->wb_smem_l1_on_end)
- bld.smem(aco_opcode::s_dcache_wb, false);
+ bld.smem(aco_opcode::s_dcache_wb, memory_sync_info(storage_buffer, semantic_volatile));
bld.sopp(aco_opcode::s_endpgm);
cleanup_cfg(program);
mubuf->glc = true;
mubuf->slc = true;
mubuf->dlc = args->options->chip_class >= GFX10;
- mubuf->barrier = barrier_none;
- mubuf->can_reorder = true;
ctx.outputs.mask[i] |= 1 << j;
ctx.outputs.temps[i * 4u + j] = mubuf->definitions[0].getTemp();
program->next_fp_mode.round32 = fp_round_ne;
}
+memory_sync_info get_sync_info(const Instruction* instr)
+{
+ switch (instr->format) {
+ case Format::SMEM:
+ return static_cast<const SMEM_instruction*>(instr)->sync;
+ case Format::MUBUF:
+ return static_cast<const MUBUF_instruction*>(instr)->sync;
+ case Format::MIMG:
+ return static_cast<const MIMG_instruction*>(instr)->sync;
+ case Format::MTBUF:
+ return static_cast<const MTBUF_instruction*>(instr)->sync;
+ case Format::FLAT:
+ case Format::GLOBAL:
+ case Format::SCRATCH:
+ return static_cast<const FLAT_instruction*>(instr)->sync;
+ case Format::DS:
+ return static_cast<const DS_instruction*>(instr)->sync;
+ default:
+ return memory_sync_info();
+ }
+}
+
bool can_use_SDWA(chip_class chip, const aco_ptr<Instruction>& instr)
{
if (!instr->isVALU())
SDWA = 1 << 14,
};
-enum barrier_interaction : uint8_t {
- barrier_none = 0,
- barrier_buffer = 0x1,
- barrier_image = 0x2,
- barrier_atomic = 0x4,
- barrier_shared = 0x8,
- /* used for geometry shaders to ensure vertex data writes are before the
- * GS_DONE s_sendmsg. */
- barrier_gs_data = 0x10,
- /* used for geometry shaders to ensure s_sendmsg instructions are in-order. */
- barrier_gs_sendmsg = 0x20,
- /* used by barriers. created by s_barrier */
- barrier_barrier = 0x40,
- barrier_count = 7,
+enum storage_class : uint8_t {
+ storage_none = 0x0, /* no synchronization and can be reordered around aliasing stores */
+ storage_buffer = 0x1, /* SSBOs and global memory */
+ storage_atomic_counter = 0x2, /* not used for Vulkan */
+ storage_image = 0x4,
+ storage_shared = 0x8, /* or TCS output */
+ storage_vmem_output = 0x10, /* GS or TCS output stores using VMEM */
+ storage_scratch = 0x20,
+ storage_vgpr_spill = 0x40,
+ storage_count = 8,
};
+enum memory_semantics : uint8_t {
+ semantic_none = 0x0,
+ /* for loads: don't move any access after this load to before this load (even other loads)
+ * for barriers: don't move any access after the barrier to before any
+ * atomics/control_barriers/sendmsg_gs_done before the barrier */
+ semantic_acquire = 0x1,
+ /* for stores: don't move any access before this store to after this store
+ * for barriers: don't move any access before the barrier to after any
+ * atomics/control_barriers/sendmsg_gs_done after the barrier */
+ semantic_release = 0x2,
+
+ /* the rest are for load/stores/atomics only */
+ /* cannot be DCE'd or CSE'd */
+ semantic_volatile = 0x4,
+ /* does not interact with barriers and assumes this lane is the only lane
+ * accessing this memory */
+ semantic_private = 0x8,
+ /* this operation can be reordered around operations of the same storage. says nothing about barriers */
+ semantic_can_reorder = 0x10,
+ /* this is an atomic instruction (may only read or write memory) */
+ semantic_atomic = 0x20,
+ /* this instruction both reads and writes memory */
+ semantic_rmw = 0x40,
+
+ semantic_acqrel = semantic_acquire | semantic_release,
+ semantic_atomicrmw = semantic_volatile | semantic_atomic | semantic_rmw,
+};
+
+enum sync_scope : uint8_t {
+ scope_invocation = 0,
+ scope_subgroup = 1,
+ scope_workgroup = 2,
+ scope_queuefamily = 3,
+ scope_device = 4,
+};
+
+struct memory_sync_info {
+ memory_sync_info() : storage(storage_none), semantics(semantic_none), scope(scope_invocation) {}
+ memory_sync_info(int storage, int semantics=0, sync_scope scope=scope_invocation)
+ : storage((storage_class)storage), semantics((memory_semantics)semantics), scope(scope) {}
+
+ storage_class storage:8;
+ memory_semantics semantics:8;
+ sync_scope scope:8;
+
+ bool operator == (const memory_sync_info& rhs) const {
+ return storage == rhs.storage &&
+ semantics == rhs.semantics &&
+ scope == rhs.scope;
+ }
+
+ bool can_reorder() const {
+ if (semantics & semantic_acqrel)
+ return false;
+ /* Also check storage so that zero-initialized memory_sync_info can be
+ * reordered. */
+ return (!storage || (semantics & semantic_can_reorder)) && !(semantics & semantic_volatile);
+ }
+};
+static_assert(sizeof(memory_sync_info) == 3, "Unexpected padding");
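
To make can_reorder()'s intended behaviour concrete, here is a minimal standalone check; the STORAGE_*/SEM_* constants are local stand-ins mirroring the enum values above (an illustration, not the Mesa header):

#include <cassert>

enum { STORAGE_NONE = 0x0, STORAGE_BUFFER = 0x1 };
enum { SEM_NONE = 0x0, SEM_ACQUIRE = 0x1, SEM_RELEASE = 0x2,
       SEM_VOLATILE = 0x4, SEM_CAN_REORDER = 0x10,
       SEM_ACQREL = SEM_ACQUIRE | SEM_RELEASE };

static bool can_reorder(unsigned storage, unsigned semantics)
{
   if (semantics & SEM_ACQREL)
      return false;
   return (!storage || (semantics & SEM_CAN_REORDER)) && !(semantics & SEM_VOLATILE);
}

int main()
{
   assert(can_reorder(STORAGE_NONE, SEM_NONE));           /* zero-initialized sync info */
   assert(!can_reorder(STORAGE_BUFFER, SEM_NONE));        /* plain SSBO access */
   assert(can_reorder(STORAGE_BUFFER, SEM_CAN_REORDER));  /* ACCESS_CAN_REORDER load */
   assert(!can_reorder(STORAGE_BUFFER, SEM_CAN_REORDER | SEM_VOLATILE));
   assert(!can_reorder(STORAGE_NONE, SEM_ACQUIRE));       /* acquire/release always pin ordering */
   return 0;
}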
+
enum fp_round {
fp_round_ne = 0,
fp_round_pi = 1,
*
*/
struct SMEM_instruction : public Instruction {
- barrier_interaction barrier;
+ memory_sync_info sync;
bool glc : 1; /* VI+: globally coherent */
bool dlc : 1; /* NAVI: device level coherent */
bool nv : 1; /* VEGA only: Non-volatile */
- bool can_reorder : 1;
bool disable_wqm : 1;
bool prevent_overflow : 1; /* avoid overflow when combining additions */
- uint32_t padding: 18;
+ uint32_t padding: 3;
};
static_assert(sizeof(SMEM_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
*
*/
struct DS_instruction : public Instruction {
+ memory_sync_info sync;
+ bool gds;
int16_t offset0;
int8_t offset1;
- bool gds;
+ uint8_t padding;
};
-static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
+static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
/**
* Vector Memory Untyped-buffer Instructions
*
*/
struct MUBUF_instruction : public Instruction {
- uint16_t offset : 12; /* Unsigned byte offset - 12 bit */
+ memory_sync_info sync;
bool offen : 1; /* Supply an offset from VGPR (VADDR) */
bool idxen : 1; /* Supply an index from VGPR (VADDR) */
bool addr64 : 1; /* SI, CIK: Address size is 64-bit */
bool tfe : 1; /* texture fail enable */
bool lds : 1; /* Return read-data to LDS instead of VGPRs */
bool disable_wqm : 1; /* Require an exec mask without helper invocations */
- bool can_reorder : 1;
- bool swizzled:1;
- uint8_t padding : 1;
- barrier_interaction barrier;
+ uint16_t offset : 12; /* Unsigned byte offset - 12 bit */
+ bool swizzled : 1;
+ uint32_t padding1 : 18;
};
-static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
+static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
/**
* Vector Memory Typed-buffer Instructions
*
*/
struct MTBUF_instruction : public Instruction {
- uint16_t offset; /* Unsigned byte offset - 12 bit */
- barrier_interaction barrier;
+ memory_sync_info sync;
uint8_t dfmt : 4; /* Data Format of data in memory buffer */
uint8_t nfmt : 3; /* Numeric format of data in memory */
bool offen : 1; /* Supply an offset from VGPR (VADDR) */
bool slc : 1; /* system level coherent */
bool tfe : 1; /* texture fail enable */
bool disable_wqm : 1; /* Require an exec mask without helper invocations */
- bool can_reorder : 1;
- uint32_t padding : 25;
+ uint32_t padding : 10;
+ uint16_t offset; /* Unsigned byte offset - 12 bit */
};
static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
*
*/
struct MIMG_instruction : public Instruction {
+ memory_sync_info sync;
uint8_t dmask; /* Data VGPR enable mask */
uint8_t dim : 3; /* NAVI: dimensionality */
bool unrm : 1; /* Force address to be un-normalized */
bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */
bool d16 : 1; /* Convert 32-bit data to 16-bit data */
bool disable_wqm : 1; /* Require an exec mask without helper invocations */
- bool can_reorder : 1;
- uint8_t padding : 1;
- barrier_interaction barrier;
+ uint32_t padding : 18;
};
-static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
+static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
/**
* Flat/Scratch/Global Instructions
*
*/
struct FLAT_instruction : public Instruction {
- uint16_t offset; /* Vega/Navi only */
+ memory_sync_info sync;
bool slc : 1; /* system level coherent */
bool glc : 1; /* globally coherent */
bool dlc : 1; /* NAVI: device level coherent */
bool lds : 1;
bool nv : 1;
bool disable_wqm : 1; /* Require an exec mask without helper invocations */
- bool can_reorder : 1;
- uint8_t padding : 1;
- barrier_interaction barrier;
+ uint32_t padding0 : 2;
+ uint16_t offset; /* Vega/Navi only */
+ uint16_t padding1;
};
-static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
+static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
struct Export_instruction : public Instruction {
uint8_t enabled_mask;
static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
struct Pseudo_barrier_instruction : public Instruction {
+ memory_sync_info sync;
+ sync_scope exec_scope;
};
-static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
+static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
enum ReduceOp : uint16_t {
iadd8, iadd16, iadd32, iadd64,
return is_phi(instr.get());
}
-barrier_interaction get_barrier_interaction(const Instruction* instr);
+memory_sync_info get_sync_info(const Instruction* instr);
+
bool is_dead(const std::vector<uint16_t>& uses, Instruction *instr);
bool can_use_opsel(chip_class chip, aco_opcode op, int idx, bool high);
emit_gfx10_wave64_bpermute(program, instr, bld);
else
unreachable("Current hardware supports ds_bpermute, don't emit p_bpermute.");
+ break;
}
default:
break;
reduce->operands[2].physReg(), // vtmp
reduce->definitions[2].physReg(), // sitmp
reduce->operands[0], reduce->definitions[0]);
+ } else if (instr->format == Format::PSEUDO_BARRIER) {
+ Pseudo_barrier_instruction* barrier = static_cast<Pseudo_barrier_instruction*>(instr.get());
+
+ /* Anything larger than a workgroup isn't possible. Anything
+ * smaller requires no instructions and this pseudo instruction
+ * would only be included to control optimizations. */
+ bool emit_s_barrier = barrier->exec_scope == scope_workgroup &&
+ program->workgroup_size > program->wave_size;
+
+ bld.insert(std::move(instr));
+ if (emit_s_barrier)
+ bld.sopp(aco_opcode::s_barrier);
} else if (instr->opcode == aco_opcode::p_cvt_f16_f32_rtne) {
float_mode new_mode = block->fp_mode;
new_mode.round16_64 = fp_round_ne;
return [('uint32_t', 'block', '-1'),
('uint32_t', 'imm', '0')]
elif self == Format.SMEM:
- return [('bool', 'can_reorder', 'true'),
+ return [('memory_sync_info', 'sync', 'memory_sync_info()'),
('bool', 'glc', 'false'),
('bool', 'dlc', 'false'),
('bool', 'nv', 'false')]
elif self == Format.PSEUDO_REDUCTION:
return [('ReduceOp', 'op', None, 'reduce_op'),
('unsigned', 'cluster_size', '0')]
+ elif self == Format.PSEUDO_BARRIER:
+ return [('memory_sync_info', 'sync', None),
+ ('sync_scope', 'exec_scope', 'scope_invocation')]
elif self == Format.VINTRP:
return [('unsigned', 'attribute', None),
('unsigned', 'component', None)]
('bool', 'bound_ctrl', 'true')]
elif self in [Format.FLAT, Format.GLOBAL, Format.SCRATCH]:
return [('uint16_t', 'offset', 0),
- ('bool', 'can_reorder', 'true'),
+ ('memory_sync_info', 'sync', 'memory_sync_info()'),
('bool', 'glc', 'false'),
('bool', 'slc', 'false'),
('bool', 'lds', 'false'),
opcode("p_cbranch_z", format=Format.PSEUDO_BRANCH)
opcode("p_cbranch_nz", format=Format.PSEUDO_BRANCH)
-opcode("p_memory_barrier_common", format=Format.PSEUDO_BARRIER) # atomic, buffer, image and shared
-opcode("p_memory_barrier_atomic", format=Format.PSEUDO_BARRIER)
-opcode("p_memory_barrier_buffer", format=Format.PSEUDO_BARRIER)
-opcode("p_memory_barrier_image", format=Format.PSEUDO_BARRIER)
-opcode("p_memory_barrier_shared", format=Format.PSEUDO_BARRIER)
-opcode("p_memory_barrier_gs_data", format=Format.PSEUDO_BARRIER)
-opcode("p_memory_barrier_gs_sendmsg", format=Format.PSEUDO_BARRIER)
+opcode("p_barrier", format=Format.PSEUDO_BARRIER)
opcode("p_spill")
opcode("p_reload")
return aK->imm == bK->imm;
}
case Format::SMEM: {
+ if (!a->operands.empty() && a->operands[0].bytes() == 16)
+ return false;
SMEM_instruction* aS = static_cast<SMEM_instruction*>(a);
SMEM_instruction* bS = static_cast<SMEM_instruction*>(b);
/* isel shouldn't be creating situations where this assertion fails */
assert(aS->prevent_overflow == bS->prevent_overflow);
- return aS->can_reorder && bS->can_reorder &&
- aS->glc == bS->glc && aS->nv == bS->nv &&
+ return aS->sync.can_reorder() && bS->sync.can_reorder() &&
+ aS->sync == bS->sync && aS->glc == bS->glc && aS->dlc == bS->dlc &&
+ aS->nv == bS->nv && aS->disable_wqm == bS->disable_wqm &&
aS->prevent_overflow == bS->prevent_overflow;
}
case Format::VINTRP: {
case Format::MTBUF: {
MTBUF_instruction* aM = static_cast<MTBUF_instruction *>(a);
MTBUF_instruction* bM = static_cast<MTBUF_instruction *>(b);
- return aM->can_reorder && bM->can_reorder &&
- aM->barrier == bM->barrier &&
+ return aM->sync.can_reorder() && bM->sync.can_reorder() &&
+ aM->sync == bM->sync &&
aM->dfmt == bM->dfmt &&
aM->nfmt == bM->nfmt &&
aM->offset == bM->offset &&
case Format::MUBUF: {
MUBUF_instruction* aM = static_cast<MUBUF_instruction *>(a);
MUBUF_instruction* bM = static_cast<MUBUF_instruction *>(b);
- return aM->can_reorder && bM->can_reorder &&
- aM->barrier == bM->barrier &&
+ return aM->sync.can_reorder() && bM->sync.can_reorder() &&
+ aM->sync == bM->sync &&
aM->offset == bM->offset &&
aM->offen == bM->offen &&
aM->idxen == bM->idxen &&
return false;
DS_instruction* aD = static_cast<DS_instruction *>(a);
DS_instruction* bD = static_cast<DS_instruction *>(b);
- return aD->pass_flags == bD->pass_flags &&
+ return aD->sync.can_reorder() && bD->sync.can_reorder() &&
+ aD->sync == bD->sync &&
+ aD->pass_flags == bD->pass_flags &&
aD->gds == bD->gds &&
aD->offset0 == bD->offset0 &&
aD->offset1 == bD->offset1;
case Format::MIMG: {
MIMG_instruction* aM = static_cast<MIMG_instruction*>(a);
MIMG_instruction* bM = static_cast<MIMG_instruction*>(b);
- return aM->can_reorder && bM->can_reorder &&
- aM->barrier == bM->barrier &&
+ return aM->sync.can_reorder() && bM->sync.can_reorder() &&
+ aM->sync == bM->sync &&
aM->dmask == bM->dmask &&
aM->unrm == bM->unrm &&
aM->glc == bM->glc &&
new_instr->operands.back() = Operand(base);
if (!smem->definitions.empty())
new_instr->definitions[0] = smem->definitions[0];
- new_instr->can_reorder = smem->can_reorder;
- new_instr->barrier = smem->barrier;
+ new_instr->sync = smem->sync;
new_instr->glc = smem->glc;
new_instr->dlc = smem->dlc;
new_instr->nv = smem->nv;
print_physReg(definition->physReg(), definition->bytes(), output);
}
-static void print_barrier_reorder(bool can_reorder, barrier_interaction barrier, FILE *output)
+static void print_storage(storage_class storage, FILE *output)
{
- if (can_reorder)
- fprintf(output, " reorder");
+ fprintf(output, " storage:");
+ int printed = 0;
+ if (storage & storage_buffer)
+ printed += fprintf(output, "%sbuffer", printed ? "," : "");
+ if (storage & storage_atomic_counter)
+ printed += fprintf(output, "%satomic_counter", printed ? "," : "");
+ if (storage & storage_image)
+ printed += fprintf(output, "%simage", printed ? "," : "");
+ if (storage & storage_shared)
+ printed += fprintf(output, "%sshared", printed ? "," : "");
+ if (storage & storage_vmem_output)
+ printed += fprintf(output, "%svmem_output", printed ? "," : "");
+ if (storage & storage_scratch)
+ printed += fprintf(output, "%sscratch", printed ? "," : "");
+ if (storage & storage_vgpr_spill)
+ printed += fprintf(output, "%svgpr_spill", printed ? "," : "");
+}
+
+static void print_semantics(memory_semantics sem, FILE *output)
+{
+ fprintf(output, " semantics:");
+ int printed = 0;
+ if (sem & semantic_acquire)
+ printed += fprintf(output, "%sacquire", printed ? "," : "");
+ if (sem & semantic_release)
+ printed += fprintf(output, "%srelease", printed ? "," : "");
+ if (sem & semantic_volatile)
+ printed += fprintf(output, "%svolatile", printed ? "," : "");
+ if (sem & semantic_private)
+ printed += fprintf(output, "%sprivate", printed ? "," : "");
+ if (sem & semantic_can_reorder)
+ printed += fprintf(output, "%sreorder", printed ? "," : "");
+ if (sem & semantic_atomic)
+ printed += fprintf(output, "%satomic", printed ? "," : "");
+ if (sem & semantic_rmw)
+ printed += fprintf(output, "%srmw", printed ? "," : "");
+}
+
+static void print_scope(sync_scope scope, FILE *output, const char *prefix="scope")
+{
+ fprintf(output, " %s:", prefix);
+ switch (scope) {
+ case scope_invocation:
+ fprintf(output, "invocation");
+ break;
+ case scope_subgroup:
+ fprintf(output, "subgroup");
+ break;
+ case scope_workgroup:
+ fprintf(output, "workgroup");
+ break;
+ case scope_queuefamily:
+ fprintf(output, "queuefamily");
+ break;
+ case scope_device:
+ fprintf(output, "device");
+ break;
+ }
+}
- if (barrier & barrier_buffer)
- fprintf(output, " buffer");
- if (barrier & barrier_image)
- fprintf(output, " image");
- if (barrier & barrier_atomic)
- fprintf(output, " atomic");
- if (barrier & barrier_shared)
- fprintf(output, " shared");
- if (barrier & barrier_gs_data)
- fprintf(output, " gs_data");
- if (barrier & barrier_gs_sendmsg)
- fprintf(output, " gs_sendmsg");
+static void print_sync(memory_sync_info sync, FILE *output)
+{
+ print_storage(sync.storage, output);
+ print_semantics(sync.semantics, output);
+ print_scope(sync.scope, output);
}
static void print_instr_format_specific(const Instruction *instr, FILE *output)
fprintf(output, " dlc");
if (smem->nv)
fprintf(output, " nv");
- print_barrier_reorder(smem->can_reorder, smem->barrier, output);
+ print_sync(smem->sync, output);
break;
}
case Format::VINTRP: {
fprintf(output, " offset1:%u", ds->offset1);
if (ds->gds)
fprintf(output, " gds");
+ print_sync(ds->sync, output);
break;
}
case Format::MUBUF: {
fprintf(output, " lds");
if (mubuf->disable_wqm)
fprintf(output, " disable_wqm");
- print_barrier_reorder(mubuf->can_reorder, mubuf->barrier, output);
+ print_sync(mubuf->sync, output);
break;
}
case Format::MIMG: {
fprintf(output, " d16");
if (mimg->disable_wqm)
fprintf(output, " disable_wqm");
- print_barrier_reorder(mimg->can_reorder, mimg->barrier, output);
+ print_sync(mimg->sync, output);
break;
}
case Format::EXP: {
fprintf(output, " cluster_size:%u", reduce->cluster_size);
break;
}
+ case Format::PSEUDO_BARRIER: {
+ const Pseudo_barrier_instruction* barrier = static_cast<const Pseudo_barrier_instruction*>(instr);
+ print_sync(barrier->sync, output);
+ print_scope(barrier->exec_scope, output, "exec_scope");
+ break;
+ }
case Format::FLAT:
case Format::GLOBAL:
case Format::SCRATCH: {
fprintf(output, " nv");
if (flat->disable_wqm)
fprintf(output, " disable_wqm");
- print_barrier_reorder(flat->can_reorder, flat->barrier, output);
+ print_sync(flat->sync, output);
break;
}
case Format::MTBUF: {
fprintf(output, " tfe");
if (mtbuf->disable_wqm)
fprintf(output, " disable_wqm");
- print_barrier_reorder(mtbuf->can_reorder, mtbuf->barrier, output);
+ print_sync(mtbuf->sync, output);
break;
}
case Format::VOP3P: {
source_idx++;
}
-bool can_reorder(Instruction* candidate)
-{
- switch (candidate->format) {
- case Format::SMEM:
- return static_cast<SMEM_instruction*>(candidate)->can_reorder;
- case Format::MUBUF:
- return static_cast<MUBUF_instruction*>(candidate)->can_reorder;
- case Format::MIMG:
- return static_cast<MIMG_instruction*>(candidate)->can_reorder;
- case Format::MTBUF:
- return static_cast<MTBUF_instruction*>(candidate)->can_reorder;
- case Format::FLAT:
- case Format::GLOBAL:
- case Format::SCRATCH:
- return static_cast<FLAT_instruction*>(candidate)->can_reorder;
- default:
- return true;
- }
-}
-
bool is_gs_or_done_sendmsg(const Instruction *instr)
{
if (instr->opcode == aco_opcode::s_sendmsg) {
return false;
}
-barrier_interaction get_barrier_interaction(const Instruction* instr)
+memory_sync_info get_sync_info_with_hack(const Instruction* instr)
{
- switch (instr->format) {
- case Format::SMEM:
- return static_cast<const SMEM_instruction*>(instr)->barrier;
- case Format::MUBUF:
- return static_cast<const MUBUF_instruction*>(instr)->barrier;
- case Format::MIMG:
- return static_cast<const MIMG_instruction*>(instr)->barrier;
- case Format::MTBUF:
- return static_cast<const MTBUF_instruction*>(instr)->barrier;
- case Format::FLAT:
- case Format::GLOBAL:
- case Format::SCRATCH:
- return static_cast<const FLAT_instruction*>(instr)->barrier;
- case Format::DS:
- return barrier_shared;
- case Format::SOPP:
- if (is_done_sendmsg(instr))
- return (barrier_interaction)(barrier_gs_data | barrier_gs_sendmsg);
- else if (is_gs_or_done_sendmsg(instr))
- return barrier_gs_sendmsg;
- else
- return barrier_none;
- case Format::PSEUDO_BARRIER:
- return barrier_barrier;
- default:
- return barrier_none;
+ memory_sync_info sync = get_sync_info(instr);
+ if (instr->format == Format::SMEM && !instr->operands.empty() && instr->operands[0].bytes() == 16) {
+ // FIXME: currently, it doesn't seem beneficial to omit this due to how our scheduler works
+ sync.storage = (storage_class)(sync.storage | storage_buffer);
+ sync.semantics = (memory_semantics)(sync.semantics | semantic_private);
}
+ return sync;
}
-barrier_interaction parse_barrier(Instruction *instr)
-{
- if (instr->format == Format::PSEUDO_BARRIER) {
- switch (instr->opcode) {
- case aco_opcode::p_memory_barrier_atomic:
- return barrier_atomic;
- /* For now, buffer and image barriers are treated the same. this is because of
- * dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.device.payload_nonlocal.buffer.guard_nonlocal.image.comp
- * which seems to use an image load to determine if the result of a buffer load is valid. So the ordering of the two loads is important.
- * I /think/ we should probably eventually expand the meaning of a buffer barrier so that all buffer operations before it, must stay before it
- * and that both image and buffer operations after it, must stay after it. We should also do the same for image barriers.
- * Or perhaps the problem is that we don't have a combined barrier instruction for both buffers and images, but the CTS test expects us to?
- * Either way, this solution should work. */
- case aco_opcode::p_memory_barrier_buffer:
- case aco_opcode::p_memory_barrier_image:
- return (barrier_interaction)(barrier_image | barrier_buffer);
- case aco_opcode::p_memory_barrier_shared:
- return barrier_shared;
- case aco_opcode::p_memory_barrier_common:
- return (barrier_interaction)(barrier_image | barrier_buffer | barrier_shared | barrier_atomic);
- case aco_opcode::p_memory_barrier_gs_data:
- return barrier_gs_data;
- case aco_opcode::p_memory_barrier_gs_sendmsg:
- return barrier_gs_sendmsg;
- default:
- break;
- }
- } else if (instr->opcode == aco_opcode::s_barrier) {
- return (barrier_interaction)(barrier_barrier | barrier_image | barrier_buffer | barrier_shared | barrier_atomic);
- }
- return barrier_none;
-}
+struct memory_event_set {
+ bool has_control_barrier;
+
+ unsigned bar_acquire;
+ unsigned bar_release;
+ unsigned bar_classes;
+
+ unsigned access_acquire;
+ unsigned access_release;
+ unsigned access_relaxed;
+ unsigned access_atomic;
+};
struct hazard_query {
bool contains_spill;
- int barriers;
- int barrier_interaction;
- bool can_reorder_vmem;
- bool can_reorder_smem;
+ bool contains_sendmsg;
+ memory_event_set mem_events;
+ unsigned aliasing_storage; /* storage classes which are accessed (non-SMEM) */
+ unsigned aliasing_storage_smem; /* storage classes which are accessed (SMEM) */
};
void init_hazard_query(hazard_query *query) {
query->contains_spill = false;
- query->barriers = 0;
- query->barrier_interaction = 0;
- query->can_reorder_vmem = true;
- query->can_reorder_smem = true;
+ query->contains_sendmsg = false;
+ memset(&query->mem_events, 0, sizeof(query->mem_events));
+ query->aliasing_storage = 0;
+ query->aliasing_storage_smem = 0;
+}
+
+void add_memory_event(memory_event_set *set, Instruction *instr, memory_sync_info *sync)
+{
+ set->has_control_barrier |= is_done_sendmsg(instr);
+ if (instr->opcode == aco_opcode::p_barrier) {
+ Pseudo_barrier_instruction *bar = static_cast<Pseudo_barrier_instruction*>(instr);
+ if (bar->sync.semantics & semantic_acquire)
+ set->bar_acquire |= bar->sync.storage;
+ if (bar->sync.semantics & semantic_release)
+ set->bar_release |= bar->sync.storage;
+ set->bar_classes |= bar->sync.storage;
+
+ set->has_control_barrier |= bar->exec_scope > scope_invocation;
+ }
+
+ if (!sync->storage)
+ return;
+
+ if (sync->semantics & semantic_acquire)
+ set->access_acquire |= sync->storage;
+ if (sync->semantics & semantic_release)
+ set->access_release |= sync->storage;
+
+ if (!(sync->semantics & semantic_private)) {
+ if (sync->semantics & semantic_atomic)
+ set->access_atomic |= sync->storage;
+ else
+ set->access_relaxed |= sync->storage;
+ }
}
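
A standalone sketch of the semantic_private carve-out above, reduced to plain flags (event_set and add_access() are simplified stand-ins): private traffic such as scratch and VGPR spills never lands in access_relaxed/access_atomic, so it stays invisible to the barrier rules in perform_hazard_query below:

#include <cassert>

enum { STORAGE_BUFFER = 0x1, STORAGE_VGPR_SPILL = 0x40 };
enum { SEM_PRIVATE = 0x8, SEM_ATOMIC = 0x20 };

struct event_set { unsigned access_relaxed = 0, access_atomic = 0; };

static void add_access(event_set *set, unsigned storage, unsigned semantics)
{
   if (semantics & SEM_PRIVATE)
      return;                        /* private: does not interact with barriers */
   if (semantics & SEM_ATOMIC)
      set->access_atomic |= storage;
   else
      set->access_relaxed |= storage;
}

int main()
{
   event_set set;
   add_access(&set, STORAGE_VGPR_SPILL, SEM_PRIVATE); /* spill store */
   add_access(&set, STORAGE_BUFFER, 0);               /* ordinary SSBO load */
   assert(set.access_relaxed == STORAGE_BUFFER);
   assert(set.access_atomic == 0);
   return 0;
}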
void add_to_hazard_query(hazard_query *query, Instruction *instr)
{
- query->barriers |= parse_barrier(instr);
- query->barrier_interaction |= get_barrier_interaction(instr);
if (instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload)
query->contains_spill = true;
+ query->contains_sendmsg |= instr->opcode == aco_opcode::s_sendmsg;
+
+ memory_sync_info sync = get_sync_info_with_hack(instr);
- bool can_reorder_instr = can_reorder(instr);
- query->can_reorder_smem &= instr->format != Format::SMEM || can_reorder_instr;
- query->can_reorder_vmem &= !(instr->isVMEM() || instr->isFlatOrGlobal()) || can_reorder_instr;
+ add_memory_event(&query->mem_events, instr, &sync);
+
+ if (!(sync.semantics & semantic_can_reorder)) {
+ unsigned storage = sync.storage;
+ /* images and buffer/global memory can alias */ //TODO: more precisely, buffer images and buffer/global memory can alias
+ if (storage & (storage_buffer | storage_image))
+ storage |= storage_buffer | storage_image;
+ if (instr->format == Format::SMEM)
+ query->aliasing_storage_smem |= storage;
+ else
+ query->aliasing_storage |= storage;
+ }
}
enum HazardResult {
hazard_fail_unreorderable,
};
-HazardResult perform_hazard_query(hazard_query *query, Instruction *instr)
+HazardResult perform_hazard_query(hazard_query *query, Instruction *instr, bool upwards)
{
- bool can_reorder_candidate = can_reorder(instr);
-
if (instr->opcode == aco_opcode::p_exit_early_if)
return hazard_fail_exec;
for (const Definition& def : instr->definitions) {
instr->opcode == aco_opcode::s_setprio)
return hazard_fail_unreorderable;
- barrier_interaction bar = parse_barrier(instr);
- if (query->barrier_interaction && (query->barrier_interaction & bar))
+ memory_event_set instr_set;
+ memset(&instr_set, 0, sizeof(instr_set));
+ memory_sync_info sync = get_sync_info_with_hack(instr);
+ add_memory_event(&instr_set, instr, &sync);
+
+ memory_event_set *first = &instr_set;
+ memory_event_set *second = &query->mem_events;
+ if (upwards)
+ std::swap(first, second);
+
+ /* everything after barrier(acquire) happens after the atomics/control_barriers before
+ * everything after load(acquire) happens after the load
+ */
+ if ((first->has_control_barrier || first->access_atomic) && second->bar_acquire)
+ return hazard_fail_barrier;
+ if (((first->access_acquire || first->bar_acquire) && second->bar_classes) ||
+ ((first->access_acquire | first->bar_acquire) & (second->access_relaxed | second->access_atomic)))
+ return hazard_fail_barrier;
+
+ /* everything before barrier(release) happens before the atomics/control_barriers after
+ * everything before store(release) happens before the store
+ */
+ if (first->bar_release && (second->has_control_barrier || second->access_atomic))
return hazard_fail_barrier;
- if (bar && query->barriers && (query->barriers & ~bar))
+ if ((first->bar_classes && (second->bar_release || second->access_release)) ||
+ ((first->access_relaxed | first->access_atomic) & (second->bar_release | second->access_release)))
return hazard_fail_barrier;
- if (query->barriers && (query->barriers & get_barrier_interaction(instr)))
+
+ /* don't move memory barriers around other memory barriers */
+ if (first->bar_classes && second->bar_classes)
return hazard_fail_barrier;
- if (!query->can_reorder_smem && instr->format == Format::SMEM && !can_reorder_candidate)
- return hazard_fail_reorder_vmem_smem;
- if (!query->can_reorder_vmem && (instr->isVMEM() || instr->isFlatOrGlobal()) && !can_reorder_candidate)
+ /* Don't move memory loads/stores to before control barriers. This is to make
+ * memory barriers followed by control barriers work. */
+ if (first->has_control_barrier && (second->access_atomic | second->access_relaxed))
+ return hazard_fail_barrier;
+
+ /* don't move memory loads/stores past potentially aliasing loads/stores */
+ unsigned aliasing_storage = instr->format == Format::SMEM ?
+ query->aliasing_storage_smem :
+ query->aliasing_storage;
+ if ((sync.storage & aliasing_storage) && !(sync.semantics & semantic_can_reorder)) {
+ unsigned intersect = sync.storage & aliasing_storage;
+ if (intersect & storage_shared)
+ return hazard_fail_reorder_ds;
return hazard_fail_reorder_vmem_smem;
- if ((query->barrier_interaction & barrier_shared) && instr->format == Format::DS)
- return hazard_fail_reorder_ds;
- if (is_gs_or_done_sendmsg(instr) && (query->barrier_interaction & get_barrier_interaction(instr)))
- return hazard_fail_reorder_sendmsg;
+ }
if ((instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload) &&
query->contains_spill)
return hazard_fail_spill;
+ if (instr->opcode == aco_opcode::s_sendmsg && query->contains_sendmsg)
+ return hazard_fail_reorder_sendmsg;
+
return hazard_success;
}
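
The upwards flag lets one set of acquire/release rules serve both motion directions: the candidate's event set and the accumulated query set are simply swapped before the checks run. A minimal standalone sketch of that mechanism, reduced to the single atomic-versus-acquire-barrier rule (event_set and acquire_hazard() are simplified stand-ins; the real query applies several more rules):

#include <algorithm>
#include <cassert>

struct event_set { unsigned bar_acquire = 0, access_atomic = 0; };

/* one rule from above: an atomic must not end up on the far side of an acquire barrier */
static bool acquire_hazard(event_set instr, event_set query, bool upwards)
{
   event_set *first = &instr, *second = &query;
   if (upwards)
      std::swap(first, second);
   return first->access_atomic && second->bar_acquire;
}

int main()
{
   event_set atomic;  atomic.access_atomic = 0x1;  /* e.g. a buffer atomic */
   event_set barrier; barrier.bar_acquire = 0x1;   /* p_barrier with acquire over buffer */

   assert(acquire_hazard(atomic, barrier, false)); /* blocked when moving downwards */
   assert(acquire_hazard(barrier, atomic, true));  /* blocked when moving upwards */
   return 0;
}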
bool can_move_down = true;
- HazardResult haz = perform_hazard_query(&hq, candidate.get());
+ HazardResult haz = perform_hazard_query(&hq, candidate.get(), false);
if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill || haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier || haz == hazard_fail_export)
can_move_down = false;
else if (haz != hazard_success)
break;
if (found_dependency) {
- HazardResult haz = perform_hazard_query(&hq, candidate.get());
+ HazardResult haz = perform_hazard_query(&hq, candidate.get(), true);
if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill ||
haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier ||
haz == hazard_fail_export)
/* if current depends on candidate, add additional dependencies and continue */
bool can_move_down = !is_vmem || part_of_clause;
- HazardResult haz = perform_hazard_query(part_of_clause ? &clause_hq : &indep_hq, candidate.get());
+ HazardResult haz = perform_hazard_query(part_of_clause ? &clause_hq : &indep_hq, candidate.get(), false);
if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill ||
haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier ||
haz == hazard_fail_export)
/* check if candidate depends on current */
bool is_dependency = false;
if (found_dependency) {
- HazardResult haz = perform_hazard_query(&indep_hq, candidate.get());
+ HazardResult haz = perform_hazard_query(&indep_hq, candidate.get(), true);
if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill ||
haz == hazard_fail_reorder_vmem_smem || haz == hazard_fail_reorder_sendmsg ||
haz == hazard_fail_barrier || haz == hazard_fail_export)
if (candidate->isVMEM() || candidate->format == Format::SMEM || candidate->isFlatOrGlobal())
break;
- HazardResult haz = perform_hazard_query(&hq, candidate.get());
+ HazardResult haz = perform_hazard_query(&hq, candidate.get(), false);
if (haz == hazard_fail_exec || haz == hazard_fail_unreorderable)
break;
for (unsigned i = 0; i < temp.size(); i++)
split->definitions[i] = bld.def(v1);
bld.insert(split);
- for (unsigned i = 0; i < temp.size(); i++)
- bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset, split->definitions[i].getTemp(), offset + i * 4, false, true);
+ for (unsigned i = 0; i < temp.size(); i++) {
+ Instruction *instr = bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset, split->definitions[i].getTemp(), offset + i * 4, false, true);
+ static_cast<MUBUF_instruction *>(instr)->sync = memory_sync_info(storage_vgpr_spill, semantic_private);
+ }
} else {
- bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset, temp, offset, false, true);
+ Instruction *instr = bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset, temp, offset, false, true);
+ static_cast<MUBUF_instruction *>(instr)->sync = memory_sync_info(storage_vgpr_spill, semantic_private);
}
} else {
ctx.program->config->spilled_sgprs += (*it)->operands[0].size();
for (unsigned i = 0; i < def.size(); i++) {
Temp tmp = bld.tmp(v1);
vec->operands[i] = Operand(tmp);
- bld.mubuf(opcode, Definition(tmp), scratch_rsrc, Operand(v1), scratch_offset, offset + i * 4, false, true);
+ Instruction *instr = bld.mubuf(opcode, Definition(tmp), scratch_rsrc, Operand(v1), scratch_offset, offset + i * 4, false, true);
+ static_cast<MUBUF_instruction *>(instr)->sync = memory_sync_info(storage_vgpr_spill, semantic_private);
}
bld.insert(vec);
} else {
- bld.mubuf(opcode, def, scratch_rsrc, Operand(v1), scratch_offset, offset, false, true);
+ Instruction *instr = bld.mubuf(opcode, def, scratch_rsrc, Operand(v1), scratch_offset, offset, false, true);
+ static_cast<MUBUF_instruction *>(instr)->sync = memory_sync_info(storage_vgpr_spill, semantic_private);
}
} else {
uint32_t spill_slot = slots[spill_id];