From 1749953ea3eb2f3e33a61243cc11860795c658f3 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Timur=20Krist=C3=B3f?= <timur.kristof@gmail.com>
Date: Mon, 14 Oct 2019 15:18:31 +0200
Subject: [PATCH] aco/gfx10: Wait for pending SMEM stores before loads
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Currently if you have an SMEM store followed by an SMEM load that
loads the same location as was written, it won't work because the
store isn't finished before the load is executed. This is NOT
mitigated by an s_nop instruction on GFX10.

Since we currently don't have proper alias analysis, this commit adds
a workaround which will insert an s_waitcnt lgkmcnt(0) before each
SSBO load if they follow a store. We should further refine this in
the future when we can make sure to only add the wait when we load the
same thing as has been stored.

Signed-off-by: Timur KristÃ³f <timur.kristof@gmail.com>
Reviewed-by: Daniel SchÃ¼rmann <daniel@schuermann.dev>
---
 src/amd/compiler/README                 |  9 +++++++++
 src/amd/compiler/aco_insert_waitcnt.cpp | 25 ++++++++++++++++++++++++-
 2 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/src/amd/compiler/README b/src/amd/compiler/README
index cd5861b1f9a..16e7daee3e4 100644
--- a/src/amd/compiler/README
+++ b/src/amd/compiler/README
@@ -131,3 +131,12 @@ finish and then write to vcc (for example, `s_mov_b64 vcc, vcc`) to correct vccz
 
 Currently, we don't do this.
 
+## RDNA / GFX10 hazards
+
+### SMEM store followed by a load with the same address
+
+We found that an `s_buffer_load` will produce incorrect results if it is preceded
+by an `s_buffer_store` with the same address. Inserting an `s_nop` between them
+does not mitigate the issue, so an `s_waitcnt lgkmcnt(0)` must be inserted.
+This is not mentioned by LLVM among the other GFX10 bugs, but LLVM doesn't use
+SMEM stores, so it's not surprising that they didn't notice it.
diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp
index 76fa54f2c17..af609132dee 100644
--- a/src/amd/compiler/aco_insert_waitcnt.cpp
+++ b/src/amd/compiler/aco_insert_waitcnt.cpp
@@ -221,6 +221,7 @@ struct wait_ctx {
    uint8_t vs_cnt = 0;
    bool pending_flat_lgkm = false;
    bool pending_flat_vm = false;
+   bool pending_s_buffer_store = false; /* GFX10 workaround */
 
    wait_imm barrier_imm[barrier_count];
 
@@ -244,6 +245,7 @@ struct wait_ctx {
       vs_cnt = std::max(vs_cnt, other->vs_cnt);
       pending_flat_lgkm |= other->pending_flat_lgkm;
       pending_flat_vm |= other->pending_flat_vm;
+      pending_s_buffer_store |= other->pending_s_buffer_store;
 
       for (std::pair<PhysReg,wait_entry> entry : other->gpr_map)
       {
@@ -329,6 +331,19 @@ wait_imm kill(Instruction* instr, wait_ctx& ctx)
        */
       if (ctx.lgkm_cnt && instr->opcode == aco_opcode::s_dcache_wb)
          imm.lgkm = 0;
+
+      /* GFX10: A store followed by a load at the same address causes a problem because
+       * the load doesn't load the correct values unless we wait for the store first.
+       * This is NOT mitigated by an s_nop.
+       *
+       * TODO: Refine this when we have proper alias analysis.
+       */
+      SMEM_instruction *smem = static_cast<SMEM_instruction *>(instr);
+      if (ctx.pending_s_buffer_store &&
+          !smem->definitions.empty() &&
+          !smem->can_reorder && smem->barrier == barrier_buffer) {
+         imm.lgkm = 0;
+      }
    }
 
    if (instr->format == Format::PSEUDO_BARRIER) {
@@ -407,8 +422,10 @@ wait_imm kill(Instruction* instr, wait_ctx& ctx)
 
    if (imm.vm == 0)
       ctx.pending_flat_vm = false;
-   if (imm.lgkm == 0)
+   if (imm.lgkm == 0) {
       ctx.pending_flat_lgkm = false;
+      ctx.pending_s_buffer_store = false;
+   }
 
    return imm;
 }
@@ -576,10 +593,16 @@ void gen(Instruction* instr, wait_ctx& ctx)
       break;
    }
    case Format::SMEM: {
+      SMEM_instruction *smem = static_cast<SMEM_instruction*>(instr);
       update_counters(ctx, event_smem, static_cast<SMEM_instruction*>(instr)->barrier);
 
       if (!instr->definitions.empty())
          insert_wait_entry(ctx, instr->definitions[0], event_smem);
+      else if (ctx.chip_class >= GFX10 &&
+               !smem->can_reorder &&
+               smem->barrier == barrier_buffer)
+         ctx.pending_s_buffer_store = true;
+
       break;
    }
    case Format::DS: {
-- 
2.30.2