aco/gfx10: Wait for pending SMEM stores before loads

author Timur Kristóf <timur.kristof@gmail.com>

Mon, 14 Oct 2019 13:18:31 +0000 (15:18 +0200)

committer Rhys Perry <pendingchaos02@gmail.com>

Mon, 21 Oct 2019 14:33:54 +0000 (14:33 +0000)
author Timur Kristóf <timur.kristof@gmail.com>
Mon, 14 Oct 2019 13:18:31 +0000 (15:18 +0200)
committer Rhys Perry <pendingchaos02@gmail.com>
Mon, 21 Oct 2019 14:33:54 +0000 (14:33 +0000)
diff --git a/src/amd/compiler/README b/src/amd/compiler/README

index cd5861b1f9a3cd4e9a31cb9dbab6362dd1514012..16e7daee3e43fb72cf2c3877c07c6f2c9a2eb8bd 100644 (file)
--- a/src/amd/compiler/README
+++ b/src/amd/compiler/README
@@ -131,3 +131,12 @@ finish and then write to vcc (for example, `s_mov_b64 vcc, vcc`) to correct vccz
  
  Currently, we don't do this.
  
+## RDNA / GFX10 hazards
+
+### SMEM store followed by a load with the same address
+
+We found that an `s_buffer_load` will produce incorrect results if it is preceded
+by an `s_buffer_store` with the same address. Inserting an `s_nop` between them
+does not mitigate the issue, so an `s_waitcnt lgkmcnt(0)` must be inserted.
+This is not mentioned by LLVM among the other GFX10 bugs, but LLVM doesn't use
+SMEM stores, so it's not surprising that they didn't notice it.
diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp

index 76fa54f2c173b2075238d91ca8ef325560629429..af609132deed64473516875f947420a7c0532240 100644 (file)
--- a/src/amd/compiler/aco_insert_waitcnt.cpp
+++ b/src/amd/compiler/aco_insert_waitcnt.cpp
@@ -221,6 +221,7 @@ struct wait_ctx {
     uint8_t vs_cnt = 0;
     bool pending_flat_lgkm = false;
     bool pending_flat_vm = false;
+   bool pending_s_buffer_store = false; /* GFX10 workaround */
  
     wait_imm barrier_imm[barrier_count];
  
@@ -244,6 +245,7 @@ struct wait_ctx {
        vs_cnt = std::max(vs_cnt, other->vs_cnt);
        pending_flat_lgkm |= other->pending_flat_lgkm;
        pending_flat_vm |= other->pending_flat_vm;
+      pending_s_buffer_store |= other->pending_s_buffer_store;
  
        for (std::pair<PhysReg,wait_entry> entry : other->gpr_map)
        {
@@ -329,6 +331,19 @@ wait_imm kill(Instruction* instr, wait_ctx& ctx)
         */
        if (ctx.lgkm_cnt && instr->opcode == aco_opcode::s_dcache_wb)
           imm.lgkm = 0;
+
+      /* GFX10: A store followed by a load at the same address causes a problem because
+       * the load doesn't load the correct values unless we wait for the store first.
+       * This is NOT mitigated by an s_nop.
+       *
+       * TODO: Refine this when we have proper alias analysis.
+       */
+      SMEM_instruction *smem = static_cast<SMEM_instruction *>(instr);
+      if (ctx.pending_s_buffer_store &&
+          !smem->definitions.empty() &&
+          !smem->can_reorder && smem->barrier == barrier_buffer) {
+         imm.lgkm = 0;
+      }
     }
  
     if (instr->format == Format::PSEUDO_BARRIER) {
@@ -407,8 +422,10 @@ wait_imm kill(Instruction* instr, wait_ctx& ctx)
  
     if (imm.vm == 0)
        ctx.pending_flat_vm = false;
-   if (imm.lgkm == 0)
+   if (imm.lgkm == 0) {
        ctx.pending_flat_lgkm = false;
+      ctx.pending_s_buffer_store = false;
+   }
  
     return imm;
  }
@@ -576,10 +593,16 @@ void gen(Instruction* instr, wait_ctx& ctx)
        break;
     }
     case Format::SMEM: {
+      SMEM_instruction *smem = static_cast<SMEM_instruction*>(instr);
        update_counters(ctx, event_smem, static_cast<SMEM_instruction*>(instr)->barrier);
  
        if (!instr->definitions.empty())
           insert_wait_entry(ctx, instr->definitions[0], event_smem);
+      else if (ctx.chip_class >= GFX10 &&
+               !smem->can_reorder &&
+               smem->barrier == barrier_buffer)
+         ctx.pending_s_buffer_store = true;
+
        break;
     }
     case Format::DS: {
author	Timur Kristóf <timur.kristof@gmail.com>
	Mon, 14 Oct 2019 13:18:31 +0000 (15:18 +0200)
committer	Rhys Perry <pendingchaos02@gmail.com>
	Mon, 21 Oct 2019 14:33:54 +0000 (14:33 +0000)
src/amd/compiler/README		patch \| blob \| history
src/amd/compiler/aco_insert_waitcnt.cpp		patch \| blob \| history