From 1749953ea3eb2f3e33a61243cc11860795c658f3 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Timur=20Krist=C3=B3f?= Date: Mon, 14 Oct 2019 15:18:31 +0200 Subject: [PATCH] aco/gfx10: Wait for pending SMEM stores before loads MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Currently if you have an SMEM store followed by an SMEM load that loads the same location as was written, it won't work because the store isn't finished before the load is executed. This is NOT mitigated by an s_nop instruction on GFX10. Since we currently don't have proper alias analysis, this commit adds a workaround which will insert an s_waitcnt lgkmcnt(0) before each SSBO load if they follow a store. We should further refine this in the future when we can make sure to only add the wait when we load the same thing as has been stored. Signed-off-by: Timur Kristóf Reviewed-by: Daniel Schürmann --- src/amd/compiler/README | 9 +++++++++ src/amd/compiler/aco_insert_waitcnt.cpp | 25 ++++++++++++++++++++++++- 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/src/amd/compiler/README b/src/amd/compiler/README index cd5861b1f9a..16e7daee3e4 100644 --- a/src/amd/compiler/README +++ b/src/amd/compiler/README @@ -131,3 +131,12 @@ finish and then write to vcc (for example, `s_mov_b64 vcc, vcc`) to correct vccz Currently, we don't do this. +## RDNA / GFX10 hazards + +### SMEM store followed by a load with the same address + +We found that an `s_buffer_load` will produce incorrect results if it is preceded +by an `s_buffer_store` with the same address. Inserting an `s_nop` between them +does not mitigate the issue, so an `s_waitcnt lgkmcnt(0)` must be inserted. +This is not mentioned by LLVM among the other GFX10 bugs, but LLVM doesn't use +SMEM stores, so it's not surprising that they didn't notice it. diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp index 76fa54f2c17..af609132dee 100644 --- a/src/amd/compiler/aco_insert_waitcnt.cpp +++ b/src/amd/compiler/aco_insert_waitcnt.cpp @@ -221,6 +221,7 @@ struct wait_ctx { uint8_t vs_cnt = 0; bool pending_flat_lgkm = false; bool pending_flat_vm = false; + bool pending_s_buffer_store = false; /* GFX10 workaround */ wait_imm barrier_imm[barrier_count]; @@ -244,6 +245,7 @@ struct wait_ctx { vs_cnt = std::max(vs_cnt, other->vs_cnt); pending_flat_lgkm |= other->pending_flat_lgkm; pending_flat_vm |= other->pending_flat_vm; + pending_s_buffer_store |= other->pending_s_buffer_store; for (std::pair entry : other->gpr_map) { @@ -329,6 +331,19 @@ wait_imm kill(Instruction* instr, wait_ctx& ctx) */ if (ctx.lgkm_cnt && instr->opcode == aco_opcode::s_dcache_wb) imm.lgkm = 0; + + /* GFX10: A store followed by a load at the same address causes a problem because + * the load doesn't load the correct values unless we wait for the store first. + * This is NOT mitigated by an s_nop. + * + * TODO: Refine this when we have proper alias analysis. + */ + SMEM_instruction *smem = static_cast(instr); + if (ctx.pending_s_buffer_store && + !smem->definitions.empty() && + !smem->can_reorder && smem->barrier == barrier_buffer) { + imm.lgkm = 0; + } } if (instr->format == Format::PSEUDO_BARRIER) { @@ -407,8 +422,10 @@ wait_imm kill(Instruction* instr, wait_ctx& ctx) if (imm.vm == 0) ctx.pending_flat_vm = false; - if (imm.lgkm == 0) + if (imm.lgkm == 0) { ctx.pending_flat_lgkm = false; + ctx.pending_s_buffer_store = false; + } return imm; } @@ -576,10 +593,16 @@ void gen(Instruction* instr, wait_ctx& ctx) break; } case Format::SMEM: { + SMEM_instruction *smem = static_cast(instr); update_counters(ctx, event_smem, static_cast(instr)->barrier); if (!instr->definitions.empty()) insert_wait_entry(ctx, instr->definitions[0], event_smem); + else if (ctx.chip_class >= GFX10 && + !smem->can_reorder && + smem->barrier == barrier_buffer) + ctx.pending_s_buffer_store = true; + break; } case Format::DS: { -- 2.30.2