From 9e403dc56e3ab702abc68fd65ed4ab324ba69e69 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand
Date: Wed, 22 May 2019 12:36:17 -0500
Subject: [PATCH] intel/fs: Do a stalling MFENCE in endInvocationInterlock()

Fixes: 939312702e "i965: Add ARB_fragment_shader_interlock support"
Reviewed-by: Kenneth Graunke
---
 src/intel/compiler/brw_eu.h               |  3 ++-
 src/intel/compiler/brw_eu_emit.c          |  8 ++++++--
 src/intel/compiler/brw_fs_generator.cpp   |  5 +++--
 src/intel/compiler/brw_fs_nir.cpp         | 18 ++++++++++++++++--
 src/intel/compiler/brw_vec4_generator.cpp |  2 +-
 5 files changed, 28 insertions(+), 8 deletions(-)

diff --git a/src/intel/compiler/brw_eu.h b/src/intel/compiler/brw_eu.h
index 8ef953d5aa4..29965e60a7f 100644
--- a/src/intel/compiler/brw_eu.h
+++ b/src/intel/compiler/brw_eu.h
@@ -1114,7 +1114,8 @@ void
 brw_memory_fence(struct brw_codegen *p,
                  struct brw_reg dst,
                  struct brw_reg src,
-                 enum opcode send_op);
+                 enum opcode send_op,
+                 bool stall);
 
 void
 brw_pixel_interpolator_query(struct brw_codegen *p,
diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c
index 181932705f3..7b8783ee3d1 100644
--- a/src/intel/compiler/brw_eu_emit.c
+++ b/src/intel/compiler/brw_eu_emit.c
@@ -3038,10 +3038,11 @@ void
 brw_memory_fence(struct brw_codegen *p,
                  struct brw_reg dst,
                  struct brw_reg src,
-                 enum opcode send_op)
+                 enum opcode send_op,
+                 bool stall)
 {
    const struct gen_device_info *devinfo = p->devinfo;
-   const bool commit_enable =
+   const bool commit_enable = stall ||
       devinfo->gen >= 10 || /* HSD ES # 1404612949 */
       (devinfo->gen == 7 && !devinfo->is_haswell);
    struct brw_inst *insn;
@@ -3080,6 +3081,9 @@ brw_memory_fence(struct brw_codegen *p,
       brw_MOV(p, dst, offset(dst, 1));
    }
 
+   if (stall)
+      brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW), dst);
+
    brw_pop_insn_state(p);
 }
 
diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp
index 1149e98ecd6..f91c857678a 100644
--- a/src/intel/compiler/brw_fs_generator.cpp
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -2071,13 +2071,14 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
          break;
 
       case SHADER_OPCODE_MEMORY_FENCE:
-         brw_memory_fence(p, dst, src[0], BRW_OPCODE_SEND);
+         assert(src[1].file == BRW_IMMEDIATE_VALUE);
+         brw_memory_fence(p, dst, src[0], BRW_OPCODE_SEND, src[1].ud);
          break;
 
       case SHADER_OPCODE_INTERLOCK:
          assert(devinfo->gen >= 9);
          /* The interlock is basically a memory fence issued via sendc */
-         brw_memory_fence(p, dst, src[0], BRW_OPCODE_SENDC);
+         brw_memory_fence(p, dst, src[0], BRW_OPCODE_SENDC, false);
          break;
 
       case SHADER_OPCODE_FIND_LIVE_CHANNEL: {
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 6856eca687a..77b131272ca 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -4273,7 +4273,8 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
    case nir_intrinsic_memory_barrier: {
       const fs_builder ubld = bld.group(8, 0);
       const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
-      ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp, brw_vec8_grf(0, 0))
+      ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp,
+                brw_vec8_grf(0, 0), brw_imm_ud(0))
          ->size_written = 2 * REG_SIZE;
       break;
    }
@@ -5080,7 +5081,20 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
    }
 
    case nir_intrinsic_end_invocation_interlock: {
-      /* We don't need to do anything here */
+      /* For endInvocationInterlock(), we need to insert a memory fence which
+       * stalls in the shader until the memory transactions prior to that
+       * fence are complete. This ensures that the shader does not end before
+       * any writes from its critical section have landed. Otherwise, you can
+       * end up with a case where the next invocation on that pixel properly
+       * stalls for previous FS invocation on its pixel to complete but
+       * doesn't actually wait for the dataport memory transactions from that
+       * thread to land before submitting its own.
+       */
+      const fs_builder ubld = bld.group(8, 0);
+      const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+      ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp,
+                brw_vec8_grf(0, 0), brw_imm_ud(1))
+         ->size_written = 2 * REG_SIZE;
       break;
    }
 
diff --git a/src/intel/compiler/brw_vec4_generator.cpp b/src/intel/compiler/brw_vec4_generator.cpp
index 38181bf1469..8f9e4f16677 100644
--- a/src/intel/compiler/brw_vec4_generator.cpp
+++ b/src/intel/compiler/brw_vec4_generator.cpp
@@ -1886,7 +1886,7 @@ generate_code(struct brw_codegen *p,
          break;
 
       case SHADER_OPCODE_MEMORY_FENCE:
-         brw_memory_fence(p, dst, src[0], BRW_OPCODE_SEND);
+         brw_memory_fence(p, dst, src[0], BRW_OPCODE_SEND, false);
          break;
 
       case SHADER_OPCODE_FIND_LIVE_CHANNEL: {
-- 
2.30.2