intel/fs: Do a stalling MFENCE in endInvocationInterlock()
authorJason Ekstrand <jason@jlekstrand.net>
Wed, 22 May 2019 17:36:17 +0000 (12:36 -0500)
committerJason Ekstrand <jason@jlekstrand.net>
Thu, 30 May 2019 14:00:26 +0000 (14:00 +0000)
Fixes: 939312702e "i965: Add ARB_fragment_shader_interlock support"
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
src/intel/compiler/brw_eu.h
src/intel/compiler/brw_eu_emit.c
src/intel/compiler/brw_fs_generator.cpp
src/intel/compiler/brw_fs_nir.cpp
src/intel/compiler/brw_vec4_generator.cpp

index 8ef953d5aa472e017f2abab13f64a711e6f1d1e7..29965e60a7f1646a32555ad7d892761093a2f665 100644 (file)
@@ -1114,7 +1114,8 @@ void
 brw_memory_fence(struct brw_codegen *p,
                  struct brw_reg dst,
                  struct brw_reg src,
-                 enum opcode send_op);
+                 enum opcode send_op,
+                 bool stall);
 
 void
 brw_pixel_interpolator_query(struct brw_codegen *p,
index 181932705f3b15e8f4aec470833c5d756f661ada..7b8783ee3d168226c1d99af099f0cf1b6acb15a6 100644 (file)
@@ -3038,10 +3038,11 @@ void
 brw_memory_fence(struct brw_codegen *p,
                  struct brw_reg dst,
                  struct brw_reg src,
-                 enum opcode send_op)
+                 enum opcode send_op,
+                 bool stall)
 {
    const struct gen_device_info *devinfo = p->devinfo;
-   const bool commit_enable =
+   const bool commit_enable = stall ||
       devinfo->gen >= 10 || /* HSD ES # 1404612949 */
       (devinfo->gen == 7 && !devinfo->is_haswell);
    struct brw_inst *insn;
@@ -3080,6 +3081,9 @@ brw_memory_fence(struct brw_codegen *p,
       brw_MOV(p, dst, offset(dst, 1));
    }
 
+   if (stall)
+      brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW), dst);
+
    brw_pop_insn_state(p);
 }
 
index 1149e98ecd6257e3bd9ab962e976a74c343c2e90..f91c857678a3408e9cc67f07de14a869f25a6cfb 100644 (file)
@@ -2071,13 +2071,14 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
          break;
 
       case SHADER_OPCODE_MEMORY_FENCE:
-         brw_memory_fence(p, dst, src[0], BRW_OPCODE_SEND);
+         assert(src[1].file == BRW_IMMEDIATE_VALUE);
+         brw_memory_fence(p, dst, src[0], BRW_OPCODE_SEND, src[1].ud);
          break;
 
       case SHADER_OPCODE_INTERLOCK:
          assert(devinfo->gen >= 9);
          /* The interlock is basically a memory fence issued via sendc */
-         brw_memory_fence(p, dst, src[0], BRW_OPCODE_SENDC);
+         brw_memory_fence(p, dst, src[0], BRW_OPCODE_SENDC, false);
          break;
 
       case SHADER_OPCODE_FIND_LIVE_CHANNEL: {
index 6856eca687aac153eb9b473a640f91245b3a2b8f..77b131272ca0869696696809a46c8d3a6e09f3d7 100644 (file)
@@ -4273,7 +4273,8 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
    case nir_intrinsic_memory_barrier: {
       const fs_builder ubld = bld.group(8, 0);
       const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
-      ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp, brw_vec8_grf(0, 0))
+      ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp,
+                brw_vec8_grf(0, 0), brw_imm_ud(0))
          ->size_written = 2 * REG_SIZE;
       break;
    }
@@ -5080,7 +5081,20 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
    }
 
    case nir_intrinsic_end_invocation_interlock: {
-      /* We don't need to do anything here */
+      /* For endInvocationInterlock(), we need to insert a memory fence which
+       * stalls in the shader until the memory transactions prior to that
+       * fence are complete.  This ensures that the shader does not end before
+       * any writes from its critical section have landed.  Otherwise, you can
+       * end up with a case where the next invocation on that pixel properly
+       * stalls for previous FS invocation on its pixel to complete but
+       * doesn't actually wait for the dataport memory transactions from that
+       * thread to land before submitting its own.
+       */
+      const fs_builder ubld = bld.group(8, 0);
+      const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+      ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp,
+                brw_vec8_grf(0, 0), brw_imm_ud(1))
+         ->size_written = 2 * REG_SIZE;
       break;
    }
 
index 38181bf14697fd69342850150f03e6f596627f56..8f9e4f16677dc2070fc34fb624b0a55acebe3a69 100644 (file)
@@ -1886,7 +1886,7 @@ generate_code(struct brw_codegen *p,
          break;
 
       case SHADER_OPCODE_MEMORY_FENCE:
-         brw_memory_fence(p, dst, src[0], BRW_OPCODE_SEND);
+         brw_memory_fence(p, dst, src[0], BRW_OPCODE_SEND, false);
          break;
 
       case SHADER_OPCODE_FIND_LIVE_CHANNEL: {