intel/fs,vec4: Pull stall logic for memory fences up into the IR

author Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com>

Fri, 17 Jan 2020 23:07:44 +0000 (15:07 -0800)

committer Marge Bot <eric+marge@anholt.net>

Wed, 29 Apr 2020 07:17:27 +0000 (07:17 +0000)
author Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com>
Fri, 17 Jan 2020 23:07:44 +0000 (15:07 -0800)
committer Marge Bot <eric+marge@anholt.net>
Wed, 29 Apr 2020 07:17:27 +0000 (07:17 +0000)
diff --git a/src/intel/compiler/brw_eu.h b/src/intel/compiler/brw_eu.h

index 96c22ab429a3e5b0d33b5a9a97fee302a066b68c..7ae17dbdd37c0210e640d2440d2c22718a3af4d0 100644 (file)
--- a/src/intel/compiler/brw_eu.h
+++ b/src/intel/compiler/brw_eu.h
@@ -1148,12 +1148,13 @@ brw_untyped_surface_write(struct brw_codegen *p,
                            unsigned num_channels,
                            bool header_present);
  
-unsigned
+void
  brw_memory_fence(struct brw_codegen *p,
                   struct brw_reg dst,
                   struct brw_reg src,
                   enum opcode send_op,
-                 bool stall,
+                 enum brw_message_target sfid,
+                 bool commit_enable,
                   unsigned bti);
  
  void
diff --git a/src/intel/compiler/brw_eu_defines.h b/src/intel/compiler/brw_eu_defines.h

index 146d3b3f0672b8c2429589387de2c0527780646d..742e12c2830d64699314bfec7dc6d3ea6c227b7b 100644 (file)
--- a/src/intel/compiler/brw_eu_defines.h
+++ b/src/intel/compiler/brw_eu_defines.h
@@ -454,8 +454,8 @@ enum opcode {
      * Memory fence messages.
      *
      * Source 0: Must be register g0, used as header.
-    * Source 1: Immediate bool to indicate whether or not we need to stall
-    *           until memory transactions prior to the fence are completed.
+    * Source 1: Immediate bool to indicate whether control is returned to the
+    *           thread only after the fence has been honored.
      * Source 2: Immediate byte indicating which memory to fence.  Zero means
      *           global memory; GEN7_BTI_SLM means SLM (for Gen11+ only).
      *
diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c

index 8786903dadac405337f6d7d65f883ed33b98ff61..12042131999b18039a4d071e11c69919fb46a5d0 100644 (file)
--- a/src/intel/compiler/brw_eu_emit.c
+++ b/src/intel/compiler/brw_eu_emit.c
@@ -3145,68 +3145,29 @@ brw_set_memory_fence_message(struct brw_codegen *p,
     brw_inst_set_binding_table_index(devinfo, insn, bti);
  }
  
-unsigned
+void
  brw_memory_fence(struct brw_codegen *p,
                   struct brw_reg dst,
                   struct brw_reg src,
                   enum opcode send_op,
-                 bool stall,
+                 enum brw_message_target sfid,
+                 bool commit_enable,
                   unsigned bti)
  {
     const struct gen_device_info *devinfo = p->devinfo;
-   const bool commit_enable = stall ||
-      devinfo->gen >= 10 || /* HSD ES # 1404612949 */
-      (devinfo->gen == 7 && !devinfo->is_haswell);
-   struct brw_inst *insn;
  
-   unsigned fences = 0;
-
-   brw_push_insn_state(p);
-   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
-   brw_set_default_exec_size(p, BRW_EXECUTE_1);
     dst = retype(vec1(dst), BRW_REGISTER_TYPE_UW);
     src = retype(vec1(src), BRW_REGISTER_TYPE_UD);
  
     /* Set dst as destination for dependency tracking, the MEMORY_FENCE
      * message doesn't write anything back.
      */
-   insn = next_insn(p, send_op);
+   struct brw_inst *insn = next_insn(p, send_op);
+   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
+   brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
     brw_set_dest(p, insn, dst);
     brw_set_src0(p, insn, src);
-   brw_set_memory_fence_message(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
-                                commit_enable, bti);
-   fences++;
-
-   if (devinfo->gen == 7 && !devinfo->is_haswell) {
-      /* IVB does typed surface access through the render cache, so we need to
-       * flush it too.  Use a different register so both flushes can be
-       * pipelined by the hardware.
-       */
-      insn = next_insn(p, send_op);
-      brw_set_dest(p, insn, offset(dst, 1));
-      brw_set_src0(p, insn, src);
-      brw_set_memory_fence_message(p, insn, GEN6_SFID_DATAPORT_RENDER_CACHE,
-                                   commit_enable, bti);
-      fences++;
-
-      /* Now write the response of the second message into the response of the
-       * first to trigger a pipeline stall -- This way future render and data
-       * cache messages will be properly ordered with respect to past data and
-       * render cache messages.
-       */
-      brw_MOV(p, dst, offset(dst, 1));
-   }
-
-   if (stall) {
-      brw_set_default_swsb(p, tgl_swsb_sbid(TGL_SBID_DST,
-                                            brw_get_default_swsb(p).sbid));
-
-      brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW), dst);
-   }
-
-   brw_pop_insn_state(p);
-
-   return fences;
+   brw_set_memory_fence_message(p, insn, sfid, commit_enable, bti);
  }
  
  void
diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp

index b055110ddf85bb514e530b6efaa10a37a6b212f4..5825e0770d41b68b53a56df8acffa6eebdb310aa 100644 (file)
--- a/src/intel/compiler/brw_fs_generator.cpp
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -2217,13 +2217,19 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width,
           generate_shader_time_add(inst, src[0], src[1], src[2]);
           break;
  
+      case SHADER_OPCODE_INTERLOCK:
        case SHADER_OPCODE_MEMORY_FENCE: {
           assert(src[1].file == BRW_IMMEDIATE_VALUE);
           assert(src[2].file == BRW_IMMEDIATE_VALUE);
-         const unsigned sends =
-            brw_memory_fence(p, dst, src[0], BRW_OPCODE_SEND, src[1].ud,
-                             src[2].ud);
-         send_count += sends;
+
+         const enum opcode send_op = inst->opcode == SHADER_OPCODE_INTERLOCK ?
+            BRW_OPCODE_SENDC : BRW_OPCODE_SEND;
+
+         brw_memory_fence(p, dst, src[0], send_op,
+                          brw_message_target(inst->sfid),
+                          /* commit_enable */ src[1].ud,
+                          /* bti */ src[2].ud);
+         send_count++;
           break;
        }
  
@@ -2257,12 +2263,6 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width,
  
           break;
  
-      case SHADER_OPCODE_INTERLOCK:
-         assert(devinfo->gen >= 9);
-         /* The interlock is basically a memory fence issued via sendc */
-         brw_memory_fence(p, dst, src[0], BRW_OPCODE_SENDC, false, /* bti */ 0);
-         break;
-
        case SHADER_OPCODE_FIND_LIVE_CHANNEL: {
           const struct brw_reg mask =
              brw_stage_has_packed_dispatch(devinfo, stage,
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp

index f7d94e618b42532484fdd76bfba985f0505f956a..6f415d5de827f14d0910de27b43056419d402dcd 100644 (file)
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -4232,19 +4232,52 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
     case nir_intrinsic_memory_barrier_shared:
     case nir_intrinsic_memory_barrier_buffer:
     case nir_intrinsic_memory_barrier_image:
-   case nir_intrinsic_memory_barrier: {
+   case nir_intrinsic_memory_barrier:
+   case nir_intrinsic_begin_invocation_interlock:
+   case nir_intrinsic_end_invocation_interlock: {
        bool l3_fence, slm_fence;
-      if (instr->intrinsic == nir_intrinsic_scoped_memory_barrier) {
+      const enum opcode opcode =
+         instr->intrinsic == nir_intrinsic_begin_invocation_interlock ?
+         SHADER_OPCODE_INTERLOCK : SHADER_OPCODE_MEMORY_FENCE;
+
+      switch (instr->intrinsic) {
+      case nir_intrinsic_scoped_memory_barrier: {
           nir_variable_mode modes = nir_intrinsic_memory_modes(instr);
           l3_fence = modes & (nir_var_shader_out |
                               nir_var_mem_ssbo |
                               nir_var_mem_global);
           slm_fence = modes & nir_var_mem_shared;
-      } else {
+         break;
+      }
+
+      case nir_intrinsic_begin_invocation_interlock:
+      case nir_intrinsic_end_invocation_interlock:
+         /* For beginInvocationInterlockARB(), we will generate a memory fence
+          * but with a different opcode so that generator can pick SENDC
+          * instead of SEND.
+          *
+          * For endInvocationInterlockARB(), we need to insert a memory fence which
+          * stalls in the shader until the memory transactions prior to that
+          * fence are complete.  This ensures that the shader does not end before
+          * any writes from its critical section have landed.  Otherwise, you can
+          * end up with a case where the next invocation on that pixel properly
+          * stalls for previous FS invocation on its pixel to complete but
+          * doesn't actually wait for the dataport memory transactions from that
+          * thread to land before submitting its own.
+          *
+          * Handling them here will allow the logic for IVB render cache (see
+          * below) to be reused.
+          */
+         l3_fence = true;
+         slm_fence = false;
+         break;
+
+      default:
           l3_fence = instr->intrinsic != nir_intrinsic_memory_barrier_shared;
           slm_fence = instr->intrinsic == nir_intrinsic_group_memory_barrier ||
                       instr->intrinsic == nir_intrinsic_memory_barrier ||
                       instr->intrinsic == nir_intrinsic_memory_barrier_shared;
+         break;
        }
  
        if (stage != MESA_SHADER_COMPUTE)
@@ -4266,6 +4299,12 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
           l3_fence = true;
        }
  
+      /* IVB does typed surface access through the render cache, so we need
+       * to flush it too.
+       */
+      const bool needs_render_fence =
+         devinfo->gen == 7 && !devinfo->is_haswell;
+
        /* Be conservative in Gen11+ and always stall in a fence.  Since there
         * are two different fences, and shader might want to synchronize
         * between them.
@@ -4276,23 +4315,57 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
         * TODO: When emitting more than one fence, it might help emit all
         * the fences first and then generate the stall moves.
         */
-      const bool stall = devinfo->gen >= 11;
+      const bool stall = devinfo->gen >= 11 || needs_render_fence ||
+         instr->intrinsic == nir_intrinsic_end_invocation_interlock;
+
+      const bool commit_enable = stall ||
+         devinfo->gen >= 10; /* HSD ES # 1404612949 */
  
        const fs_builder ubld = bld.group(8, 0);
-      const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
  
        if (l3_fence) {
-         ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp,
-                   brw_vec8_grf(0, 0), brw_imm_ud(stall),
-                   /* bti */ brw_imm_ud(0))
-            ->size_written = 2 * REG_SIZE;
+         fs_inst *fence =
+            ubld.emit(opcode,
+                      ubld.vgrf(BRW_REGISTER_TYPE_UD),
+                      brw_vec8_grf(0, 0),
+                      brw_imm_ud(commit_enable),
+                      brw_imm_ud(/* bti */ 0));
+         fence->sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
+
+         if (needs_render_fence) {
+            fs_inst *render_fence =
+               ubld.emit(opcode,
+                         ubld.vgrf(BRW_REGISTER_TYPE_UD),
+                         brw_vec8_grf(0, 0),
+                         brw_imm_ud(commit_enable),
+                         brw_imm_ud(/* bti */ 0));
+            render_fence->sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
+
+            ubld.exec_all().group(1, 0).emit(
+               FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(),
+               fence->dst, render_fence->dst);
+         } else if (stall) {
+            ubld.exec_all().group(1, 0).emit(
+               FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(),
+               fence->dst);
+         }
        }
  
        if (slm_fence) {
-         ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp,
-                   brw_vec8_grf(0, 0), brw_imm_ud(stall),
-                   brw_imm_ud(GEN7_BTI_SLM))
-            ->size_written = 2 * REG_SIZE;
+         assert(opcode == SHADER_OPCODE_MEMORY_FENCE);
+         fs_inst *fence =
+            ubld.emit(opcode,
+                      ubld.vgrf(BRW_REGISTER_TYPE_UD),
+                      brw_vec8_grf(0, 0),
+                      brw_imm_ud(commit_enable),
+                      brw_imm_ud(GEN7_BTI_SLM));
+         fence->sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
+
+         if (stall) {
+            ubld.exec_all().group(1, 0).emit(
+               FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(),
+               fence->dst);
+         }
        }
  
        if (!l3_fence && !slm_fence)
@@ -5210,33 +5283,6 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
        break;
     }
  
-   case nir_intrinsic_begin_invocation_interlock: {
-      const fs_builder ubld = bld.group(8, 0);
-      const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
-
-      ubld.emit(SHADER_OPCODE_INTERLOCK, tmp, brw_vec8_grf(0, 0))
-         ->size_written = 2 * REG_SIZE;
-      break;
-   }
-
-   case nir_intrinsic_end_invocation_interlock: {
-      /* For endInvocationInterlock(), we need to insert a memory fence which
-       * stalls in the shader until the memory transactions prior to that
-       * fence are complete.  This ensures that the shader does not end before
-       * any writes from its critical section have landed.  Otherwise, you can
-       * end up with a case where the next invocation on that pixel properly
-       * stalls for previous FS invocation on its pixel to complete but
-       * doesn't actually wait for the dataport memory transactions from that
-       * thread to land before submitting its own.
-       */
-      const fs_builder ubld = bld.group(8, 0);
-      const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
-      ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp,
-                brw_vec8_grf(0, 0), brw_imm_ud(1), brw_imm_ud(0))
-         ->size_written = 2 * REG_SIZE;
-      break;
-   }
-
     default:
        unreachable("unknown intrinsic");
     }
diff --git a/src/intel/compiler/brw_vec4_generator.cpp b/src/intel/compiler/brw_vec4_generator.cpp

index 7c038c97196b744156c0a31ded44a811a79e0e4e..be6697cc16c16d3edd58499ef70bbef280b5d1d2 100644 (file)
--- a/src/intel/compiler/brw_vec4_generator.cpp
+++ b/src/intel/compiler/brw_vec4_generator.cpp
@@ -1911,13 +1911,13 @@ generate_code(struct brw_codegen *p,
           send_count++;
           break;
  
-      case SHADER_OPCODE_MEMORY_FENCE: {
-         const unsigned sends =
-            brw_memory_fence(p, dst, src[0], BRW_OPCODE_SEND, false,
-                             /* bti */ 0);
-         send_count += sends;
+      case SHADER_OPCODE_MEMORY_FENCE:
+         brw_memory_fence(p, dst, src[0], BRW_OPCODE_SEND,
+                          brw_message_target(inst->sfid),
+                          /* commit_enable */ false,
+                          /* bti */ 0);
+         send_count++;
           break;
-      }
  
        case SHADER_OPCODE_FIND_LIVE_CHANNEL: {
           const struct brw_reg mask =
diff --git a/src/intel/compiler/brw_vec4_nir.cpp b/src/intel/compiler/brw_vec4_nir.cpp

index 1baf9e671774a3c6c5e7c4f9a8aa4bfb048ec345..76446adcf54fb1f73828b049a39f9b8304ca5174 100644 (file)
--- a/src/intel/compiler/brw_vec4_nir.cpp
+++ b/src/intel/compiler/brw_vec4_nir.cpp
@@ -704,9 +704,10 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
     case nir_intrinsic_scoped_memory_barrier: {
        const vec4_builder bld =
           vec4_builder(this).at_end().annotate(current_annotation, base_ir);
-      const dst_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
-      bld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp, brw_vec8_grf(0, 0))
-         ->size_written = 2 * REG_SIZE;
+      const dst_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
+      vec4_instruction *fence =
+         bld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp, brw_vec8_grf(0, 0));
+      fence->sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
        break;
     }
author	Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com>
	Fri, 17 Jan 2020 23:07:44 +0000 (15:07 -0800)
committer	Marge Bot <eric+marge@anholt.net>
	Wed, 29 Apr 2020 07:17:27 +0000 (07:17 +0000)
src/intel/compiler/brw_eu.h		patch \| blob \| history
src/intel/compiler/brw_eu_defines.h		patch \| blob \| history
src/intel/compiler/brw_eu_emit.c		patch \| blob \| history
src/intel/compiler/brw_fs_generator.cpp		patch \| blob \| history
src/intel/compiler/brw_fs_nir.cpp		patch \| blob \| history
src/intel/compiler/brw_vec4_generator.cpp		patch \| blob \| history
src/intel/compiler/brw_vec4_nir.cpp		patch \| blob \| history