From 4f431e870c7239879bc7c7f54f65184d5b391815 Mon Sep 17 00:00:00 2001
From: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com>
Date: Tue, 31 Dec 2019 01:01:27 -0800
Subject: [PATCH] intel/fs: Don't emit fence for shared memory if only one
 thread is used
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

When there's only one hardware thread (i.e. the dispatch width greater
or equal to the workgroup size), there's no need to synchronize shared
memory access (SLM) since all the requests from a single thread are
already synchronized.  In such case, we just add a scheduling fence.

To be able to identify that case for all platforms, move the handling
of platforms prior to Gen11 (which don't have a separate SLM fence)
after the optimization.

Results for SKL running Iris for shader-db tests with compute shaders

    total sends in shared programs: 18395 -> 18361 (-0.18%)
    sends in affected programs: 938 -> 904 (-3.62%)
    helped: 9
    HURT: 0
    helped stats (abs) min: 1 max: 5 xÌ: 3.78 xÌ: 4
    helped stats (rel) min: 1.56% max: 26.32% xÌ: 10.33% xÌ: 2.60%
    95% mean confidence interval for sends value: -4.85 -2.71
    95% mean confidence interval for sends %-change: -19.12% -1.54%
    Sends are helped.

Shaders from Aztec Ruins, Car Chase, Manhattan and DeusEx are helped.

Results for ICL and TGL are similar to SKL.

Results for BDW are similar to SKL except for DeusEx shader that has a
workgroup size 16 but in BDW picks the SIMD8.

Reviewed-by: Francisco Jerez <currojerez@riseup.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/merge_requests/3226>
---
 src/intel/compiler/brw_fs_nir.cpp | 36 ++++++++++++++++++++-----------
 1 file changed, 23 insertions(+), 13 deletions(-)

diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 3bed5406576..0b4d50c56e3 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -4239,25 +4239,32 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
          l3_fence = modes & (nir_var_shader_out |
                              nir_var_mem_ssbo |
                              nir_var_mem_global);
-         /* Prior to gen11, we only have one kind of fence. */
-         slm_fence = devinfo->gen >= 11 && (modes & nir_var_mem_shared);
-         l3_fence |= devinfo->gen < 11 && (modes & nir_var_mem_shared);
+         slm_fence = modes & nir_var_mem_shared;
       } else {
-         if (devinfo->gen >= 11) {
-            l3_fence = instr->intrinsic != nir_intrinsic_memory_barrier_shared;
-            slm_fence = instr->intrinsic == nir_intrinsic_group_memory_barrier ||
-                        instr->intrinsic == nir_intrinsic_memory_barrier ||
-                        instr->intrinsic == nir_intrinsic_memory_barrier_shared;
-         } else {
-            /* Prior to gen11, we only have one kind of fence. */
-            l3_fence = true;
-            slm_fence = false;
-         }
+         l3_fence = instr->intrinsic != nir_intrinsic_memory_barrier_shared;
+         slm_fence = instr->intrinsic == nir_intrinsic_group_memory_barrier ||
+                     instr->intrinsic == nir_intrinsic_memory_barrier ||
+                     instr->intrinsic == nir_intrinsic_memory_barrier_shared;
       }
 
       if (stage != MESA_SHADER_COMPUTE)
          slm_fence = false;
 
+      /* If the workgroup fits in a single HW thread, the messages for SLM are
+       * processed in-order and the shader itself is already synchronized so
+       * the memory fence is not necessary.
+       *
+       * TODO: Check if applies for many HW threads sharing same Data Port.
+       */
+      if (slm_fence && workgroup_size() <= dispatch_width)
+         slm_fence = false;
+
+      /* Prior to Gen11, there's only L3 fence, so emit that instead. */
+      if (slm_fence && devinfo->gen < 11) {
+         slm_fence = false;
+         l3_fence = true;
+      }
+
       /* Be conservative in Gen11+ and always stall in a fence.  Since there
        * are two different fences, and shader might want to synchronize
        * between them.
@@ -4287,6 +4294,9 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
             ->size_written = 2 * REG_SIZE;
       }
 
+      if (!l3_fence && !slm_fence)
+         ubld.emit(FS_OPCODE_SCHEDULING_FENCE);
+
       break;
    }
 
-- 
2.30.2