From 4f431e870c7239879bc7c7f54f65184d5b391815 Mon Sep 17 00:00:00 2001 From: Caio Marcelo de Oliveira Filho Date: Tue, 31 Dec 2019 01:01:27 -0800 Subject: [PATCH] intel/fs: Don't emit fence for shared memory if only one thread is used MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit When there's only one hardware thread (i.e. the dispatch width greater or equal to the workgroup size), there's no need to synchronize shared memory access (SLM) since all the requests from a single thread are already synchronized. In such case, we just add a scheduling fence. To be able to identify that case for all platforms, move the handling of platforms prior to Gen11 (which don't have a separate SLM fence) after the optimization. Results for SKL running Iris for shader-db tests with compute shaders total sends in shared programs: 18395 -> 18361 (-0.18%) sends in affected programs: 938 -> 904 (-3.62%) helped: 9 HURT: 0 helped stats (abs) min: 1 max: 5 x̄: 3.78 x̃: 4 helped stats (rel) min: 1.56% max: 26.32% x̄: 10.33% x̃: 2.60% 95% mean confidence interval for sends value: -4.85 -2.71 95% mean confidence interval for sends %-change: -19.12% -1.54% Sends are helped. Shaders from Aztec Ruins, Car Chase, Manhattan and DeusEx are helped. Results for ICL and TGL are similar to SKL. Results for BDW are similar to SKL except for DeusEx shader that has a workgroup size 16 but in BDW picks the SIMD8. Reviewed-by: Francisco Jerez Part-of: --- src/intel/compiler/brw_fs_nir.cpp | 36 ++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index 3bed5406576..0b4d50c56e3 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -4239,25 +4239,32 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr l3_fence = modes & (nir_var_shader_out | nir_var_mem_ssbo | nir_var_mem_global); - /* Prior to gen11, we only have one kind of fence. */ - slm_fence = devinfo->gen >= 11 && (modes & nir_var_mem_shared); - l3_fence |= devinfo->gen < 11 && (modes & nir_var_mem_shared); + slm_fence = modes & nir_var_mem_shared; } else { - if (devinfo->gen >= 11) { - l3_fence = instr->intrinsic != nir_intrinsic_memory_barrier_shared; - slm_fence = instr->intrinsic == nir_intrinsic_group_memory_barrier || - instr->intrinsic == nir_intrinsic_memory_barrier || - instr->intrinsic == nir_intrinsic_memory_barrier_shared; - } else { - /* Prior to gen11, we only have one kind of fence. */ - l3_fence = true; - slm_fence = false; - } + l3_fence = instr->intrinsic != nir_intrinsic_memory_barrier_shared; + slm_fence = instr->intrinsic == nir_intrinsic_group_memory_barrier || + instr->intrinsic == nir_intrinsic_memory_barrier || + instr->intrinsic == nir_intrinsic_memory_barrier_shared; } if (stage != MESA_SHADER_COMPUTE) slm_fence = false; + /* If the workgroup fits in a single HW thread, the messages for SLM are + * processed in-order and the shader itself is already synchronized so + * the memory fence is not necessary. + * + * TODO: Check if applies for many HW threads sharing same Data Port. + */ + if (slm_fence && workgroup_size() <= dispatch_width) + slm_fence = false; + + /* Prior to Gen11, there's only L3 fence, so emit that instead. */ + if (slm_fence && devinfo->gen < 11) { + slm_fence = false; + l3_fence = true; + } + /* Be conservative in Gen11+ and always stall in a fence. Since there * are two different fences, and shader might want to synchronize * between them. @@ -4287,6 +4294,9 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr ->size_written = 2 * REG_SIZE; } + if (!l3_fence && !slm_fence) + ubld.emit(FS_OPCODE_SCHEDULING_FENCE); + break; } -- 2.30.2