From 991c4261604b136cac0770c7d6c7345ea134129c Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Sat, 14 Dec 2019 10:44:39 -0600 Subject: [PATCH] intel/nir: Enable load/store vectorization MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit This commit enables, for Intel drivers, the I/O vectorization pass that was originally written for ACO. We enable it for UBOs, SSBOs, global memory, and SLM. We only enable vectorization for the scalar back-end because the vec4 back-end makes certain alignment assumptions. Shader-db results with iris on ICL: total instructions in shared programs: 16077927 -> 16068236 (-0.06%) instructions in affected programs: 199839 -> 190148 (-4.85%) helped: 324 HURT: 0 helped stats (abs) min: 2 max: 458 x̄: 29.91 x̃: 4 helped stats (rel) min: 0.11% max: 38.94% x̄: 4.32% x̃: 1.64% 95% mean confidence interval for instructions value: -37.02 -22.80 95% mean confidence interval for instructions %-change: -5.07% -3.58% Instructions are helped. total cycles in shared programs: 336806135 -> 336151501 (-0.19%) cycles in affected programs: 16009735 -> 15355101 (-4.09%) helped: 458 HURT: 154 helped stats (abs) min: 1 max: 77812 x̄: 1542.50 x̃: 75 helped stats (rel) min: <.01% max: 34.46% x̄: 5.16% x̃: 2.01% HURT stats (abs) min: 1 max: 22800 x̄: 336.55 x̃: 20 HURT stats (rel) min: <.01% max: 17.11% x̄: 2.12% x̃: 1.00% 95% mean confidence interval for cycles value: -1596.83 -542.49 95% mean confidence interval for cycles %-change: -3.83% -2.82% Cycles are helped. total sends in shared programs: 814177 -> 809049 (-0.63%) sends in affected programs: 15422 -> 10294 (-33.25%) helped: 324 HURT: 0 helped stats (abs) min: 1 max: 256 x̄: 15.83 x̃: 2 helped stats (rel) min: 1.33% max: 67.90% x̄: 21.21% x̃: 15.38% 95% mean confidence interval for sends value: -19.67 -11.98 95% mean confidence interval for sends %-change: -23.03% -19.39% Sends are helped. 
LOST: 7 GAINED: 2 Most of the helped shaders were in the following titles: - Doom - Deus Ex: Mankind Divided - Aztec Ruins - Shadow of Mordor - DiRT Showdown - Tomb Raider (Rise, I think) Five of the lost programs are SIMD16 shaders we lost from dirt showdown. The other two are compute shaders in Aztec Ruins which switched from SIMD8 to SIMD16. Vulkan pipeline-db stats on ICL: Instructions in all programs: 296780486 -> 293493363 (-1.1%) Loops in all programs: 149669 -> 149669 (+0.0%) Cycles in all programs: 90999206722 -> 88513844563 (-2.7%) Spills in all programs: 1710217 -> 1730691 (+1.2%) Fills in all programs: 1931235 -> 1958138 (+1.4%) By far the most help was in the Tomb Raider games. A couple of Batman games with DXVK were also helped. In Shadow of the Tomb Raider: Instructions in all programs: 41614336 -> 39408023 (-5.3%) Loops in all programs: 32200 -> 32200 (+0.0%) Cycles in all programs: 1875498485 -> 1667034831 (-11.1%) Spills in all programs: 196307 -> 214945 (+9.5%) Fills in all programs: 282736 -> 307113 (+8.6%) Benchmarks of real games I've done on this patch: - Rise of the Tomb Raider: +3% - Shadow of the Tomb Raider: +10% Reviewed-by: Kenneth Graunke Reviewed-by: Ian Romanick Tested-by: Marge Bot Part-of: --- src/intel/compiler/brw_nir.c | 66 ++++++++++++++++++++++++++++++------ 1 file changed, 55 insertions(+), 11 deletions(-) diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c index 8a6cc8fe696..02a6f0cba9f 100644 --- a/src/intel/compiler/brw_nir.c +++ b/src/intel/compiler/brw_nir.c @@ -824,6 +824,31 @@ brw_nir_link_shaders(const struct brw_compiler *compiler, } } +static bool +brw_nir_should_vectorize_mem(unsigned align, unsigned bit_size, + unsigned num_components, unsigned high_offset, + nir_intrinsic_instr *low, + nir_intrinsic_instr *high) +{ + /* Don't combine things to generate 64-bit loads/stores. 
We have to split + * those back into 32-bit ones anyway and UBO loads aren't split in NIR so + * we don't want to make a mess for the back-end. + */ + if (bit_size > 32) + return false; + + /* We can handle at most a vec4 right now. Anything bigger would get + * immediately split by brw_nir_lower_mem_access_bit_sizes anyway. + */ + if (num_components > 4) + return false; + + if (align < bit_size / 8) + return false; + + return true; +} + static bool combine_all_barriers(nir_intrinsic_instr *a, nir_intrinsic_instr *b, @@ -844,6 +869,35 @@ bool combine_all_barriers(nir_intrinsic_instr *a, return true; } +static void +brw_vectorize_lower_mem_access(nir_shader *nir, + const struct brw_compiler *compiler, + bool is_scalar) +{ + const struct gen_device_info *devinfo = compiler->devinfo; + bool progress = false; + + if (is_scalar) { + OPT(nir_opt_load_store_vectorize, + nir_var_mem_ubo | nir_var_mem_ssbo | + nir_var_mem_global | nir_var_mem_shared, + brw_nir_should_vectorize_mem); + } + + OPT(brw_nir_lower_mem_access_bit_sizes, devinfo); + + while (progress) { + progress = false; + + OPT(nir_lower_pack); + OPT(nir_copy_prop); + OPT(nir_opt_dce); + OPT(nir_opt_cse); + OPT(nir_opt_algebraic); + OPT(nir_opt_constant_folding); + } +} + /* Prepare the given shader for codegen * * This function is intended to be called right before going into the actual @@ -870,17 +924,7 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler, brw_nir_optimize(nir, compiler, is_scalar, false); - if (OPT(brw_nir_lower_mem_access_bit_sizes, devinfo)) { - do { - progress = false; - OPT(nir_lower_pack); - OPT(nir_copy_prop); - OPT(nir_opt_dce); - OPT(nir_opt_cse); - OPT(nir_opt_algebraic); - OPT(nir_opt_constant_folding); - } while (progress); - } + brw_vectorize_lower_mem_access(nir, compiler, is_scalar); if (OPT(nir_lower_int64, nir->options->lower_int64_options)) brw_nir_optimize(nir, compiler, is_scalar, false); -- 2.30.2