From 8d8222461f9d7f497d657c2c0eff70820986429b Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Mon, 23 Jul 2018 22:20:41 -0700 Subject: [PATCH] intel/nir: Enable nir_opt_find_array_copies We have to be a bit careful with this one because we want it to run in the optimization loop but only in the first brw_nir_optimize call. Later calls assume that we've lowered away copy_deref instructions and we don't want to introduce any more. Shader-db results on Kaby Lake: total instructions in shared programs: 15176942 -> 15176942 (0.00%) instructions in affected programs: 0 -> 0 helped: 0 HURT: 0 In spite of the lack of any shader-db improvement, this patch completely eliminates spilling in the Batman: Arkham City tessellation shaders. This is because we are now able to detect that the temporary array created by DXVK for storing TCS inputs is a copy of the input arrays and use indirect URB reads instead of making a copy of 4.5 KiB of input data and then indirecting on it with if-ladders. Reviewed-by: Caio Marcelo de Oliveira Filho --- src/intel/compiler/brw_nir.c | 38 ++++++++++++++++++++++++------------ src/intel/compiler/brw_nir.h | 3 ++- 2 files changed, 28 insertions(+), 13 deletions(-) diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c index c9a92c8aeb6..ef5034d1e1e 100644 --- a/src/intel/compiler/brw_nir.c +++ b/src/intel/compiler/brw_nir.c @@ -533,7 +533,7 @@ brw_nir_no_indirect_mask(const struct brw_compiler *compiler, nir_shader * brw_nir_optimize(nir_shader *nir, const struct brw_compiler *compiler, - bool is_scalar) + bool is_scalar, bool allow_copies) { nir_variable_mode indirect_mask = brw_nir_no_indirect_mask(compiler, nir->info.stage); @@ -544,6 +544,13 @@ brw_nir_optimize(nir_shader *nir, const struct brw_compiler *compiler, OPT(nir_split_array_vars, nir_var_local); OPT(nir_shrink_vec_array_vars, nir_var_local); OPT(nir_lower_vars_to_ssa); + if (allow_copies) { + /* Only run this pass in the first call to brw_nir_optimize. Later + * calls assume that we've lowered away any copy_deref instructions + * and we don't want to introduce any more. + */ + OPT(nir_opt_find_array_copies); + } OPT(nir_opt_copy_prop_vars); if (is_scalar) { @@ -664,7 +671,7 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir) nir_lower_isign64 | nir_lower_divmod64); - nir = brw_nir_optimize(nir, compiler, is_scalar); + nir = brw_nir_optimize(nir, compiler, is_scalar, true); /* This needs to be run after the first optimization pass but before we * lower indirect derefs away @@ -701,7 +708,7 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir) nir_lower_indirect_derefs(nir, indirect_mask); /* Get rid of split copies */ - nir = brw_nir_optimize(nir, compiler, is_scalar); + nir = brw_nir_optimize(nir, compiler, is_scalar, false); OPT(nir_remove_dead_variables, nir_var_local); @@ -716,6 +723,18 @@ brw_nir_link_shaders(const struct brw_compiler *compiler, nir_validate_shader(*producer); nir_validate_shader(*consumer); + const bool p_is_scalar = + compiler->scalar_stage[(*producer)->info.stage]; + const bool c_is_scalar = + compiler->scalar_stage[(*consumer)->info.stage]; + + if (p_is_scalar && c_is_scalar) { + NIR_PASS_V(*producer, nir_lower_io_to_scalar_early, nir_var_shader_out); + NIR_PASS_V(*consumer, nir_lower_io_to_scalar_early, nir_var_shader_in); + *producer = brw_nir_optimize(*producer, compiler, p_is_scalar, false); + *consumer = brw_nir_optimize(*consumer, compiler, c_is_scalar, false); + } + NIR_PASS_V(*producer, nir_remove_dead_variables, nir_var_shader_out); NIR_PASS_V(*consumer, nir_remove_dead_variables, nir_var_shader_in); @@ -732,13 +751,8 @@ brw_nir_link_shaders(const struct brw_compiler *compiler, NIR_PASS_V(*consumer, nir_lower_indirect_derefs, brw_nir_no_indirect_mask(compiler, (*consumer)->info.stage)); - const bool p_is_scalar = - compiler->scalar_stage[(*producer)->info.stage]; - *producer = brw_nir_optimize(*producer, compiler, p_is_scalar); - - const bool c_is_scalar = - compiler->scalar_stage[(*consumer)->info.stage]; - *consumer = brw_nir_optimize(*consumer, compiler, c_is_scalar); + *producer = brw_nir_optimize(*producer, compiler, p_is_scalar, false); + *consumer = brw_nir_optimize(*consumer, compiler, c_is_scalar, false); } } @@ -765,7 +779,7 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler, OPT(nir_opt_algebraic_before_ffma); } while (progress); - nir = brw_nir_optimize(nir, compiler, is_scalar); + nir = brw_nir_optimize(nir, compiler, is_scalar, false); if (devinfo->gen >= 6) { /* Try and fuse multiply-adds */ @@ -861,7 +875,7 @@ brw_nir_apply_sampler_key(nir_shader *nir, if (nir_lower_tex(nir, &tex_options)) { nir_validate_shader(nir); - nir = brw_nir_optimize(nir, compiler, is_scalar); + nir = brw_nir_optimize(nir, compiler, is_scalar, false); } return nir; diff --git a/src/intel/compiler/brw_nir.h b/src/intel/compiler/brw_nir.h index 00b61731526..5c75ef2324a 100644 --- a/src/intel/compiler/brw_nir.h +++ b/src/intel/compiler/brw_nir.h @@ -153,7 +153,8 @@ bool brw_nir_opt_peephole_ffma(nir_shader *shader); nir_shader *brw_nir_optimize(nir_shader *nir, const struct brw_compiler *compiler, - bool is_scalar); + bool is_scalar, + bool allow_copies); #define BRW_NIR_FRAG_OUTPUT_INDEX_SHIFT 0 #define BRW_NIR_FRAG_OUTPUT_INDEX_MASK INTEL_MASK(0, 0) -- 2.30.2