From 2663759af0edb1ebcee3aa1ff63f846911d16076 Mon Sep 17 00:00:00 2001
From: Caio Marcelo de Oliveira Filho
Date: Tue, 28 Apr 2020 13:09:27 -0700
Subject: [PATCH] intel/fs: Add and use a new load_simd_width_intel intrinsic

Add an intrinsic to get the SIMD width, which is not always the same
as the subgroup size.  Starting with a small scope (Intel), but we can
rename it later to generalize it if it turns out to be useful for
other drivers.

Change brw_nir_lower_cs_intrinsics() to use this intrinsic instead of
a width passed as an argument.  The pass also used to optimize
load_subgroup_id for the case where the whole workgroup fits in a
single thread (it becomes a constant zero).  That optimization moved
together with the lowering of the SIMD width.

This is preparation for letting the drivers call the pass before the
brw_compile_cs() step.

No shader-db changes in BDW, SKL, ICL and TGL.

Reviewed-by: Kenneth Graunke
Reviewed-by: Jordan Justen
Part-of:
---
 src/compiler/nir/nir_intrinsics.py            |  3 ++
 src/intel/compiler/brw_fs.cpp                 | 54 ++++++++++++++++++-
 src/intel/compiler/brw_fs_nir.cpp             |  5 ++
 src/intel/compiler/brw_nir.h                  |  3 +-
 .../compiler/brw_nir_lower_cs_intrinsics.c    | 45 +++++-----------
 5 files changed, 76 insertions(+), 34 deletions(-)

diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py
index 6ce3ce147b6..611955ffa02 100644
--- a/src/compiler/nir/nir_intrinsics.py
+++ b/src/compiler/nir/nir_intrinsics.py
@@ -645,6 +645,9 @@ system_value("color1", 4)
 # System value for internal compute shaders in radeonsi.
 system_value("user_data_amd", 4)
 
+# Number of data items being operated on for a SIMD program.
+system_value("simd_width_intel", 1)
+
 # Barycentric coordinate intrinsics.
 #
 # These set up the barycentric coordinates for a particular interpolation.
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index d22d2c7a905..8725b78a647 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -8946,6 +8946,56 @@ cs_fill_push_const_info(const struct gen_device_info *devinfo,
                            prog_data->nr_params);
 }
 
+static bool
+filter_simd(const nir_instr *instr, const void *_options)
+{
+   if (instr->type != nir_instr_type_intrinsic)
+      return false;
+
+   switch (nir_instr_as_intrinsic(instr)->intrinsic) {
+   case nir_intrinsic_load_simd_width_intel:
+   case nir_intrinsic_load_subgroup_id:
+      return true;
+
+   default:
+      return false;
+   }
+}
+
+static nir_ssa_def *
+lower_simd(nir_builder *b, nir_instr *instr, void *options)
+{
+   uintptr_t simd_width = (uintptr_t)options;
+
+   switch (nir_instr_as_intrinsic(instr)->intrinsic) {
+   case nir_intrinsic_load_simd_width_intel:
+      return nir_imm_int(b, simd_width);
+
+   case nir_intrinsic_load_subgroup_id:
+      /* If the whole workgroup fits in one thread, we can lower subgroup_id
+       * to a constant zero.
+       */
+      if (!b->shader->info.cs.local_size_variable) {
+         unsigned local_workgroup_size = b->shader->info.cs.local_size[0] *
+                                         b->shader->info.cs.local_size[1] *
+                                         b->shader->info.cs.local_size[2];
+         if (local_workgroup_size <= simd_width)
+            return nir_imm_int(b, 0);
+      }
+      return NULL;
+
+   default:
+      return NULL;
+   }
+}
+
+static void
+brw_nir_lower_simd(nir_shader *nir, unsigned dispatch_width)
+{
+   nir_shader_lower_instructions(nir, filter_simd, lower_simd,
+                                 (void *)(uintptr_t)dispatch_width);
+}
+
 static nir_shader *
 compile_cs_to_nir(const struct brw_compiler *compiler,
                   void *mem_ctx,
@@ -8956,7 +9006,9 @@ compile_cs_to_nir(const struct brw_compiler *compiler,
    nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
    brw_nir_apply_key(shader, compiler, &key->base, dispatch_width, true);
 
-   NIR_PASS_V(shader, brw_nir_lower_cs_intrinsics, dispatch_width);
+   NIR_PASS_V(shader, brw_nir_lower_cs_intrinsics);
+
+   NIR_PASS_V(shader, brw_nir_lower_simd, dispatch_width);
 
    /* Clean up after the local index and ID calculations. */
    NIR_PASS_V(shader, nir_opt_constant_folding);
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 852626c6172..383c99c9f45 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -3879,6 +3879,11 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
       break;
    }
 
+   case nir_intrinsic_load_simd_width_intel: {
+      bld.MOV(dest, brw_imm_ud(cs_prog_data->simd_size));
+      break;
+   }
+
    default:
       nir_emit_intrinsic(bld, instr);
       break;
diff --git a/src/intel/compiler/brw_nir.h b/src/intel/compiler/brw_nir.h
index b0ef195c261..c2dd970647d 100644
--- a/src/intel/compiler/brw_nir.h
+++ b/src/intel/compiler/brw_nir.h
@@ -99,8 +99,7 @@ void brw_nir_link_shaders(const struct brw_compiler *compiler,
                           nir_shader *producer, nir_shader *consumer);
 
-bool brw_nir_lower_cs_intrinsics(nir_shader *nir,
-                                 unsigned dispatch_width);
+bool brw_nir_lower_cs_intrinsics(nir_shader *nir);
 void brw_nir_lower_alpha_to_coverage(nir_shader *shader);
 void brw_nir_lower_legacy_clipping(nir_shader *nir,
                                    int nr_userclip_plane_consts,
diff --git a/src/intel/compiler/brw_nir_lower_cs_intrinsics.c b/src/intel/compiler/brw_nir_lower_cs_intrinsics.c
index 2393011312c..883fc469924 100644
--- a/src/intel/compiler/brw_nir_lower_cs_intrinsics.c
+++ b/src/intel/compiler/brw_nir_lower_cs_intrinsics.c
@@ -26,7 +26,6 @@
 
 struct lower_intrinsics_state {
    nir_shader *nir;
-   unsigned dispatch_width;
    nir_function_impl *impl;
    bool progress;
    nir_builder builder;
@@ -61,14 +60,10 @@ lower_cs_intrinsics_convert_block(struct lower_intrinsics_state *state,
       if (!local_index) {
          assert(!local_id);
 
-         nir_ssa_def *subgroup_id;
-         if (state->local_workgroup_size <= state->dispatch_width)
-            subgroup_id = nir_imm_int(b, 0);
-         else
-            subgroup_id = nir_load_subgroup_id(b);
+         nir_ssa_def *subgroup_id = nir_load_subgroup_id(b);
 
          nir_ssa_def *thread_local_id =
-            nir_imul_imm(b, subgroup_id, state->dispatch_width);
+            nir_imul(b, subgroup_id, nir_load_simd_width_intel(b));
          nir_ssa_def *channel = nir_load_subgroup_invocation(b);
          nir_ssa_def *linear = nir_iadd(b, channel, thread_local_id);
 
@@ -151,35 +146,25 @@ lower_cs_intrinsics_convert_block(struct lower_intrinsics_state *state,
          break;
       }
 
-      case nir_intrinsic_load_subgroup_id:
-         if (state->local_workgroup_size > 8)
-            continue;
-
-         /* For small workgroup sizes, we know subgroup_id will be zero */
-         sysval = nir_imm_int(b, 0);
-         break;
-
       case nir_intrinsic_load_num_subgroups: {
+         nir_ssa_def *size;
          if (state->nir->info.cs.local_size_variable) {
             nir_ssa_def *size_xyz = nir_load_local_group_size(b);
             nir_ssa_def *size_x = nir_channel(b, size_xyz, 0);
             nir_ssa_def *size_y = nir_channel(b, size_xyz, 1);
             nir_ssa_def *size_z = nir_channel(b, size_xyz, 2);
-            nir_ssa_def *size = nir_imul(b, nir_imul(b, size_x, size_y), size_z);
-
-            /* Calculate the equivalent of DIV_ROUND_UP. */
-            sysval = nir_idiv(b,
-                              nir_iadd_imm(b,
-                                           nir_iadd_imm(b, size, state->dispatch_width), -1),
-                              nir_imm_int(b, state->dispatch_width));
+            size = nir_imul(b, nir_imul(b, size_x, size_y), size_z);
          } else {
-            unsigned local_workgroup_size =
-               nir->info.cs.local_size[0] * nir->info.cs.local_size[1] *
-               nir->info.cs.local_size[2];
-            unsigned num_subgroups =
-               DIV_ROUND_UP(local_workgroup_size, state->dispatch_width);
-            sysval = nir_imm_int(b, num_subgroups);
+            size = nir_imm_int(b, nir->info.cs.local_size[0] *
+                                  nir->info.cs.local_size[1] *
+                                  nir->info.cs.local_size[2]);
          }
+
+         /* Calculate the equivalent of DIV_ROUND_UP. */
+         nir_ssa_def *simd_width = nir_load_simd_width_intel(b);
+         sysval =
+            nir_udiv(b, nir_iadd_imm(b, nir_iadd(b, size, simd_width), -1),
+                        simd_width);
          break;
       }
 
@@ -210,14 +195,12 @@ lower_cs_intrinsics_convert_impl(struct lower_intrinsics_state *state)
 }
 
 bool
-brw_nir_lower_cs_intrinsics(nir_shader *nir,
-                            unsigned dispatch_width)
+brw_nir_lower_cs_intrinsics(nir_shader *nir)
 {
    assert(nir->info.stage == MESA_SHADER_COMPUTE);
 
    struct lower_intrinsics_state state = {
       .nir = nir,
-      .dispatch_width = dispatch_width,
   };
 
   if (!nir->info.cs.local_size_variable) {
-- 
2.30.2
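
For context, the index math that the lowered intrinsics compute at runtime
can be sketched in plain C.  This is an illustration only, not part of the
patch; the names (workgroup_size, simd_width, etc.) are hypothetical
stand-ins for the SSA values built in lower_cs_intrinsics_convert_block()
and later constant-folded by brw_nir_lower_simd().

    #include <stdio.h>

    int main(void)
    {
       /* Hypothetical compile-time constants: a fixed local workgroup
        * size and the SIMD width chosen by the compiler (8, 16, or 32
        * on Intel hardware). */
       const unsigned workgroup_size = 40;  /* local_size[0] * [1] * [2] */
       const unsigned simd_width = 16;      /* load_simd_width_intel */

       /* load_num_subgroups: DIV_ROUND_UP(size, simd_width), emitted by
        * the pass as udiv(size + simd_width - 1, simd_width). */
       unsigned num_subgroups = (workgroup_size + simd_width - 1) / simd_width;
       printf("num_subgroups = %u\n", num_subgroups);

       /* Linear local invocation index: each hardware thread (subgroup)
        * covers simd_width consecutive invocations, so
        * linear = channel + subgroup_id * simd_width. */
       for (unsigned subgroup_id = 0; subgroup_id < num_subgroups; subgroup_id++) {
          for (unsigned channel = 0; channel < simd_width; channel++) {
             unsigned linear = channel + subgroup_id * simd_width;
             if (linear < workgroup_size)
                printf("subgroup %u channel %2u -> local index %2u\n",
                       subgroup_id, channel, linear);
          }
       }
       return 0;
    }

When workgroup_size <= simd_width, num_subgroups is 1 and subgroup_id is
always 0, which is exactly the case lower_simd() folds load_subgroup_id to
a constant zero once the dispatch width is known.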