From 80ddfab2f54d7cd9dd4b93d2fbfa239f061a1f2b Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Fri, 29 Sep 2017 17:57:32 -0700 Subject: [PATCH] intel/cs: Rework the way thread local ID is handled Previously, brw_nir_lower_intrinsics added the param and then emitted a load_uniform intrinsic to load it directly. This commit switches things over to use a specific NIR intrinsic for the thread id. The one thing I don't like about this approach is that we have to copy thread_local_id over to the new visitor in import_uniforms. Reviewed-by: Iago Toral Quiroga --- src/compiler/nir/nir_intrinsics.h | 3 ++ src/intel/compiler/brw_fs.cpp | 4 +- src/intel/compiler/brw_fs.h | 1 + src/intel/compiler/brw_fs_nir.cpp | 14 +++++ src/intel/compiler/brw_nir.h | 3 +- .../compiler/brw_nir_lower_cs_intrinsics.c | 53 ++++--------------- 6 files changed, 32 insertions(+), 46 deletions(-) diff --git a/src/compiler/nir/nir_intrinsics.h b/src/compiler/nir/nir_intrinsics.h index cefd18be904..47022dd135b 100644 --- a/src/compiler/nir/nir_intrinsics.h +++ b/src/compiler/nir/nir_intrinsics.h @@ -364,6 +364,9 @@ SYSTEM_VALUE(blend_const_color_a_float, 1, 0, xx, xx, xx) SYSTEM_VALUE(blend_const_color_rgba8888_unorm, 1, 0, xx, xx, xx) SYSTEM_VALUE(blend_const_color_aaaa8888_unorm, 1, 0, xx, xx, xx) +/* Intel specific system values */ +SYSTEM_VALUE(intel_thread_local_id, 1, 0, xx, xx, xx) + /** * Barycentric coordinate intrinsics. * diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index 68a47bac841..c0b6047b954 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -996,6 +996,7 @@ fs_visitor::import_uniforms(fs_visitor *v) this->push_constant_loc = v->push_constant_loc; this->pull_constant_loc = v->pull_constant_loc; this->uniforms = v->uniforms; + this->thread_local_id = v->thread_local_id; } void @@ -6834,8 +6835,7 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data, { nir_shader *shader = nir_shader_clone(mem_ctx, src_shader); shader = brw_nir_apply_sampler_key(shader, compiler, &key->tex, true); - - brw_nir_lower_cs_intrinsics(shader, prog_data); + brw_nir_lower_cs_intrinsics(shader); shader = brw_postprocess_nir(shader, compiler, true); prog_data->local_size[0] = shader->info.cs.local_size[0]; diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h index da3259323ec..f51a4d8889b 100644 --- a/src/intel/compiler/brw_fs.h +++ b/src/intel/compiler/brw_fs.h @@ -315,6 +315,7 @@ public: */ int *push_constant_loc; + fs_reg thread_local_id; fs_reg frag_depth; fs_reg frag_stencil; fs_reg sample_mask; diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index 04b6e5119a2..77d8bae4db6 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -88,6 +88,16 @@ fs_visitor::nir_setup_uniforms() } uniforms = nir->num_uniforms / 4; + + if (stage == MESA_SHADER_COMPUTE) { + /* Add a uniform for the thread local id. It must be the last uniform + * on the list. + */ + assert(uniforms == prog_data->nr_params); + uint32_t *param = brw_stage_prog_data_add_params(prog_data, 1); + *param = BRW_PARAM_BUILTIN_THREAD_LOCAL_ID; + thread_local_id = fs_reg(UNIFORM, uniforms++, BRW_REGISTER_TYPE_UD); + } } static bool @@ -3412,6 +3422,10 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld, cs_prog_data->uses_barrier = true; break; + case nir_intrinsic_load_intel_thread_local_id: + bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), thread_local_id); + break; + case nir_intrinsic_load_local_invocation_id: case nir_intrinsic_load_work_group_id: { gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic); diff --git a/src/intel/compiler/brw_nir.h b/src/intel/compiler/brw_nir.h index 1493b742e42..3e407122681 100644 --- a/src/intel/compiler/brw_nir.h +++ b/src/intel/compiler/brw_nir.h @@ -95,8 +95,7 @@ void brw_nir_analyze_boolean_resolves(nir_shader *nir); nir_shader *brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir); -bool brw_nir_lower_cs_intrinsics(nir_shader *nir, - struct brw_cs_prog_data *prog_data); +bool brw_nir_lower_cs_intrinsics(nir_shader *nir); void brw_nir_lower_vs_inputs(nir_shader *nir, bool use_legacy_snorm_formula, const uint8_t *vs_attrib_wa_flags); diff --git a/src/intel/compiler/brw_nir_lower_cs_intrinsics.c b/src/intel/compiler/brw_nir_lower_cs_intrinsics.c index d27727624c6..07d2dccd041 100644 --- a/src/intel/compiler/brw_nir_lower_cs_intrinsics.c +++ b/src/intel/compiler/brw_nir_lower_cs_intrinsics.c @@ -26,47 +26,12 @@ struct lower_intrinsics_state { nir_shader *nir; - struct brw_cs_prog_data *prog_data; nir_function_impl *impl; bool progress; nir_builder builder; - int thread_local_id_index; + unsigned local_workgroup_size; }; -static nir_ssa_def * -read_thread_local_id(struct lower_intrinsics_state *state) -{ - struct brw_cs_prog_data *prog_data = state->prog_data; - nir_builder *b = &state->builder; - nir_shader *nir = state->nir; - const unsigned *sizes = nir->info.cs.local_size; - const unsigned group_size = sizes[0] * sizes[1] * sizes[2]; - - /* Some programs have local_size dimensions so small that the thread local - * ID will always be 0. - */ - if (group_size <= 8) - return nir_imm_int(b, 0); - - if (state->thread_local_id_index == -1) { - state->thread_local_id_index = prog_data->base.nr_params; - uint32_t *param = brw_stage_prog_data_add_params(&prog_data->base, 1); - *param = BRW_PARAM_BUILTIN_THREAD_LOCAL_ID; - nir->num_uniforms += 4; - } - unsigned id_index = state->thread_local_id_index; - - nir_intrinsic_instr *load = - nir_intrinsic_instr_create(nir, nir_intrinsic_load_uniform); - load->num_components = 1; - load->src[0] = nir_src_for_ssa(nir_imm_int(b, 0)); - nir_ssa_dest_init(&load->instr, &load->dest, 1, 32, NULL); - nir_intrinsic_set_base(load, id_index * sizeof(uint32_t)); - nir_intrinsic_set_range(load, sizeof(uint32_t)); - nir_builder_instr_insert(b, &load->instr); - return &load->dest.ssa; -} - static bool lower_cs_intrinsics_convert_block(struct lower_intrinsics_state *state, nir_block *block) @@ -91,7 +56,12 @@ lower_cs_intrinsics_convert_block(struct lower_intrinsics_state *state, * gl_LocalInvocationIndex = * cs_thread_local_id + subgroup_invocation; */ - nir_ssa_def *thread_local_id = read_thread_local_id(state); + nir_ssa_def *thread_local_id; + if (state->local_workgroup_size <= 8) + thread_local_id = nir_imm_int(b, 0); + else + thread_local_id = nir_load_intel_thread_local_id(b); + nir_ssa_def *channel = nir_load_subgroup_invocation(b); sysval = nir_iadd(b, channel, thread_local_id); break; @@ -157,8 +127,7 @@ lower_cs_intrinsics_convert_impl(struct lower_intrinsics_state *state) } bool -brw_nir_lower_cs_intrinsics(nir_shader *nir, - struct brw_cs_prog_data *prog_data) +brw_nir_lower_cs_intrinsics(nir_shader *nir) { assert(nir->info.stage == MESA_SHADER_COMPUTE); @@ -166,9 +135,9 @@ brw_nir_lower_cs_intrinsics(nir_shader *nir, struct lower_intrinsics_state state; memset(&state, 0, sizeof(state)); state.nir = nir; - state.prog_data = prog_data; - - state.thread_local_id_index = -1; + state.local_workgroup_size = nir->info.cs.local_size[0] * + nir->info.cs.local_size[1] * + nir->info.cs.local_size[2]; do { state.progress = false; -- 2.30.2