From 0a3acff5b53d409181dcd2f31a4a50af06f73a57 Mon Sep 17 00:00:00 2001 From: Jordan Justen Date: Sun, 22 May 2016 22:31:06 -0700 Subject: [PATCH] i965: Remove old CS local ID handling The old method pushed data for each channel's uvec3 data of gl_LocalInvocationID. The new method pushes 1 dword of data that is a 'thread local ID' value. Based on that value, we can generate gl_LocalInvocationIndex and gl_LocalInvocationID with some calculations. Cc: "12.0" Signed-off-by: Jordan Justen Reviewed-by: Jason Ekstrand --- src/intel/vulkan/anv_cmd_buffer.c | 5 +- src/mesa/drivers/dri/i965/brw_compiler.h | 8 -- src/mesa/drivers/dri/i965/brw_fs.cpp | 94 +------------------ src/mesa/drivers/dri/i965/brw_fs.h | 1 - src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 7 -- .../drivers/dri/i965/brw_nir_intrinsics.c | 7 -- src/mesa/drivers/dri/i965/gen7_cs_state.c | 5 +- 7 files changed, 3 insertions(+), 124 deletions(-) diff --git a/src/intel/vulkan/anv_cmd_buffer.c b/src/intel/vulkan/anv_cmd_buffer.c index edaaa3d2efa..3d37de2fbaf 100644 --- a/src/intel/vulkan/anv_cmd_buffer.c +++ b/src/intel/vulkan/anv_cmd_buffer.c @@ -1094,13 +1094,10 @@ anv_cmd_buffer_cs_push_constants(struct anv_cmd_buffer *cmd_buffer) } if (cs_prog_data->push.per_thread.size > 0) { - brw_cs_fill_local_id_payload(cs_prog_data, u32_map, cs_prog_data->threads, - cs_prog_data->push.per_thread.size); for (unsigned t = 0; t < cs_prog_data->threads; t++) { unsigned dst = 8 * (cs_prog_data->push.per_thread.regs * t + - cs_prog_data->push.cross_thread.regs + - cs_prog_data->local_invocation_id_regs); + cs_prog_data->push.cross_thread.regs); unsigned src = cs_prog_data->push.cross_thread.dwords; for ( ; src < prog_data->nr_params; src++, dst++) { if (src != cs_prog_data->thread_local_id_index) { diff --git a/src/mesa/drivers/dri/i965/brw_compiler.h b/src/mesa/drivers/dri/i965/brw_compiler.h index dda62974297..6e6d20c7d38 100644 --- a/src/mesa/drivers/dri/i965/brw_compiler.h +++ b/src/mesa/drivers/dri/i965/brw_compiler.h @@ 
-439,7 +439,6 @@ struct brw_cs_prog_data { unsigned threads; bool uses_barrier; bool uses_num_work_groups; - unsigned local_invocation_id_regs; int thread_local_id_index; struct { @@ -831,13 +830,6 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data, unsigned *final_assembly_size, char **error_str); -/** - * Fill out local id payload for compute shader according to cs_prog_data. - */ -void -brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *cs_prog_data, - void *buffer, uint32_t threads, uint32_t stride); - #ifdef __cplusplus } /* extern "C" */ #endif diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index 0b766a4a848..9abe73acef2 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -5581,31 +5581,6 @@ fs_visitor::setup_vs_payload() payload.num_regs = 2; } -/** - * We are building the local ID push constant data using the simplest possible - * method. We simply push the local IDs directly as they should appear in the - * registers for the uvec3 gl_LocalInvocationID variable. - * - * Therefore, for SIMD8, we use 3 full registers, and for SIMD16 we use 6 - * registers worth of push constant space. - * - * Note: Any updates to brw_cs_prog_local_id_payload_dwords, - * fill_local_id_payload or fs_visitor::emit_cs_local_invocation_id_setup need - * to coordinated. - * - * FINISHME: There are a few easy optimizations to consider. - * - * 1. If gl_WorkGroupSize x, y or z is 1, we can just use zero, and there is - * no need for using push constant space for that dimension. - * - * 2. Since GL_MAX_COMPUTE_WORK_GROUP_SIZE is currently 1024 or less, we can - * easily use 16-bit words rather than 32-bit dwords in the push constant - * data. - * - * 3. If gl_WorkGroupSize x, y or z is small, then we can use bytes for - * conveying the data, and thereby reduce push constant usage. 
- * - */ void fs_visitor::setup_gs_payload() { @@ -5649,16 +5624,7 @@ void fs_visitor::setup_cs_payload() { assert(devinfo->gen >= 7); - brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data; - payload.num_regs = 1; - - if (nir->info.system_values_read & SYSTEM_BIT_LOCAL_INVOCATION_ID && - prog_data->thread_local_id_index < 0) { - prog_data->local_invocation_id_regs = dispatch_width * 3 / 8; - payload.local_invocation_id_reg = payload.num_regs; - payload.num_regs += prog_data->local_invocation_id_regs; - } } void @@ -6532,25 +6498,6 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data, return g.get_assembly(final_assembly_size); } -fs_reg * -fs_visitor::emit_cs_local_invocation_id_setup() -{ - assert(stage == MESA_SHADER_COMPUTE); - - fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::uvec3_type)); - - struct brw_reg src = - brw_vec8_grf(payload.local_invocation_id_reg, 0); - src = retype(src, BRW_REGISTER_TYPE_UD); - bld.MOV(*reg, src); - src.nr += dispatch_width / 8; - bld.MOV(offset(*reg, bld, 1), src); - src.nr += dispatch_width / 8; - bld.MOV(offset(*reg, bld, 2), src); - - return reg; -} - fs_reg * fs_visitor::emit_cs_work_group_id_setup() { @@ -6597,9 +6544,7 @@ cs_fill_push_const_info(const struct brw_device_info *devinfo, unsigned cross_thread_dwords, per_thread_dwords; if (!cross_thread_supported) { cross_thread_dwords = 0u; - per_thread_dwords = - 8 * cs_prog_data->local_invocation_id_regs + - prog_data->nr_params; + per_thread_dwords = prog_data->nr_params; } else if (fill_thread_id) { /* Fill all but the last register with cross-thread payload */ cross_thread_dwords = 8 * (cs_prog_data->thread_local_id_index / 8); @@ -6623,7 +6568,6 @@ cs_fill_push_const_info(const struct brw_device_info *devinfo, cs_prog_data->push.per_thread.size == 0); assert(cs_prog_data->push.cross_thread.dwords + cs_prog_data->push.per_thread.dwords == - 8 * cs_prog_data->local_invocation_id_regs + prog_data->nr_params); } @@ -6768,39 +6712,3 @@ 
brw_compile_cs(const struct brw_compiler *compiler, void *log_data, return g.get_assembly(final_assembly_size); } - -void -brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *prog_data, - void *buffer, uint32_t threads, uint32_t stride) -{ - if (prog_data->local_invocation_id_regs == 0) - return; - - /* 'stride' should be an integer number of registers, that is, a multiple - * of 32 bytes. - */ - assert(stride % 32 == 0); - - unsigned x = 0, y = 0, z = 0; - for (unsigned t = 0; t < threads; t++) { - uint32_t *param = (uint32_t *) buffer + stride * t / 4; - - for (unsigned i = 0; i < prog_data->simd_size; i++) { - param[0 * prog_data->simd_size + i] = x; - param[1 * prog_data->simd_size + i] = y; - param[2 * prog_data->simd_size + i] = z; - - x++; - if (x == prog_data->local_size[0]) { - x = 0; - y++; - if (y == prog_data->local_size[1]) { - y = 0; - z++; - if (z == prog_data->local_size[2]) - z = 0; - } - } - } - } -} diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h index 4c1ac9cedd2..4237197d8d2 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.h +++ b/src/mesa/drivers/dri/i965/brw_fs.h @@ -267,7 +267,6 @@ public: unsigned base_offset, const nir_src &offset_src, unsigned num_components); void emit_cs_terminate(); - fs_reg *emit_cs_local_invocation_id_setup(); fs_reg *emit_cs_work_group_id_setup(); void emit_barrier(); diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp index 81c72047e25..7fc43b5061d 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp @@ -272,13 +272,6 @@ emit_system_values_block(nir_block *block, fs_visitor *v) *reg = *v->emit_samplemaskin_setup(); break; - case nir_intrinsic_load_local_invocation_id: - assert(v->stage == MESA_SHADER_COMPUTE); - reg = &v->nir_system_values[SYSTEM_VALUE_LOCAL_INVOCATION_ID]; - if (reg->file == BAD_FILE) - *reg = *v->emit_cs_local_invocation_id_setup(); - break; - case 
nir_intrinsic_load_work_group_id: assert(v->stage == MESA_SHADER_COMPUTE); reg = &v->nir_system_values[SYSTEM_VALUE_WORK_GROUP_ID]; diff --git a/src/mesa/drivers/dri/i965/brw_nir_intrinsics.c b/src/mesa/drivers/dri/i965/brw_nir_intrinsics.c index 972b1171730..00155fbaffb 100644 --- a/src/mesa/drivers/dri/i965/brw_nir_intrinsics.c +++ b/src/mesa/drivers/dri/i965/brw_nir_intrinsics.c @@ -161,13 +161,6 @@ brw_nir_lower_intrinsics(nir_shader *nir, struct brw_stage_prog_data *prog_data) state.nir = nir; state.prog_data = prog_data; - /* Currently this pass only lowers intrinsics using the uniform specified - * by thread_local_id_index. - */ - if (nir->stage == MESA_SHADER_COMPUTE && - state.cs_prog_data->thread_local_id_index < 0) - return false; - do { state.progress = false; nir_foreach_function(function, nir) { diff --git a/src/mesa/drivers/dri/i965/gen7_cs_state.c b/src/mesa/drivers/dri/i965/gen7_cs_state.c index f97c26a7d4f..750aa2ccdf1 100644 --- a/src/mesa/drivers/dri/i965/gen7_cs_state.c +++ b/src/mesa/drivers/dri/i965/gen7_cs_state.c @@ -241,13 +241,10 @@ brw_upload_cs_push_constants(struct brw_context *brw, gl_constant_value thread_id; if (cs_prog_data->push.per_thread.size > 0) { - brw_cs_fill_local_id_payload(cs_prog_data, param, cs_prog_data->threads, - cs_prog_data->push.per_thread.size); for (unsigned t = 0; t < cs_prog_data->threads; t++) { unsigned dst = 8 * (cs_prog_data->push.per_thread.regs * t + - cs_prog_data->push.cross_thread.regs + - cs_prog_data->local_invocation_id_regs); + cs_prog_data->push.cross_thread.regs); unsigned src = cs_prog_data->push.cross_thread.dwords; for ( ; src < prog_data->nr_params; src++, dst++) { if (src != cs_prog_data->thread_local_id_index) -- 2.30.2