From 04fc72501a90af94b0b5699e57fea68ad6e8795b Mon Sep 17 00:00:00 2001 From: Jordan Justen Date: Sun, 22 May 2016 21:29:53 -0700 Subject: [PATCH] i965: Put CS local thread ID uniform in last push register This thread ID uniform will be used to compute the gl_LocalInvocationIndex and gl_LocalInvocationID values. It is important for this uniform to be added in the last push constant register. fs_visitor::assign_constant_locations is updated to make sure this happens. The reason this is important is that the cross-thread push constant registers are loaded first, and the per-thread push constant registers are loaded after that. (Broadwell adds another push constant upload mechanism which reverses this order, but we are ignoring this for now.) v2: * Add variable in intrinsics lowering pass * Make sure the ID is pushed last in assign_constant_locations, and that we save a spot for the ID in the push constants v3: * Simplify code based with Jason's suggestions. Cc: "12.0" Signed-off-by: Jordan Justen Reviewed-by: Jason Ekstrand --- src/mesa/drivers/dri/i965/brw_fs.cpp | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index 645f2c702f5..6bd28718280 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -2097,6 +2097,10 @@ fs_visitor::assign_constant_locations() bool contiguous[uniforms]; memset(contiguous, 0, sizeof(contiguous)); + int thread_local_id_index = + (stage == MESA_SHADER_COMPUTE) ? + ((brw_cs_prog_data*)stage_prog_data)->thread_local_id_index : -1; + /* First, we walk through the instructions and do two things: * * 1) Figure out which uniforms are live. @@ -2141,6 +2145,9 @@ fs_visitor::assign_constant_locations() } } + if (thread_local_id_index >= 0 && !is_live[thread_local_id_index]) + thread_local_id_index = -1; + /* Only allow 16 registers (128 uniform components) as push constants. * * Just demote the end of the list. We could probably do better @@ -2149,7 +2156,9 @@ fs_visitor::assign_constant_locations() * If changing this value, note the limitation about total_regs in * brw_curbe.c. */ - const unsigned int max_push_components = 16 * 8; + unsigned int max_push_components = 16 * 8; + if (thread_local_id_index >= 0) + max_push_components--; /* Save a slot for the thread ID */ /* We push small arrays, but no bigger than 16 floats. This is big enough * for a vec4 but hopefully not large enough to push out other stuff. We @@ -2187,6 +2196,10 @@ fs_visitor::assign_constant_locations() if (!is_live[u] || is_live_64bit[u]) continue; + /* Skip thread_local_id_index to put it in the last push register. */ + if (thread_local_id_index == (int)u) + continue; + set_push_pull_constant_loc(u, &chunk_start, contiguous[u], push_constant_loc, pull_constant_loc, &num_push_constants, &num_pull_constants, @@ -2194,6 +2207,10 @@ fs_visitor::assign_constant_locations() stage_prog_data); } + /* Add the CS local thread ID uniform at the end of the push constants */ + if (thread_local_id_index >= 0) + push_constant_loc[thread_local_id_index] = num_push_constants++; + /* As the uniforms are going to be reordered, take the data from a temporary * copy of the original param[]. */ @@ -2212,6 +2229,7 @@ fs_visitor::assign_constant_locations() * push_constant_loc[i] <= i and we can do it in one smooth loop without * having to make a copy. */ + int new_thread_local_id_index = -1; for (unsigned int i = 0; i < uniforms; i++) { const gl_constant_value *value = param[i]; @@ -2219,9 +2237,15 @@ fs_visitor::assign_constant_locations() stage_prog_data->pull_param[pull_constant_loc[i]] = value; } else if (push_constant_loc[i] != -1) { stage_prog_data->param[push_constant_loc[i]] = value; + if (thread_local_id_index == (int)i) + new_thread_local_id_index = push_constant_loc[i]; } } ralloc_free(param); + + if (stage == MESA_SHADER_COMPUTE) + ((brw_cs_prog_data*)stage_prog_data)->thread_local_id_index = + new_thread_local_id_index; } /** -- 2.30.2