struct {
uint16_t local_size[3];
+ uint16_t max_variable_local_size;
bool local_size_variable:1;
uint8_t user_data_components_amd:3;
BRW_PARAM_BUILTIN_BASE_WORK_GROUP_ID_Y,
BRW_PARAM_BUILTIN_BASE_WORK_GROUP_ID_Z,
BRW_PARAM_BUILTIN_SUBGROUP_ID,
+ BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X,
+ BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_Y,
+ BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_Z,
};
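/* Illustrative sketch (not part of the patch): the new builtins are just
 * placeholders in prog_data->param, so a driver would resolve them to the
 * actual dispatch-time group size when filling push constants. The helper
 * name and the group_size argument below are assumptions for illustration.
 */
static uint32_t
resolve_cs_builtin_sketch(uint32_t param, const uint32_t group_size[3])
{
   switch (param) {
   case BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X: return group_size[0];
   case BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_Y: return group_size[1];
   case BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_Z: return group_size[2];
   default: return 0; /* other builtins are resolved elsewhere */
   }
}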
#define BRW_PARAM_BUILTIN_CLIP_PLANE(idx, comp) \
struct brw_stage_prog_data base;
unsigned local_size[3];
+ unsigned max_variable_local_size;
unsigned simd_size;
unsigned threads;
unsigned slm_size;
bool uses_barrier;
bool uses_num_work_groups;
+ bool uses_variable_group_size;
struct {
struct brw_push_const_block cross_thread;
this->pull_constant_loc = v->pull_constant_loc;
this->uniforms = v->uniforms;
this->subgroup_id = v->subgroup_id;
+ for (unsigned i = 0; i < ARRAY_SIZE(this->group_size); i++)
+ this->group_size[i] = v->group_size[i];
}
void
cs_set_simd_size(struct brw_cs_prog_data *cs_prog_data, unsigned size)
{
cs_prog_data->simd_size = size;
- unsigned group_size = cs_prog_data->local_size[0] *
- cs_prog_data->local_size[1] * cs_prog_data->local_size[2];
- cs_prog_data->threads = (group_size + size - 1) / size;
+
+ unsigned group_size;
+ if (cs_prog_data->uses_variable_group_size) {
+ group_size = cs_prog_data->max_variable_local_size;
+ } else {
+ group_size = cs_prog_data->local_size[0] *
+ cs_prog_data->local_size[1] *
+ cs_prog_data->local_size[2];
+ }
+ cs_prog_data->threads = DIV_ROUND_UP(group_size, size);
}
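/* A quick standalone check of the thread-count math above (a sketch, not
 * part of the patch; DIV_ROUND_UP is redefined locally so it compiles on
 * its own): with SIMD16, a fixed 8x8x1 group needs 4 HW threads, while a
 * variable-size group that may be dispatched with up to 100 invocations
 * (max_variable_local_size) needs 7, i.e. enough threads for the worst
 * case.
 */
#include <assert.h>
#define DIV_ROUND_UP_SKETCH(n, d) (((n) + (d) - 1) / (d))

static void
check_cs_thread_count_sketch(void)
{
   assert(DIV_ROUND_UP_SKETCH(8 * 8 * 1, 16) == 4); /* fixed local size */
   assert(DIV_ROUND_UP_SKETCH(100, 16) == 7);       /* max_variable_local_size */
}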
static nir_shader *
char **error_str)
{
prog_data->base.total_shared = src_shader->info.cs.shared_size;
- prog_data->local_size[0] = src_shader->info.cs.local_size[0];
- prog_data->local_size[1] = src_shader->info.cs.local_size[1];
- prog_data->local_size[2] = src_shader->info.cs.local_size[2];
prog_data->slm_size = src_shader->num_shared;
- unsigned local_workgroup_size =
- src_shader->info.cs.local_size[0] * src_shader->info.cs.local_size[1] *
- src_shader->info.cs.local_size[2];
+
+ unsigned local_workgroup_size;
+ if (prog_data->uses_variable_group_size) {
+ prog_data->max_variable_local_size =
+ src_shader->info.cs.max_variable_local_size;
+ local_workgroup_size = src_shader->info.cs.max_variable_local_size;
+ } else {
+ prog_data->local_size[0] = src_shader->info.cs.local_size[0];
+ prog_data->local_size[1] = src_shader->info.cs.local_size[1];
+ prog_data->local_size[2] = src_shader->info.cs.local_size[2];
+ local_workgroup_size = src_shader->info.cs.local_size[0] *
+ src_shader->info.cs.local_size[1] * src_shader->info.cs.local_size[2];
+ }
/* Limit max_threads to 64 for the GPGPU_WALKER command */
const uint32_t max_threads = MIN2(64, compiler->devinfo->max_cs_threads);
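/* A hedged sketch of why max_threads matters here: whatever SIMD width is
 * chosen further down must let the whole (possibly worst-case) workgroup
 * fit in max_threads HW threads. The helper below is illustrative only and
 * is not the selection logic brw_compile_cs actually uses.
 */
static unsigned
min_simd_width_sketch(unsigned local_workgroup_size, unsigned max_threads)
{
   unsigned width = 8;
   while (width < 32 &&
          (local_workgroup_size + width - 1) / width > max_threads)
      width *= 2;
   return width; /* 8, 16 or 32 */
}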
int *push_constant_loc;
fs_reg subgroup_id;
+ fs_reg group_size[3];
fs_reg scratch_base;
fs_reg frag_depth;
fs_reg frag_stencil;
uniforms = nir->num_uniforms / 4;
if (stage == MESA_SHADER_COMPUTE) {
- /* Add a uniform for the thread local id. It must be the last uniform
- * on the list.
- */
+ /* Add uniforms for builtins after regular NIR uniforms. */
assert(uniforms == prog_data->nr_params);
- uint32_t *param = brw_stage_prog_data_add_params(prog_data, 1);
+
+ uint32_t *param;
+ if (brw_cs_prog_data(prog_data)->uses_variable_group_size) {
+ param = brw_stage_prog_data_add_params(prog_data, 3);
+ for (unsigned i = 0; i < 3; i++) {
+ param[i] = (BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X + i);
+ group_size[i] = fs_reg(UNIFORM, uniforms++, BRW_REGISTER_TYPE_UD);
+ }
+ }
+
+ /* Subgroup ID must be the last uniform on the list. This will make it
+ * easier later to split between cross-thread and per-thread uniforms.
+ */
+ param = brw_stage_prog_data_add_params(prog_data, 1);
*param = BRW_PARAM_BUILTIN_SUBGROUP_ID;
subgroup_id = fs_reg(UNIFORM, uniforms++, BRW_REGISTER_TYPE_UD);
}
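/* Resulting uniform layout for a variable-group-size shader (an
 * illustrative summary, not copied from the code), with n regular NIR
 * uniforms:
 *
 *   param[n + 0]  BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X
 *   param[n + 1]  BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_Y
 *   param[n + 2]  BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_Z
 *   param[n + 3]  BRW_PARAM_BUILTIN_SUBGROUP_ID  (last, so the later
 *                 cross_thread/per-thread split stays simple)
 */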
* invocations are already executed lock-step. Instead of an actual
* barrier, just emit a scheduling fence, which will generate no code.
*/
- if (workgroup_size() <= dispatch_width) {
+ if (!cs_prog_data->uses_variable_group_size &&
+ workgroup_size() <= dispatch_width) {
bld.exec_all().group(1, 0).emit(FS_OPCODE_SCHEDULING_FENCE);
break;
}
break;
}
+ case nir_intrinsic_load_local_group_size: {
+ for (unsigned i = 0; i < 3; i++) {
+ bld.MOV(retype(offset(dest, bld, i), BRW_REGISTER_TYPE_UD),
+ group_size[i]);
+ }
+ break;
+ }
+
default:
nir_emit_intrinsic(bld, instr);
break;
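/* The fence shortcuts above and below compare workgroup_size() against the
 * dispatch width. Presumably that helper is just the product of the
 * compile-time local size, roughly (a sketch, not the actual helper):
 */
static unsigned
workgroup_size_sketch(const struct brw_cs_prog_data *cs_prog_data)
{
   return cs_prog_data->local_size[0] *
          cs_prog_data->local_size[1] *
          cs_prog_data->local_size[2];
}
/* With a variable group size that product is unknown at compile time, which
 * is why both shortcuts now also require !uses_variable_group_size.
 */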
*
* TODO: Check whether this applies when many HW threads share the same Data Port.
*/
- if (slm_fence && workgroup_size() <= dispatch_width)
+ if (!brw_cs_prog_data(prog_data)->uses_variable_group_size &&
+ slm_fence && workgroup_size() <= dispatch_width)
slm_fence = false;
/* Prior to Gen11, there's only L3 fence, so emit that instead. */
nir_ssa_def *channel = nir_load_subgroup_invocation(b);
nir_ssa_def *linear = nir_iadd(b, channel, thread_local_id);
- nir_ssa_def *size_x = nir_imm_int(b, nir->info.cs.local_size[0]);
- nir_ssa_def *size_y = nir_imm_int(b, nir->info.cs.local_size[1]);
+ nir_ssa_def *size_x;
+ nir_ssa_def *size_y;
+ if (state->nir->info.cs.local_size_variable) {
+ nir_ssa_def *size_xyz = nir_load_local_group_size(b);
+ size_x = nir_channel(b, size_xyz, 0);
+ size_y = nir_channel(b, size_xyz, 1);
+ } else {
+ size_x = nir_imm_int(b, nir->info.cs.local_size[0]);
+ size_y = nir_imm_int(b, nir->info.cs.local_size[1]);
+ }
/* The local invocation index and ID must respect the following
*
break;
case nir_intrinsic_load_num_subgroups: {
- unsigned local_workgroup_size =
- nir->info.cs.local_size[0] * nir->info.cs.local_size[1] *
- nir->info.cs.local_size[2];
- unsigned num_subgroups =
- DIV_ROUND_UP(local_workgroup_size, state->dispatch_width);
- sysval = nir_imm_int(b, num_subgroups);
+ if (state->nir->info.cs.local_size_variable) {
+ nir_ssa_def *size_xyz = nir_load_local_group_size(b);
+ nir_ssa_def *size_x = nir_channel(b, size_xyz, 0);
+ nir_ssa_def *size_y = nir_channel(b, size_xyz, 1);
+ nir_ssa_def *size_z = nir_channel(b, size_xyz, 2);
+ nir_ssa_def *size = nir_imul(b, nir_imul(b, size_x, size_y), size_z);
+
+ /* Calculate the equivalent of DIV_ROUND_UP. */
+ sysval = nir_idiv(b,
+ nir_iadd_imm(b,
+ nir_iadd_imm(b, size, state->dispatch_width), -1),
+ nir_imm_int(b, state->dispatch_width));
+ } else {
+ unsigned local_workgroup_size =
+ nir->info.cs.local_size[0] * nir->info.cs.local_size[1] *
+ nir->info.cs.local_size[2];
+ unsigned num_subgroups =
+ DIV_ROUND_UP(local_workgroup_size, state->dispatch_width);
+ sysval = nir_imm_int(b, num_subgroups);
+ }
break;
}
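/* For reference, the 3D local invocation ID is recovered from the linear
 * index using the relationship mandated by GLSL; with a variable group size
 * the sizes come from nir_load_local_group_size instead of immediates. A
 * standalone sketch of that relationship (illustrative, not the pass
 * itself):
 */
static void
local_id_from_index_sketch(unsigned index, const unsigned size[3],
                           unsigned id[3])
{
   /* index == id[2] * size[0] * size[1] + id[1] * size[0] + id[0] */
   id[0] = index % size[0];
   id[1] = (index / size[0]) % size[1];
   id[2] = index / (size[0] * size[1]);
}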
.dispatch_width = dispatch_width,
};
- assert(!nir->info.cs.local_size_variable);
- state.local_workgroup_size = nir->info.cs.local_size[0] *
- nir->info.cs.local_size[1] *
- nir->info.cs.local_size[2];
+ if (!nir->info.cs.local_size_variable) {
+ state.local_workgroup_size = nir->info.cs.local_size[0] *
+ nir->info.cs.local_size[1] *
+ nir->info.cs.local_size[2];
+ } else {
+ state.local_workgroup_size = nir->info.cs.max_variable_local_size;
+ }
/* Constraints from NV_compute_shader_derivatives. */
- if (nir->info.cs.derivative_group == DERIVATIVE_GROUP_QUADS) {
+ if (nir->info.cs.derivative_group == DERIVATIVE_GROUP_QUADS &&
+ !nir->info.cs.local_size_variable) {
assert(nir->info.cs.local_size[0] % 2 == 0);
assert(nir->info.cs.local_size[1] % 2 == 0);
- } else if (nir->info.cs.derivative_group == DERIVATIVE_GROUP_LINEAR) {
+ } else if (nir->info.cs.derivative_group == DERIVATIVE_GROUP_LINEAR &&
+ !nir->info.cs.local_size_variable) {
assert(state.local_workgroup_size % 4 == 0);
}
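/* A standalone sketch of the constraint the guarded asserts above encode
 * (the enum type name is assumed for illustration): derivatives need
 * complete 2x2 quads, so quad derivative groups need even X/Y sizes and
 * linear groups need a total size that is a multiple of 4. With a variable
 * group size neither value is known at compile time, so the checks are
 * skipped.
 */
static bool
derivative_group_size_ok_sketch(enum gl_derivative_group group,
                                const unsigned local_size[3])
{
   switch (group) {
   case DERIVATIVE_GROUP_QUADS:
      return local_size[0] % 2 == 0 && local_size[1] % 2 == 0;
   case DERIVATIVE_GROUP_LINEAR:
      return (local_size[0] * local_size[1] * local_size[2]) % 4 == 0;
   default:
      return true;
   }
}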