Clear the workgroup size for all supported shader stages.
Also, unify the workgroup size calculation accross various places.
As a result, insert_waitcnt can use the proper workgroup size
which means that some waits can be dropped from tessellation
shaders. Also, in cases where the previous calculation was wrong,
we now insert s_barrier instructions.
Totals from affected shaders (GFX10):
Code Size: 340116 -> 338484 (-0.48 %) bytes
Fixes: a8d15ab6daf0a07476e9dfabe513c0f1e0f3bf82
Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4165>
}
if (instr->format == Format::PSEUDO_BARRIER) {
- uint32_t workgroup_size = UINT32_MAX;
- if (ctx.program->stage & sw_cs) {
- unsigned* bsize = ctx.program->info->cs.block_size;
- workgroup_size = bsize[0] * bsize[1] * bsize[2];
- }
switch (instr->opcode) {
case aco_opcode::p_memory_barrier_common:
imm.combine(ctx.barrier_imm[ffs(barrier_atomic) - 1]);
imm.combine(ctx.barrier_imm[ffs(barrier_buffer) - 1]);
imm.combine(ctx.barrier_imm[ffs(barrier_image) - 1]);
- if (workgroup_size > ctx.program->wave_size)
+ if (ctx.program->workgroup_size > ctx.program->wave_size)
imm.combine(ctx.barrier_imm[ffs(barrier_shared) - 1]);
break;
case aco_opcode::p_memory_barrier_atomic:
imm.combine(ctx.barrier_imm[ffs(barrier_image) - 1]);
break;
case aco_opcode::p_memory_barrier_shared:
- if (workgroup_size > ctx.program->wave_size)
+ if (ctx.program->workgroup_size > ctx.program->wave_size)
imm.combine(ctx.barrier_imm[ffs(barrier_shared) - 1]);
break;
case aco_opcode::p_memory_barrier_gs_data:
break;
}
- if (ctx->shader->info.stage == MESA_SHADER_COMPUTE) {
- unsigned* bsize = ctx->program->info->cs.block_size;
- unsigned workgroup_size = bsize[0] * bsize[1] * bsize[2];
- if (workgroup_size > ctx->program->wave_size)
- bld.sopp(aco_opcode::s_barrier);
- } else if (ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) {
- /* For each patch provided during rendering, n TCS shader invocations will be processed,
- * where n is the number of vertices in the output patch.
- */
- unsigned workgroup_size = ctx->tcs_num_patches * ctx->shader->info.tess.tcs_vertices_out;
- if (workgroup_size > ctx->program->wave_size)
- bld.sopp(aco_opcode::s_barrier);
- } else {
- /* We don't know the workgroup size, so always emit the s_barrier. */
+ if (ctx->program->workgroup_size > ctx->program->wave_size)
bld.sopp(aco_opcode::s_barrier);
- }
break;
}
Builder bld(ctx->program, ctx->block);
bld.barrier(aco_opcode::p_memory_barrier_shared);
- unsigned workgroup_size = ctx->tcs_num_patches * ctx->shader->info.tess.tcs_vertices_out;
- if (unlikely(ctx->program->chip_class != GFX6 && workgroup_size > ctx->program->wave_size))
+ if (unlikely(ctx->program->chip_class != GFX6 && ctx->program->workgroup_size > ctx->program->wave_size))
bld.sopp(aco_opcode::s_barrier);
Temp tcs_rel_ids = get_arg(ctx, ctx->args->ac.tcs_rel_ids);
program->sgpr_limit = 104;
}
- calc_min_waves(program);
- program->vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves);
- program->sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves);
-
isel_context ctx = {};
ctx.program = program;
ctx.args = args;
ctx.options = args->options;
ctx.stage = program->stage;
- if (ctx.stage == tess_control_hs) {
+ /* TODO: Check if we need to adjust min_waves for unknown workgroup sizes. */
+ if (program->stage & (hw_vs | hw_fs)) {
+ /* PS and legacy VS have separate waves, no workgroups */
+ program->workgroup_size = program->wave_size;
+ } else if (program->stage == compute_cs) {
+ /* CS sets the workgroup size explicitly */
+ unsigned* bsize = program->info->cs.block_size;
+ program->workgroup_size = bsize[0] * bsize[1] * bsize[2];
+ } else if ((program->stage & hw_es) || program->stage == geometry_gs) {
+ /* Unmerged ESGS operate in workgroups if on-chip GS (LDS rings) are enabled on GFX7-8 (not implemented in Mesa) */
+ program->workgroup_size = program->wave_size;
+ } else if (program->stage & hw_gs) {
+ /* If on-chip GS (LDS rings) are enabled on GFX9 or later, merged GS operates in workgroups */
+ program->workgroup_size = UINT_MAX; /* TODO: set by VGT_GS_ONCHIP_CNTL, which is not plumbed to ACO */
+ } else if (program->stage == vertex_ls) {
+ /* Unmerged LS operates in workgroups */
+ program->workgroup_size = UINT_MAX; /* TODO: probably tcs_num_patches * tcs_vertices_in, but those are not plumbed to ACO for LS */
+ } else if (program->stage == tess_control_hs) {
+ /* Unmerged HS operates in workgroups, size is determined by the output vertices */
setup_tcs_info(&ctx, shaders[0]);
- } else if (ctx.stage == vertex_tess_control_hs) {
+ program->workgroup_size = ctx.tcs_num_patches * shaders[0]->info.tess.tcs_vertices_out;
+ } else if (program->stage == vertex_tess_control_hs) {
+ /* Merged LSHS operates in workgroups, but can still have a different number of LS and HS invocations */
setup_tcs_info(&ctx, shaders[1]);
+ program->workgroup_size = ctx.tcs_num_patches * MAX2(shaders[1]->info.tess.tcs_vertices_out, ctx.args->options->key.tcs.input_vertices);
+ } else {
+ unreachable("Unsupported shader stage.");
}
+ calc_min_waves(program);
+ program->vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves);
+ program->sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves);
+
get_io_masks(&ctx, shader_count, shaders);
unsigned scratch_size = 0;
uint16_t physical_sgprs;
uint16_t sgpr_alloc_granule; /* minus one. must be power of two */
uint16_t vgpr_alloc_granule; /* minus one. must be power of two */
+ unsigned workgroup_size; /* if known; otherwise UINT_MAX */
bool needs_vcc = false;
bool needs_xnack_mask = false;
unsigned calc_waves_per_workgroup(Program *program)
{
- unsigned workgroup_size = program->wave_size;
- if (program->stage == compute_cs) {
- unsigned* bsize = program->info->cs.block_size;
- workgroup_size = bsize[0] * bsize[1] * bsize[2];
- }
+ /* When workgroup size is not known, just go with wave_size */
+ unsigned workgroup_size = program->workgroup_size == UINT_MAX
+ ? program->wave_size
+ : program->workgroup_size;
+
return align(workgroup_size, program->wave_size) / program->wave_size;
}
} /* end namespace */