From: Timur Kristóf Date: Thu, 12 Mar 2020 15:28:48 +0000 (+0100) Subject: aco: Fix workgroup size calculation. X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=0f35b3795d131517c6dce15d86783dd98951548a;p=mesa.git aco: Fix workgroup size calculation. Clear the workgroup size for all supported shader stages. Also, unify the workgroup size calculation accross various places. As a result, insert_waitcnt can use the proper workgroup size which means that some waits can be dropped from tessellation shaders. Also, in cases where the previous calculation was wrong, we now insert s_barrier instructions. Totals from affected shaders (GFX10): Code Size: 340116 -> 338484 (-0.48 %) bytes Fixes: a8d15ab6daf0a07476e9dfabe513c0f1e0f3bf82 Signed-off-by: Timur Kristóf Reviewed-by: Daniel Schürmann Reviewed-by: Rhys Perry Part-of: --- diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp index 254eb97d151..09556d232b5 100644 --- a/src/amd/compiler/aco_insert_waitcnt.cpp +++ b/src/amd/compiler/aco_insert_waitcnt.cpp @@ -403,17 +403,12 @@ wait_imm kill(Instruction* instr, wait_ctx& ctx) } if (instr->format == Format::PSEUDO_BARRIER) { - uint32_t workgroup_size = UINT32_MAX; - if (ctx.program->stage & sw_cs) { - unsigned* bsize = ctx.program->info->cs.block_size; - workgroup_size = bsize[0] * bsize[1] * bsize[2]; - } switch (instr->opcode) { case aco_opcode::p_memory_barrier_common: imm.combine(ctx.barrier_imm[ffs(barrier_atomic) - 1]); imm.combine(ctx.barrier_imm[ffs(barrier_buffer) - 1]); imm.combine(ctx.barrier_imm[ffs(barrier_image) - 1]); - if (workgroup_size > ctx.program->wave_size) + if (ctx.program->workgroup_size > ctx.program->wave_size) imm.combine(ctx.barrier_imm[ffs(barrier_shared) - 1]); break; case aco_opcode::p_memory_barrier_atomic: @@ -426,7 +421,7 @@ wait_imm kill(Instruction* instr, wait_ctx& ctx) imm.combine(ctx.barrier_imm[ffs(barrier_image) - 1]); break; case aco_opcode::p_memory_barrier_shared: - if (workgroup_size > ctx.program->wave_size) + if (ctx.program->workgroup_size > ctx.program->wave_size) imm.combine(ctx.barrier_imm[ffs(barrier_shared) - 1]); break; case aco_opcode::p_memory_barrier_gs_data: diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 4ec971e4d6c..c2da6d6e238 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -6827,22 +6827,8 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) break; } - if (ctx->shader->info.stage == MESA_SHADER_COMPUTE) { - unsigned* bsize = ctx->program->info->cs.block_size; - unsigned workgroup_size = bsize[0] * bsize[1] * bsize[2]; - if (workgroup_size > ctx->program->wave_size) - bld.sopp(aco_opcode::s_barrier); - } else if (ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) { - /* For each patch provided during rendering, n​ TCS shader invocations will be processed, - * where n​ is the number of vertices in the output patch. - */ - unsigned workgroup_size = ctx->tcs_num_patches * ctx->shader->info.tess.tcs_vertices_out; - if (workgroup_size > ctx->program->wave_size) - bld.sopp(aco_opcode::s_barrier); - } else { - /* We don't know the workgroup size, so always emit the s_barrier. */ + if (ctx->program->workgroup_size > ctx->program->wave_size) bld.sopp(aco_opcode::s_barrier); - } break; } @@ -9374,8 +9360,7 @@ static void write_tcs_tess_factors(isel_context *ctx) Builder bld(ctx->program, ctx->block); bld.barrier(aco_opcode::p_memory_barrier_shared); - unsigned workgroup_size = ctx->tcs_num_patches * ctx->shader->info.tess.tcs_vertices_out; - if (unlikely(ctx->program->chip_class != GFX6 && workgroup_size > ctx->program->wave_size)) + if (unlikely(ctx->program->chip_class != GFX6 && ctx->program->workgroup_size > ctx->program->wave_size)) bld.sopp(aco_opcode::s_barrier); Temp tcs_rel_ids = get_arg(ctx, ctx->args->ac.tcs_rel_ids); diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp index 75f1f9b4881..bd90dcae83d 100644 --- a/src/amd/compiler/aco_instruction_selection_setup.cpp +++ b/src/amd/compiler/aco_instruction_selection_setup.cpp @@ -1238,22 +1238,45 @@ setup_isel_context(Program* program, program->sgpr_limit = 104; } - calc_min_waves(program); - program->vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves); - program->sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves); - isel_context ctx = {}; ctx.program = program; ctx.args = args; ctx.options = args->options; ctx.stage = program->stage; - if (ctx.stage == tess_control_hs) { + /* TODO: Check if we need to adjust min_waves for unknown workgroup sizes. */ + if (program->stage & (hw_vs | hw_fs)) { + /* PS and legacy VS have separate waves, no workgroups */ + program->workgroup_size = program->wave_size; + } else if (program->stage == compute_cs) { + /* CS sets the workgroup size explicitly */ + unsigned* bsize = program->info->cs.block_size; + program->workgroup_size = bsize[0] * bsize[1] * bsize[2]; + } else if ((program->stage & hw_es) || program->stage == geometry_gs) { + /* Unmerged ESGS operate in workgroups if on-chip GS (LDS rings) are enabled on GFX7-8 (not implemented in Mesa) */ + program->workgroup_size = program->wave_size; + } else if (program->stage & hw_gs) { + /* If on-chip GS (LDS rings) are enabled on GFX9 or later, merged GS operates in workgroups */ + program->workgroup_size = UINT_MAX; /* TODO: set by VGT_GS_ONCHIP_CNTL, which is not plumbed to ACO */ + } else if (program->stage == vertex_ls) { + /* Unmerged LS operates in workgroups */ + program->workgroup_size = UINT_MAX; /* TODO: probably tcs_num_patches * tcs_vertices_in, but those are not plumbed to ACO for LS */ + } else if (program->stage == tess_control_hs) { + /* Unmerged HS operates in workgroups, size is determined by the output vertices */ setup_tcs_info(&ctx, shaders[0]); - } else if (ctx.stage == vertex_tess_control_hs) { + program->workgroup_size = ctx.tcs_num_patches * shaders[0]->info.tess.tcs_vertices_out; + } else if (program->stage == vertex_tess_control_hs) { + /* Merged LSHS operates in workgroups, but can still have a different number of LS and HS invocations */ setup_tcs_info(&ctx, shaders[1]); + program->workgroup_size = ctx.tcs_num_patches * MAX2(shaders[1]->info.tess.tcs_vertices_out, ctx.args->options->key.tcs.input_vertices); + } else { + unreachable("Unsupported shader stage."); } + calc_min_waves(program); + program->vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves); + program->sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves); + get_io_masks(&ctx, shader_count, shaders); unsigned scratch_size = 0; diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index 0be646d8b0f..73a1d394eff 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -1250,6 +1250,7 @@ public: uint16_t physical_sgprs; uint16_t sgpr_alloc_granule; /* minus one. must be power of two */ uint16_t vgpr_alloc_granule; /* minus one. must be power of two */ + unsigned workgroup_size; /* if known; otherwise UINT_MAX */ bool needs_vcc = false; bool needs_xnack_mask = false; diff --git a/src/amd/compiler/aco_live_var_analysis.cpp b/src/amd/compiler/aco_live_var_analysis.cpp index d4383cf5887..e223d6d5f84 100644 --- a/src/amd/compiler/aco_live_var_analysis.cpp +++ b/src/amd/compiler/aco_live_var_analysis.cpp @@ -289,11 +289,11 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block, unsigned calc_waves_per_workgroup(Program *program) { - unsigned workgroup_size = program->wave_size; - if (program->stage == compute_cs) { - unsigned* bsize = program->info->cs.block_size; - workgroup_size = bsize[0] * bsize[1] * bsize[2]; - } + /* When workgroup size is not known, just go with wave_size */ + unsigned workgroup_size = program->workgroup_size == UINT_MAX + ? program->wave_size + : program->workgroup_size; + return align(workgroup_size, program->wave_size) / program->wave_size; } } /* end namespace */