From b5c9688516d00b00184e6fa6868de826916f5fc3 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Wed, 18 Dec 2019 16:18:35 +0000 Subject: [PATCH] aco: limit register usage for large work groups MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann --- .../aco_instruction_selection_setup.cpp | 5 +++ src/amd/compiler/aco_ir.h | 2 ++ src/amd/compiler/aco_live_var_analysis.cpp | 32 +++++++++++++++---- src/amd/compiler/aco_scheduler.cpp | 1 + 4 files changed, 33 insertions(+), 7 deletions(-) diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp index d55d4105d39..8a461e7e929 100644 --- a/src/amd/compiler/aco_instruction_selection_setup.cpp +++ b/src/amd/compiler/aco_instruction_selection_setup.cpp @@ -819,9 +819,14 @@ setup_isel_context(Program* program, program->sgpr_alloc_granule = 7; program->sgpr_limit = 104; } + /* TODO: we don't have to allocate VCC if we don't need it */ program->needs_vcc = true; + calc_min_waves(program); + program->vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves); + program->sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves); + isel_context ctx = {}; ctx.program = program; ctx.args = args; diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index 82309ba8e38..04647981b68 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -1155,6 +1155,7 @@ public: Temp private_segment_buffer; Temp scratch_offset; + uint16_t min_waves = 0; uint16_t lds_alloc_granule; uint32_t lds_limit; /* in bytes */ uint16_t vgpr_limit; @@ -1216,6 +1217,7 @@ void select_program(Program *program, void lower_wqm(Program* program, live& live_vars, const struct radv_nir_compiler_options *options); void lower_bool_phis(Program* program); +void calc_min_waves(Program* program); void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand); live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options); std::vector dead_code_analysis(Program *program); diff --git a/src/amd/compiler/aco_live_var_analysis.cpp b/src/amd/compiler/aco_live_var_analysis.cpp index 2841ba208f6..44a3ea53430 100644 --- a/src/amd/compiler/aco_live_var_analysis.cpp +++ b/src/amd/compiler/aco_live_var_analysis.cpp @@ -228,6 +228,16 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block, assert(block->index != 0 || new_demand == RegisterDemand()); } + +unsigned calc_waves_per_workgroup(Program *program) +{ + unsigned workgroup_size = program->wave_size; + if (program->stage == compute_cs) { + unsigned* bsize = program->info->cs.block_size; + workgroup_size = bsize[0] * bsize[1] * bsize[2]; + } + return align(workgroup_size, program->wave_size) / program->wave_size; +} } /* end namespace */ uint16_t get_extra_sgprs(Program *program) @@ -284,6 +294,20 @@ uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves) return std::min(vgprs, program->vgpr_limit); } +void calc_min_waves(Program* program) +{ + unsigned waves_per_workgroup = calc_waves_per_workgroup(program); + /* currently min_waves is in wave64 waves */ + if (program->wave_size == 32) + waves_per_workgroup = DIV_ROUND_UP(waves_per_workgroup, 2); + + unsigned simd_per_cu = 4; /* TODO: different on Navi */ + bool wgp = program->chip_class >= GFX10; /* assume WGP is used on Navi */ + unsigned simd_per_cu_wgp = wgp ? simd_per_cu * 2 : simd_per_cu; + + program->min_waves = DIV_ROUND_UP(waves_per_workgroup, simd_per_cu_wgp); +} + void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand) { /* TODO: max_waves_per_simd, simd_per_cu and the number of physical vgprs for Navi */ @@ -304,13 +328,7 @@ void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand) program->max_waves = max_waves_per_simd; /* adjust max_waves for workgroup and LDS limits */ - unsigned workgroup_size = program->wave_size; - if (program->stage == compute_cs) { - unsigned* bsize = program->info->cs.block_size; - workgroup_size = bsize[0] * bsize[1] * bsize[2]; - } - unsigned waves_per_workgroup = align(workgroup_size, program->wave_size) / program->wave_size; - + unsigned waves_per_workgroup = calc_waves_per_workgroup(program); unsigned workgroups_per_cu_wgp = max_waves_per_simd * simd_per_cu_wgp / waves_per_workgroup; if (program->config->lds_size) { unsigned lds = program->config->lds_size * program->lds_alloc_granule; diff --git a/src/amd/compiler/aco_scheduler.cpp b/src/amd/compiler/aco_scheduler.cpp index 665fcb2db6f..0a8d5af8c78 100644 --- a/src/amd/compiler/aco_scheduler.cpp +++ b/src/amd/compiler/aco_scheduler.cpp @@ -932,6 +932,7 @@ void schedule_program(Program *program, live& live_vars) ctx.num_waves = 7; else ctx.num_waves = 8; + ctx.num_waves = std::max(ctx.num_waves, program->min_waves); assert(ctx.num_waves > 0 && ctx.num_waves <= program->num_waves); ctx.max_registers = { int16_t(get_addr_vgpr_from_waves(program, ctx.num_waves) - 2), -- 2.30.2