From 08d510010b7586387e363460b98e6a45bbe97164 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Fri, 13 Sep 2019 16:41:00 +0100 Subject: [PATCH] aco: increase accuracy of SGPR limits MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit SGPRs are allocated in groups of 16 on GFX8/GFX9. GFX10 allocates a fixed number of SGPRs and has 106 addressable SGPRs. pipeline-db (Vega): SGPRS: 5912 -> 6232 (5.41 %) VGPRS: 1772 -> 1780 (0.45 %) Spilled SGPRs: 0 -> 0 (0.00 %) Spilled VGPRs: 0 -> 0 (0.00 %) Private memory VGPRs: 0 -> 0 (0.00 %) Scratch size: 0 -> 0 (0.00 %) dwords per thread Code Size: 88228 -> 87904 (-0.37 %) bytes LDS: 0 -> 0 (0.00 %) blocks Max Waves: 559 -> 571 (2.15 %) pipeline-db (Navi): SGPRS: 341256 -> 363384 (6.48 %) VGPRS: 171536 -> 170960 (-0.34 %) Spilled SGPRs: 832 -> 581 (-30.17 %) Spilled VGPRs: 0 -> 0 (0.00 %) Private memory VGPRs: 0 -> 0 (0.00 %) Scratch size: 0 -> 0 (0.00 %) dwords per thread Code Size: 14207332 -> 14190872 (-0.12 %) bytes LDS: 33 -> 33 (0.00 %) blocks Max Waves: 18072 -> 18251 (0.99 %) v2: unconditionally count vcc as an extra sgpr on GFX10+ v3: pass SGPRs rounded to 8 Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann --- .../aco_instruction_selection_setup.cpp | 22 ++++++- src/amd/compiler/aco_ir.h | 17 +++++- src/amd/compiler/aco_live_var_analysis.cpp | 60 +++++++++++++++---- src/amd/compiler/aco_register_allocation.cpp | 12 ++-- src/amd/compiler/aco_scheduler.cpp | 4 +- src/amd/compiler/aco_spill.cpp | 8 +-- 6 files changed, 95 insertions(+), 28 deletions(-) diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp index dce0894f4dc..d7a193552ba 100644 --- a/src/amd/compiler/aco_instruction_selection_setup.cpp +++ b/src/amd/compiler/aco_instruction_selection_setup.cpp @@ -1254,9 +1254,25 @@ setup_isel_context(Program* program, program->chip_class = options->chip_class; program->family = options->family; program->wave_size = 
options->wave_size; - program->sgpr_limit = options->chip_class >= GFX8 ? 102 : 104; - if (options->family == CHIP_TONGA || options->family == CHIP_ICELAND) - program->sgpr_limit = 94; /* workaround hardware bug */ + + if (options->chip_class >= GFX10) { + program->physical_sgprs = 2560; /* doesn't matter as long as it's at least 128 * 20 */ + program->sgpr_alloc_granule = 127; + program->sgpr_limit = 106; + } else if (program->chip_class >= GFX8) { + program->physical_sgprs = 800; + program->sgpr_alloc_granule = 15; + program->sgpr_limit = 102; + } else { + program->physical_sgprs = 512; + program->sgpr_alloc_granule = 7; + if (options->family == CHIP_TONGA || options->family == CHIP_ICELAND) + program->sgpr_limit = 94; /* workaround hardware bug */ + else + program->sgpr_limit = 104; + } + /* TODO: we don't have to allocate VCC if we don't need it */ + program->needs_vcc = true; for (unsigned i = 0; i < MAX_SETS; ++i) program->info->user_sgprs_locs.descriptor_sets[i].sgpr_idx = -1; diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index 7551225f718..3606b33402e 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -1062,7 +1062,6 @@ class Program final { public: std::vector blocks; RegisterDemand max_reg_demand = RegisterDemand(); - uint16_t sgpr_limit = 0; uint16_t num_waves = 0; ac_shader_config* config; struct radv_shader_info *info; @@ -1076,6 +1075,13 @@ public: std::vector constant_data; + uint16_t physical_sgprs; + uint16_t sgpr_alloc_granule; /* minus one. 
must be power of two */ + uint16_t sgpr_limit; + bool needs_vcc = false; + bool needs_xnack_mask = false; + bool needs_flat_scr = false; + uint32_t allocateId() { assert(allocationID <= 16777215); @@ -1154,6 +1160,15 @@ void perfwarn(bool cond, const char *msg, Instruction *instr=NULL); void aco_print_instr(Instruction *instr, FILE *output); void aco_print_program(Program *program, FILE *output); +/* number of sgprs that need to be allocated but might not be addressable as s0-s105 */ +uint16_t get_extra_sgprs(Program *program); + +/* get the number of sgprs that must be allocated to address a given number of sgprs */ +uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs); + +/* return number of addressable SGPRs for max_waves */ +uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves); + typedef struct { const int16_t opcode_gfx9[static_cast(aco_opcode::num_opcodes)]; const int16_t opcode_gfx10[static_cast(aco_opcode::num_opcodes)]; diff --git a/src/amd/compiler/aco_live_var_analysis.cpp b/src/amd/compiler/aco_live_var_analysis.cpp index f99e57c8b3a..3fe413256e7 100644 --- a/src/amd/compiler/aco_live_var_analysis.cpp +++ b/src/amd/compiler/aco_live_var_analysis.cpp @@ -28,6 +28,7 @@ */ #include "aco_ir.h" +#include "util/u_math.h" #include #include @@ -190,25 +191,62 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block, } } /* end namespace */ +uint16_t get_extra_sgprs(Program *program) +{ + if (program->chip_class >= GFX10) { + assert(!program->needs_flat_scr); + assert(!program->needs_xnack_mask); + return 2; + } else if (program->chip_class >= GFX8) { + if (program->needs_flat_scr) + return 6; + else if (program->needs_xnack_mask) + return 4; + else if (program->needs_vcc) + return 2; + else + return 0; + } else { + assert(!program->needs_xnack_mask); + if (program->needs_flat_scr) + return 4; + else if (program->needs_vcc) + return 2; + else + return 0; + } +} + +uint16_t get_sgpr_alloc(Program *program, uint16_t 
addressable_sgprs) +{ + assert(addressable_sgprs <= program->sgpr_limit); + uint16_t sgprs = addressable_sgprs + get_extra_sgprs(program); + uint16_t granule = program->sgpr_alloc_granule + 1; + return align(std::max(sgprs, granule), granule); +} + +uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves) +{ + uint16_t sgprs = program->physical_sgprs / max_waves & ~program->sgpr_alloc_granule; + sgprs -= get_extra_sgprs(program); + return std::min(sgprs, program->sgpr_limit); +} + void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand) { // TODO: also take shared mem into account - const int16_t total_sgpr_regs = program->chip_class >= GFX8 ? 800 : 512; - const int16_t max_addressible_sgpr = program->sgpr_limit; - /* VGPRs are allocated in chunks of 4 */ - const int16_t rounded_vgpr_demand = std::max(4, (new_demand.vgpr + 3) & ~3); - /* SGPRs are allocated in chunks of 16 between 8 and 104. VCC occupies the last 2 registers */ - const int16_t rounded_sgpr_demand = std::min(std::max(8, (new_demand.sgpr + 2 + 7) & ~7), max_addressible_sgpr); + const int16_t vgpr_alloc = std::max(4, (new_demand.vgpr + 3) & ~3); /* this won't compile, register pressure reduction necessary */ - if (new_demand.vgpr > 256 || new_demand.sgpr > max_addressible_sgpr) { + if (new_demand.vgpr > 256 || new_demand.sgpr > program->sgpr_limit) { program->num_waves = 0; program->max_reg_demand = new_demand; } else { - program->num_waves = std::min(10, - std::min(256 / rounded_vgpr_demand, - total_sgpr_regs / rounded_sgpr_demand)); + program->num_waves = program->physical_sgprs / get_sgpr_alloc(program, new_demand.sgpr); + program->num_waves = std::min(program->num_waves, 256 / vgpr_alloc); + program->num_waves = std::min(program->num_waves, 10); - program->max_reg_demand = { int16_t((256 / program->num_waves) & ~3), std::min(((total_sgpr_regs / program->num_waves) & ~7) - 2, max_addressible_sgpr)}; + program->max_reg_demand.vgpr = int16_t((256 / 
program->num_waves) & ~3); + program->max_reg_demand.sgpr = get_addr_sgpr_from_waves(program, program->num_waves); } } diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp index 47ea932f115..965fe15964a 100644 --- a/src/amd/compiler/aco_register_allocation.cpp +++ b/src/amd/compiler/aco_register_allocation.cpp @@ -34,6 +34,7 @@ #include "aco_ir.h" #include "sid.h" +#include "util/u_math.h" namespace aco { namespace { @@ -1914,12 +1915,11 @@ void register_allocation(Program *program, std::vector> live_out_ } /* num_gpr = rnd_up(max_used_gpr + 1) */ - program->config->num_vgprs = (ctx.max_used_vgpr + 1 + 3) & ~3; - if (program->family == CHIP_TONGA || program->family == CHIP_ICELAND) { - assert(ctx.max_used_sgpr <= 93); - ctx.max_used_sgpr = 93; /* workaround hardware bug */ - } - program->config->num_sgprs = (ctx.max_used_sgpr + 1 + 2 + 7) & ~7; /* + 2 sgprs for vcc */ + program->config->num_vgprs = align(ctx.max_used_vgpr + 1, 4); + if (program->family == CHIP_TONGA || program->family == CHIP_ICELAND) /* workaround hardware bug */ + program->config->num_sgprs = get_sgpr_alloc(program, program->sgpr_limit); + else + program->config->num_sgprs = align(ctx.max_used_sgpr + 1 + get_extra_sgprs(program), 8); } } diff --git a/src/amd/compiler/aco_scheduler.cpp b/src/amd/compiler/aco_scheduler.cpp index 67264fcf14f..1601545dcfe 100644 --- a/src/amd/compiler/aco_scheduler.cpp +++ b/src/amd/compiler/aco_scheduler.cpp @@ -806,9 +806,9 @@ void schedule_program(Program *program, live& live_vars) //TODO: this also increases window-size/max-moves? did I realize that at the time? ctx.num_waves = std::min(program->num_waves, 5); assert(ctx.num_waves); - uint16_t total_sgpr_regs = program->chip_class >= GFX8 ? 
800 : 512; + uint16_t total_sgpr_regs = program->physical_sgprs; uint16_t max_addressible_sgpr = program->sgpr_limit; - ctx.max_registers = { int16_t(((256 / ctx.num_waves) & ~3) - 2), std::min(((total_sgpr_regs / ctx.num_waves) & ~7) - 2, max_addressible_sgpr)}; + ctx.max_registers = { int16_t(((256 / ctx.num_waves) & ~3) - 2), std::min(((total_sgpr_regs / ctx.num_waves) & ~program->sgpr_alloc_granule) - 2, max_addressible_sgpr)}; for (Block& block : program->blocks) schedule_block(ctx, program, &block, live_vars); diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp index 92a23bb355c..56167e36d6d 100644 --- a/src/amd/compiler/aco_spill.cpp +++ b/src/amd/compiler/aco_spill.cpp @@ -1568,8 +1568,6 @@ void spill(Program* program, live& live_vars, const struct radv_nir_compiler_opt return; /* else, we check if we can improve things a bit */ - uint16_t total_sgpr_regs = options->chip_class >= GFX8 ? 800 : 512; - uint16_t max_addressible_sgpr = program->sgpr_limit; /* calculate target register demand */ RegisterDemand max_reg_demand; @@ -1577,14 +1575,14 @@ void spill(Program* program, live& live_vars, const struct radv_nir_compiler_opt max_reg_demand.update(block.register_demand); } - RegisterDemand target_pressure = {256, int16_t(max_addressible_sgpr)}; + RegisterDemand target_pressure = {256, int16_t(program->sgpr_limit)}; unsigned num_waves = 1; - int spills_to_vgpr = (max_reg_demand.sgpr - max_addressible_sgpr + 63) / 64; + int spills_to_vgpr = (max_reg_demand.sgpr - program->sgpr_limit + 63) / 64; /* test if it possible to increase occupancy with little spilling */ for (unsigned num_waves_next = 2; num_waves_next <= 8; num_waves_next++) { RegisterDemand target_pressure_next = {int16_t((256 / num_waves_next) & ~3), - int16_t(std::min(((total_sgpr_regs / num_waves_next) & ~7) - 2, max_addressible_sgpr))}; + int16_t(get_addr_sgpr_from_waves(program, num_waves_next))}; /* Currently no vgpr spilling supported. 
* Spill as many sgprs as necessary to not hinder occupancy */ -- 2.30.2