From c52ebbcea4f63e2da68de56c3839f6a72e816f46 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Timur=20Krist=C3=B3f?= Date: Thu, 24 Oct 2019 17:34:37 +0200 Subject: [PATCH] aco: Introduce vgpr_limit to keep track of available VGPRs. MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Signed-off-by: Timur Kristóf Reviewed-by: Daniel Schürmann --- src/amd/compiler/aco_instruction_selection.cpp | 5 +++++ src/amd/compiler/aco_instruction_selection_setup.cpp | 2 ++ src/amd/compiler/aco_ir.h | 2 ++ src/amd/compiler/aco_live_var_analysis.cpp | 2 +- src/amd/compiler/aco_lower_to_hw_instr.cpp | 3 --- src/amd/compiler/aco_register_allocation.cpp | 3 ++- 6 files changed, 12 insertions(+), 5 deletions(-) diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 768860a2c9b..7ae6fb2d9a1 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -162,6 +162,11 @@ static Temp emit_bpermute(isel_context *ctx, Builder &bld, Temp index, Temp data * properly support subgroup shuffle like older generations (or wave32 mode), so we * emulate it here. */ + if (!ctx->has_gfx10_wave64_bpermute) { + ctx->has_gfx10_wave64_bpermute = true; + ctx->program->config->num_shared_vgprs = 8; /* Shared VGPRs are allocated in groups of 8 */ + ctx->program->vgpr_limit -= 4; /* We allocate 8 shared VGPRs, so we'll have 4 fewer normal VGPRs */ + } Temp lane_id = bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), Operand((uint32_t) -1), Operand(0u)); lane_id = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), Operand((uint32_t) -1), lane_id); diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp index 0104fd36f49..b65628c8521 100644 --- a/src/amd/compiler/aco_instruction_selection_setup.cpp +++ b/src/amd/compiler/aco_instruction_selection_setup.cpp @@ -79,6 +79,7 @@ struct isel_context { std::unique_ptr allocated; std::unordered_map> allocated_vec; Stage stage; /* Stage */ + bool has_gfx10_wave64_bpermute = false; struct { bool has_branch; uint16_t loop_nest_depth = 0; @@ -1255,6 +1256,7 @@ setup_isel_context(Program* program, program->lds_alloc_granule = options->chip_class >= GFX7 ? 512 : 256; program->lds_limit = options->chip_class >= GFX7 ? 65536 : 32768; + program->vgpr_limit = 256; if (options->chip_class >= GFX10) { program->physical_sgprs = 2560; /* doesn't matter as long as it's at least 128 * 20 */ diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index 58d67ef293b..29aefef26cf 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -1080,6 +1080,8 @@ public: uint16_t lds_alloc_granule; uint32_t lds_limit; /* in bytes */ + uint16_t vgpr_limit; + uint16_t physical_sgprs; uint16_t sgpr_alloc_granule; /* minus one. must be power of two */ uint16_t sgpr_limit; diff --git a/src/amd/compiler/aco_live_var_analysis.cpp b/src/amd/compiler/aco_live_var_analysis.cpp index 4d689db7070..a4a2e5c49bb 100644 --- a/src/amd/compiler/aco_live_var_analysis.cpp +++ b/src/amd/compiler/aco_live_var_analysis.cpp @@ -244,7 +244,7 @@ void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand) const int16_t vgpr_alloc = std::max(4, (new_demand.vgpr + 3) & ~3); /* this won't compile, register pressure reduction necessary */ - if (new_demand.vgpr > 256 || new_demand.sgpr > program->sgpr_limit) { + if (new_demand.vgpr > program->vgpr_limit || new_demand.sgpr > program->sgpr_limit) { program->num_waves = 0; program->max_reg_demand = new_demand; } else { diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp index 2fe865e2a90..3d01e59fef7 100644 --- a/src/amd/compiler/aco_lower_to_hw_instr.cpp +++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -823,9 +823,6 @@ void lower_to_hw_instr(Program* program) assert(instr->operands[2].regClass() == v1); /* Indices x4 */ assert(instr->operands[3].regClass() == v1); /* Input data */ - /* Shared VGPRs are allocated in groups of 8 */ - program->config->num_shared_vgprs = 8; - PhysReg shared_vgpr_reg_lo = PhysReg(align(program->config->num_vgprs, 4) + 256); PhysReg shared_vgpr_reg_hi = PhysReg(shared_vgpr_reg_lo + 1); Operand compare = instr->operands[0]; diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp index 965fe15964a..621bc1f7636 100644 --- a/src/amd/compiler/aco_register_allocation.cpp +++ b/src/amd/compiler/aco_register_allocation.cpp @@ -668,7 +668,8 @@ PhysReg get_reg(ra_ctx& ctx, /* try using more registers */ uint16_t max_addressible_sgpr = ctx.program->sgpr_limit; - if (rc.type() == RegType::vgpr && ctx.program->max_reg_demand.vgpr < 256) { + uint16_t max_addressible_vgpr = ctx.program->vgpr_limit; + if (rc.type() == RegType::vgpr && ctx.program->max_reg_demand.vgpr < max_addressible_vgpr) { update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr + 1, ctx.program->max_reg_demand.sgpr)); return get_reg(ctx, reg_file, rc, parallelcopies, instr); } else if (rc.type() == RegType::sgpr && ctx.program->max_reg_demand.sgpr < max_addressible_sgpr) { -- 2.30.2