From b21d2d9a9f1f9042def069f51ae46bd64848c853 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Daniel=20Sch=C3=BCrmann?= Date: Wed, 27 May 2020 18:31:33 +0100 Subject: [PATCH] aco: add and use scratch SGPR to lower subdword p_create_vector on GFX6/7 This is needed to lower some corner cases correctly, in case the same operand occurs multiple times: e.g. v0 = p_create_vector(v0[0:8], v0[0:8], v0[0:8], v0[0:8]) Reviewed-by: Rhys Perry Part-of: --- src/amd/compiler/aco_lower_to_hw_instr.cpp | 21 ++++++++++++++------ src/amd/compiler/aco_register_allocation.cpp | 12 +++++++++-- 2 files changed, 25 insertions(+), 8 deletions(-) diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp index f0d6ceecc46..b0b8701720b 100644 --- a/src/amd/compiler/aco_lower_to_hw_instr.cpp +++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -1024,7 +1024,7 @@ uint32_t get_intersection_mask(int a_start, int a_size, return u_bit_consecutive(intersection_start, intersection_end - intersection_start) & mask; } -bool do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool *preserve_scc) +bool do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool *preserve_scc, PhysReg scratch_sgpr) { bool did_copy = false; for (unsigned offset = 0; offset < copy.bytes;) { @@ -1059,9 +1059,18 @@ bool do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool assert(op.physReg().byte() == 0); def = Definition(def.physReg().advance(-def.physReg().byte()), v1); bld.vop2(aco_opcode::v_and_b32, def, Operand((1 << bits) - 1u), Operand(def.physReg(), op.regClass())); - bld.vop2(aco_opcode::v_lshlrev_b32, Definition(op.physReg(), def.regClass()), Operand(bits), op); - bld.vop2(aco_opcode::v_or_b32, def, Operand(def.physReg(), op.regClass()), op); - bld.vop2(aco_opcode::v_lshrrev_b32, Definition(op.physReg(), def.regClass()), Operand(bits), op); + if (def.physReg().reg() == op.physReg().reg()) { + if (bits < 24) { + bld.vop2(aco_opcode::v_mul_u32_u24, def, Operand((1 << bits) + 1u), op); + } else { + bld.sop1(aco_opcode::s_mov_b32, Definition(scratch_sgpr, s1), Operand((1 << bits) + 1u)); + bld.vop3(aco_opcode::v_mul_lo_u32, def, Operand(scratch_sgpr, s1), op); + } + } else { + bld.vop2(aco_opcode::v_lshlrev_b32, Definition(op.physReg(), def.regClass()), Operand(bits), op); + bld.vop2(aco_opcode::v_or_b32, def, Operand(def.physReg(), op.regClass()), op); + bld.vop2(aco_opcode::v_lshrrev_b32, Definition(op.physReg(), def.regClass()), Operand(bits), op); + } } else { bld.vop1(aco_opcode::v_mov_b32, def, op); } @@ -1172,7 +1181,7 @@ void do_swap(lower_context *ctx, Builder& bld, const copy_operation& copy, bool copy_operation tmp_copy = copy; tmp_copy.op.setFixed(copy.def.physReg()); tmp_copy.def.setFixed(copy.op.physReg()); - do_copy(ctx, bld, tmp_copy, &preserve_scc); + do_copy(ctx, bld, tmp_copy, &preserve_scc, pi->scratch_sgpr); } void handle_operands(std::map& copy_map, lower_context* ctx, chip_class chip_class, Pseudo_instruction *pi) @@ -1337,7 +1346,7 @@ void handle_operands(std::map& copy_map, lower_context* } } - bool did_copy = do_copy(ctx, bld, it->second, &preserve_scc); + bool did_copy = do_copy(ctx, bld, it->second, &preserve_scc, pi->scratch_sgpr); std::pair copy = *it; diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp index f00001285a0..5b843070e4f 100644 --- a/src/amd/compiler/aco_register_allocation.cpp +++ b/src/amd/compiler/aco_register_allocation.cpp @@ -1196,13 +1196,18 @@ void handle_pseudo(ra_ctx& ctx, } /* if all operands are constant, no need to care either */ bool reads_sgpr = false; + bool reads_subdword = false; for (Operand& op : instr->operands) { if (op.isTemp() && op.getTemp().type() == RegType::sgpr) { reads_sgpr = true; break; } + if (op.isTemp() && op.regClass().is_subdword()) + reads_subdword = true; } - if (!(writes_sgpr && reads_sgpr)) + bool needs_scratch_reg = (writes_sgpr && reads_sgpr) || + (ctx.program->chip_class <= GFX7 && reads_subdword); + if (!needs_scratch_reg) return; Pseudo_instruction *pi = (Pseudo_instruction *)instr; @@ -1216,7 +1221,10 @@ void handle_pseudo(ra_ctx& ctx, reg = ctx.max_used_sgpr + 1; for (; reg < ctx.program->max_reg_demand.sgpr && reg_file[reg]; reg++) ; - assert(reg < ctx.program->max_reg_demand.sgpr); + if (reg == ctx.program->max_reg_demand.sgpr) { + assert(reads_subdword && reg_file[m0] == 0); + reg = m0; + } } adjust_max_used_regs(ctx, s1, reg); -- 2.30.2