From 3865448012b16d0e98e706e1b462242a754436c7 Mon Sep 17 00:00:00 2001
From: Rhys Perry
Date: Thu, 12 Sep 2019 19:28:52 +0100
Subject: [PATCH] aco: Fix reductions on GFX10.
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Fixes p_reduce (all cluster sizes), p_inclusive_scan and p_exclusive_scan
with all reduction operations.

Signed-off-by: Rhys Perry
Reviewed-by: Daniel Schürmann
---
 src/amd/compiler/aco_ir.h                  |  2 +-
 src/amd/compiler/aco_lower_to_hw_instr.cpp | 92 +++++++++++++++++++---
 src/amd/compiler/aco_reduce_assign.cpp     | 19 +++--
 3 files changed, 95 insertions(+), 18 deletions(-)

diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index 739ef869e6a..90fc3c6fe36 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -841,7 +841,7 @@ enum ReduceOp {
  * Operand(2): vector temporary
  * Definition(0): result
  * Definition(1): scalar temporary
- * Definition(2): scalar identity temporary
+ * Definition(2): scalar identity temporary (not used to store identity on GFX10)
  * Definition(3): scc clobber
  * Definition(4): vcc clobber
  *
diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp
index 39585111954..2cd451e48c5 100644
--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp
+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp
@@ -85,6 +85,22 @@ void emit_dpp_op(lower_context *ctx, PhysReg dst, PhysReg src0, PhysReg src1, Ph
    }
 }
 
+void emit_op(lower_context *ctx, PhysReg dst, PhysReg src0, PhysReg src1,
+             aco_opcode op, Format format, bool clobber_vcc, unsigned size)
+{
+   aco_ptr<Instruction> instr;
+   if (format == Format::VOP3)
+      instr.reset(create_instruction<VOP3A_instruction>(op, format, 2, clobber_vcc ? 2 : 1));
+   else
+      instr.reset(create_instruction<VOP2_instruction>(op, format, 2, clobber_vcc ? 2 : 1));
+   instr->operands[0] = Operand(src0, src0.reg >= 256 ? v1 : s1);
+   instr->operands[1] = Operand(src1, v1);
+   instr->definitions[0] = Definition(dst, v1);
+   if (clobber_vcc)
+      instr->definitions[1] = Definition(vcc, s2);
+   ctx->instructions.emplace_back(std::move(instr));
+}
+
 uint32_t get_reduction_identity(ReduceOp op, unsigned idx)
 {
    switch (op) {
@@ -236,12 +252,12 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
    Operand vcndmask_identity[2] = {identity[0], identity[1]};
 
    /* First, copy the source to tmp and set inactive lanes to the identity */
-   // note: this clobbers SCC!
    bld.sop1(aco_opcode::s_or_saveexec_b64, Definition(stmp, s2), Definition(scc, s1),
             Definition(exec, s2), Operand(UINT64_MAX), Operand(exec, s2));
 
    for (unsigned i = 0; i < src.size(); i++) {
-      /* p_exclusive_scan needs it to be a sgpr or inline constant for the v_writelane_b32 */
-      if (identity[i].isLiteral() && op == aco_opcode::p_exclusive_scan) {
+      /* p_exclusive_scan needs it to be a sgpr or inline constant for the v_writelane_b32
+       * except on GFX10, where v_writelane_b32 can take a literal. */
+      if (identity[i].isLiteral() && op == aco_opcode::p_exclusive_scan && ctx->program->chip_class < GFX10) {
          bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg{sitmp+i}, s1), identity[i]);
          identity[i] = Operand(PhysReg{sitmp+i}, s1);
@@ -283,6 +299,16 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
          exec_restored = true;
          emit_vopn(ctx, dst.physReg(), vtmp, tmp, src.regClass(), reduce_opcode, format, should_clobber_vcc);
          dst_written = true;
+      } else if (ctx->program->chip_class >= GFX10) {
+         assert(cluster_size == 64);
+         /* GFX10+ doesn't support row_bcast15 and row_bcast31 */
+         for (unsigned i = 0; i < src.size(); i++)
+            bld.vop3(aco_opcode::v_permlanex16_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{tmp+i}, v1), Operand(0u), Operand(0u));
+         emit_op(ctx, tmp, tmp, vtmp, reduce_opcode, format, should_clobber_vcc, src.size());
+
+         for (unsigned i = 0; i < src.size(); i++)
+            bld.vop3(aco_opcode::v_readlane_b32, Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
+         emit_op(ctx, tmp, sitmp, tmp, reduce_opcode, format, should_clobber_vcc, src.size());
       } else {
          assert(cluster_size == 64);
          emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
@@ -292,11 +318,38 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
       }
       break;
    case aco_opcode::p_exclusive_scan:
-      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, aco_opcode::v_mov_b32, Format::VOP1, false,
-                  dpp_wf_sr1, 0xf, 0xf, true, src.size());
+      if (ctx->program->chip_class >= GFX10) { /* gfx10 doesn't support wf_sr1, so emulate it */
+         /* shift rows right */
+         for (unsigned i = 0; i < src.size(); i++) {
+            bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{tmp+i}, v1), dpp_row_sr(1), 0xf, 0xf, true);
+         }
+
+         /* fill in the gaps in rows 1 and 3 */
+         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0x10000u));
+         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(0x10000u));
+         for (unsigned i = 0; i < src.size(); i++) {
+            Instruction *perm = bld.vop3(aco_opcode::v_permlanex16_b32,
+                                         Definition(PhysReg{vtmp+i}, v1),
+                                         Operand(PhysReg{tmp+i}, v1),
+                                         Operand(0xffffffffu), Operand(0xffffffffu)).instr;
+            static_cast<VOP3A_instruction*>(perm)->opsel[0] = true; /* FI (Fetch Inactive) */
+         }
+         bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(UINT64_MAX));
+
+         /* fill in the gap in row 2 */
+         for (unsigned i = 0; i < src.size(); i++) {
+            bld.vop3(aco_opcode::v_readlane_b32, Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
+            bld.vop3(aco_opcode::v_writelane_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{sitmp+i}, s1), Operand(32u));
+         }
+         std::swap(tmp, vtmp);
+      } else {
+         emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, aco_opcode::v_mov_b32, Format::VOP1, false,
+                     dpp_wf_sr1, 0xf, 0xf, true, src.size());
+      }
       for (unsigned i = 0; i < src.size(); i++) {
          if (!identity[i].isConstant() || identity[i].constantValue()) { /* bound_ctrl should take care of this otherwise */
-            assert((identity[i].isConstant() && !identity[i].isLiteral()) || identity[i].physReg() == PhysReg{sitmp+i});
+            if (ctx->program->chip_class < GFX10)
+               assert((identity[i].isConstant() && !identity[i].isLiteral()) || identity[i].physReg() == PhysReg{sitmp+i});
             bld.vop3(aco_opcode::v_writelane_b32, Definition(PhysReg{tmp+i}, v1),
                      identity[i], Operand(0u));
          }
@@ -312,10 +365,29 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
                   dpp_row_sr(4), 0xf, 0xf, false, src.size(), identity);
       emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
                   dpp_row_sr(8), 0xf, 0xf, false, src.size(), identity);
-      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
-                  dpp_row_bcast15, 0xa, 0xf, false, src.size(), identity);
-      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
-                  dpp_row_bcast31, 0xc, 0xf, false, src.size(), identity);
+      if (ctx->program->chip_class >= GFX10) {
+         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0xffff0000u));
+         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(0xffff0000u));
+         for (unsigned i = 0; i < src.size(); i++) {
+            Instruction *perm = bld.vop3(aco_opcode::v_permlanex16_b32,
+                                         Definition(PhysReg{vtmp+i}, v1),
+                                         Operand(PhysReg{tmp+i}, v1),
+                                         Operand(0xffffffffu), Operand(0xffffffffu)).instr;
+            static_cast<VOP3A_instruction*>(perm)->opsel[0] = true; /* FI (Fetch Inactive) */
+         }
+         emit_op(ctx, tmp, tmp, vtmp, reduce_opcode, format, should_clobber_vcc, src.size());
+
+         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0u));
+         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(0xffffffffu));
+         for (unsigned i = 0; i < src.size(); i++)
+            bld.vop3(aco_opcode::v_readlane_b32, Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
+         emit_op(ctx, tmp, sitmp, tmp, reduce_opcode, format, should_clobber_vcc, src.size());
+      } else {
+         emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
+                     dpp_row_bcast15, 0xa, 0xf, false, src.size(), identity);
+         emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, wrtmp, reduce_opcode, format, should_clobber_vcc,
+                     dpp_row_bcast31, 0xc, 0xf, false, src.size(), identity);
+      }
       break;
    default:
       unreachable("Invalid reduction mode");
diff --git a/src/amd/compiler/aco_reduce_assign.cpp b/src/amd/compiler/aco_reduce_assign.cpp
index 663a43c539a..66a3ec64c04 100644
--- a/src/amd/compiler/aco_reduce_assign.cpp
+++ b/src/amd/compiler/aco_reduce_assign.cpp
@@ -115,10 +115,13 @@ void setup_reduce_temp(Program* program)
          }
 
          /* same as before, except for the vector temporary instead of the reduce temporary */
+         unsigned cluster_size = static_cast<Pseudo_reduction_instruction*>(instr)->cluster_size;
          bool need_vtmp = op == imul32 || op == fadd64 || op == fmul64 || op == fmin64 || op == fmax64;
+         if (program->chip_class >= GFX10 && cluster_size == 64)
+            need_vtmp = true;
 
-         need_vtmp |= static_cast<Pseudo_reduction_instruction*>(instr)->cluster_size == 32;
+         need_vtmp |= cluster_size == 32;
 
          vtmp_in_loop |= need_vtmp && block.loop_nest_depth > 0;
         if (need_vtmp && (int)last_top_level_block_idx != vtmp_inserted_at) {
            vtmp = {program->allocateId(), vtmp.regClass()};
@@ -144,12 +147,14 @@ void setup_reduce_temp(Program* program)
         instr->definitions[1] = bld.def(s2);
 
         /* scalar identity temporary */
-         if (instr->opcode == aco_opcode::p_exclusive_scan &&
-             (op == imin32 || op == imin64 ||
-              op == imax32 || op == imax64 ||
-              op == fmin32 || op == fmin64 ||
-              op == fmax32 || op == fmax64 ||
-              op == fmul64)) {
+         bool need_sitmp = program->chip_class >= GFX10 && cluster_size == 64;
+         if (instr->opcode == aco_opcode::p_exclusive_scan) {
+            need_sitmp |=
+               (op == imin32 || op == imin64 || op == imax32 || op == imax64 ||
+                op == fmin32 || op == fmin64 || op == fmax32 || op == fmax64 ||
+                op == fmul64);
+         }
+         if (need_sitmp) {
            instr->definitions[2] = bld.def(RegClass(RegType::sgpr,
                                                     instr->operands[0].size()));
         }
-- 
2.30.2
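
Not part of the commit, but for anyone reviewing the lane arithmetic: below is a
small host-side C++ sketch that sanity-checks the GFX10 p_reduce path above
(v_permlanex16_b32 plus v_readlane_b32 instead of row_bcast15/row_bcast31). The
helper names (Wave, permlanex16, row_butterfly) are invented for the sketch, and
the lane semantics are simplifying assumptions rather than an ISA model; the
permlanex16 lane selects can be ignored here only because, by that point in the
sequence, every lane of a 16-lane half-row already holds the same partial result.

// sim_reduce_gfx10.cpp -- hypothetical standalone sketch, not ACO code.
#include <array>
#include <cassert>
#include <cstdint>
#include <numeric>

using Wave = std::array<uint32_t, 64>;

/* Stand-in for v_permlanex16_b32: swap the two 16-lane halves of each
 * 32-lane row. The real lane-select operands don't matter in this model
 * because all lanes of a half already hold the same partial sum. */
static Wave permlanex16(const Wave &v)
{
   Wave r;
   for (unsigned i = 0; i < 64; i++)
      r[i] = v[(i & ~31u) | ((i + 16) & 31u)];
   return r;
}

/* Stand-in for the quad_perm/row_half_mirror/row_mirror DPP steps: a
 * butterfly swizzle within each 16-lane row, equivalent for checking the
 * result since each step leaves equal values in each combined group. */
static Wave row_butterfly(const Wave &v, unsigned mask)
{
   Wave r;
   for (unsigned i = 0; i < 64; i++)
      r[i] = v[(i & ~15u) | ((i ^ mask) & 15u)];
   return r;
}

int main()
{
   Wave tmp;
   for (unsigned i = 0; i < 64; i++)
      tmp[i] = i + 1;
   const uint32_t expected = std::accumulate(tmp.begin(), tmp.end(), 0u);

   /* cluster sizes 2..16: reduce within each 16-lane row */
   const unsigned masks[] = {1, 2, 4, 8};
   for (unsigned mask : masks) {
      Wave vtmp = row_butterfly(tmp, mask);
      for (unsigned i = 0; i < 64; i++)
         tmp[i] += vtmp[i];
   }

   /* GFX10 cluster-64 tail: combine the half-rows of each 32-lane row... */
   Wave vtmp = permlanex16(tmp);
   for (unsigned i = 0; i < 64; i++)
      tmp[i] += vtmp[i];            /* emit_op(ctx, tmp, tmp, vtmp, ...) */

   /* ...then fold the low 32 lanes into the rest via lane 31. */
   const uint32_t sitmp = tmp[31];  /* v_readlane_b32 ..., lane 31 */
   for (unsigned i = 0; i < 64; i++)
      tmp[i] = sitmp + tmp[i];      /* emit_op(ctx, tmp, sitmp, tmp, ...) */

   assert(tmp[63] == expected);     /* result is read back from the last lane */
   return 0;
}

Lanes 0-31 end up over-combined after the last step, but only the last lane is
read back for p_reduce, so that is harmless; the scans take the other path.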
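Similarly, a sketch for the wf_sr1 emulation in the p_exclusive_scan path:
row_shr:1 only shifts within 16-lane rows, so the patch fixes up lanes 16 and 48
with v_permlanex16_b32 (exec limited to exactly those lanes, fetching lane 15 of
the opposite half-row) and lane 32 with a readlane/writelane pair. Again a
simplified host-side model under assumed lane semantics, not ACO code:

// sim_wf_sr1_gfx10.cpp -- hypothetical standalone sketch, not ACO code.
#include <array>
#include <cassert>
#include <cstdint>

using Wave = std::array<uint32_t, 64>;

int main()
{
   Wave tmp, vtmp{};
   for (unsigned i = 0; i < 64; i++)
      tmp[i] = 100 + i;

   /* v_mov_b32 dpp row_shr:1 bound_ctrl:1 -- shift right within each
    * 16-lane row; the first lane of each row has no source and reads 0. */
   for (unsigned i = 0; i < 64; i++)
      vtmp[i] = (i % 16) ? tmp[i - 1] : 0;

   /* v_permlanex16_b32 with exec_lo = exec_hi = 0x10000 (lanes 16 and 48
    * only): fetch lane 15 of the opposite half of the same 32-lane row. */
   vtmp[16] = tmp[15];
   vtmp[48] = tmp[47];

   /* v_readlane_b32 from lane 31, v_writelane_b32 into lane 32. */
   const uint32_t sitmp = tmp[31];
   vtmp[32] = sitmp;

   /* After std::swap(tmp, vtmp), every lane i > 0 holds lane i-1's value;
    * lane 0 is filled with the identity by the v_writelane_b32 that
    * follows in emit_reduction. */
   for (unsigned i = 1; i < 64; i++)
      assert(vtmp[i] == tmp[i - 1]);
   return 0;
}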