From 045c9ffa7d7f496ba347aa7acbfc0edea37a0fc1 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Timur=20Krist=C3=B3f?= Date: Wed, 27 May 2020 01:28:03 +0200 Subject: [PATCH] aco: Implement subgroup shuffle on GFX6-7. MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit GFX6 and GFX7 don't have the ds_bpermute (or permute) instruction, but we would like to support subgroup shuffle on these old GPUs. So we introduce a new pseudo instruction which will be lowered to an "unrolled loop" that emulates bpermute on GFX6 and GFX7 using readlane instructions, while also respecting the exec mask thanks to v_cmpx. Signed-off-by: Timur Kristóf Reviewed-by: Daniel Schürmann Part-of: --- .../compiler/aco_instruction_selection.cpp | 7 +++- src/amd/compiler/aco_lower_to_hw_instr.cpp | 41 ++++++++++++++++++- 2 files changed, 46 insertions(+), 2 deletions(-) diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index e65bf7dc68e..6b85d2a9ab7 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -171,7 +171,12 @@ static Temp emit_bpermute(isel_context *ctx, Builder &bld, Temp index, Temp data if (ctx->options->chip_class <= GFX7) { /* GFX6-7: there is no bpermute instruction */ - unreachable("Not implemented yet on GFX6-7"); /* TODO */ + Operand index_op(index); + Operand input_data(data); + index_op.setLateKill(true); + input_data.setLateKill(true); + + return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(bld.lm), bld.def(bld.lm, vcc), index_op, input_data); } else if (ctx->options->chip_class >= GFX10 && ctx->program->wave_size == 64) { /* GFX10 wave64 mode: emulate full-wave bpermute */ if (!ctx->has_gfx10_wave64_bpermute) { diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp index c023cfe7c3e..4383b198890 100644 --- a/src/amd/compiler/aco_lower_to_hw_instr.cpp +++ 
b/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -853,6 +853,45 @@ void emit_gfx10_wave64_bpermute(Program *program, aco_ptr<Instruction> &instr, B } } +void emit_gfx6_bpermute(Program *program, aco_ptr<Instruction> &instr, Builder &bld) +{ + /* Emulates bpermute on GFX6-7 using readlane instructions. Operands: [0] per-lane source lane index, [1] input data. Definitions: [0] destination VGPR, [1] lane-mask register used to save EXEC, [2] VCC, which is clobbered. */ + + Operand index = instr->operands[0]; + Operand input = instr->operands[1]; + Definition dst = instr->definitions[0]; + Definition temp_exec = instr->definitions[1]; + Definition clobber_vcc = instr->definitions[2]; + + assert(dst.regClass() == v1); + assert(temp_exec.regClass() == bld.lm); + assert(clobber_vcc.regClass() == bld.lm); + assert(clobber_vcc.physReg() == vcc); + assert(index.regClass() == v1); + assert(index.physReg() != dst.physReg()); + assert(input.regClass().type() == RegType::vgpr); + assert(input.bytes() <= 4); + assert(input.physReg() != dst.physReg()); + + /* Save original EXEC into temp_exec; every loop iteration below overwrites EXEC and restores it from there */ + bld.sop1(aco_opcode::s_mov_b64, temp_exec, Operand(exec, s2)); + + /* An "unrolled loop" with one iteration emitted for each lane of the wave. + * This takes only a few instructions per lane, as opposed to a "real" loop + * with branching, where the branch instruction alone would take 16+ cycles.
+ */ + for (unsigned n = 0; n < program->wave_size; ++n) { + /* Activate the lane which has N for its source index (v_cmpx writes the compare result to EXEC) */ + bld.vopc(aco_opcode::v_cmpx_eq_u32, Definition(exec, bld.lm), clobber_vcc, Operand(n), index); + /* Read the data from lane N into VCC, which serves as a scalar temporary here */ + bld.readlane(Definition(vcc, s1), input, Operand(n)); + /* On the active lane, move the data we read from lane N to the destination VGPR */ + bld.vop1(aco_opcode::v_mov_b32, dst, Operand(vcc, s1)); + /* Restore original EXEC, which the v_cmpx above overwrote, before the next iteration */ + bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(temp_exec.physReg(), s2)); + } +} + struct copy_operation { Operand op; Definition def; @@ -1550,7 +1589,7 @@ void lower_to_hw_instr(Program* program) case aco_opcode::p_bpermute: { if (ctx.program->chip_class <= GFX7) - unreachable("Not implemented yet on GFX6-7"); /* TODO */ + emit_gfx6_bpermute(program, instr, bld); else if (ctx.program->chip_class == GFX10 && ctx.program->wave_size == 64) emit_gfx10_wave64_bpermute(program, instr, bld); else -- 2.30.2