From 14a5021aff661a26d76f330fec55d400d35443a8 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Timur=20Krist=C3=B3f?=
Date: Wed, 27 May 2020 01:22:28 +0200
Subject: [PATCH] aco/gfx10: Refactor of GFX10 wave64 bpermute.
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

The emulated GFX10 wave64 bpermute no longer needs a linear_vgpr, so we
don't consider it a reduction anymore. Additionally, the code is
slightly reorganized in preparation for the GFX6 emulated bpermute.

Signed-off-by: Timur Kristóf
Reviewed-by: Daniel Schürmann
Part-of:
---
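Note (placed below the "---" marker, so it is not part of the commit
message): for readers unfamiliar with the operation being emulated,
here is a minimal scalar reference model of full-wave bpermute. It is
illustrative only; the function name and array types are assumptions,
not part of the change.

   #include <array>
   #include <cstdint>

   /* bpermute ("backwards permute"): every lane reads the data of the
    * lane selected by its own index. GFX10's ds_bpermute_b32 can only
    * read within the reader's 32-lane half-wave, so wave64 needs the
    * shared-VGPR swap implemented by this patch. */
   std::array<uint32_t, 64> wave64_bpermute_ref(const std::array<uint32_t, 64> &data,
                                                const std::array<uint32_t, 64> &index)
   {
      std::array<uint32_t, 64> result{};
      for (unsigned lane = 0; lane < 64; lane++)
         result[lane] = data[index[lane] & 63u];
      return result;
   }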
 src/amd/compiler/aco_builder_h.py             |   2 +-
 .../compiler/aco_instruction_selection.cpp    |  51 +++----
 src/amd/compiler/aco_ir.h                     |   1 -
 src/amd/compiler/aco_lower_to_hw_instr.cpp    | 141 +++++++++++-------
 src/amd/compiler/aco_opcodes.py               |   4 +-
 src/amd/compiler/aco_print_ir.cpp             |   1 -
 src/amd/compiler/aco_reduce_assign.cpp        |   5 -
 7 files changed, 113 insertions(+), 92 deletions(-)

diff --git a/src/amd/compiler/aco_builder_h.py b/src/amd/compiler/aco_builder_h.py
index 097743658b3..2b56e04e501 100644
--- a/src/amd/compiler/aco_builder_h.py
+++ b/src/amd/compiler/aco_builder_h.py
@@ -487,7 +487,7 @@ formats = [("pseudo", [Format.PSEUDO], 'Pseudo_instruction', list(itertools.prod
            ("exp", [Format.EXP], 'Export_instruction', [(0, 4)]),
            ("branch", [Format.PSEUDO_BRANCH], 'Pseudo_branch_instruction', itertools.product([0], [0, 1])),
            ("barrier", [Format.PSEUDO_BARRIER], 'Pseudo_barrier_instruction', [(0, 0)]),
-           ("reduction", [Format.PSEUDO_REDUCTION], 'Pseudo_reduction_instruction', [(3, 2), (3, 4)]),
+           ("reduction", [Format.PSEUDO_REDUCTION], 'Pseudo_reduction_instruction', [(3, 2)]),
            ("vop1", [Format.VOP1], 'VOP1_instruction', [(1, 1), (2, 2)]),
            ("vop2", [Format.VOP2], 'VOP2_instruction', itertools.product([1, 2], [2, 3])),
            ("vop2_sdwa", [Format.VOP2, Format.SDWA], 'SDWA_instruction', itertools.product([1, 2], [2, 3])),
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index 55fc1a59d4f..e65bf7dc68e 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -169,33 +169,34 @@ static Temp emit_bpermute(isel_context *ctx, Builder &bld, Temp index, Temp data
    if (index.regClass() == s1)
       return bld.readlane(bld.def(s1), data, index);
 
-   Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), index);
-
-   /* Currently not implemented on GFX6-7 */
-   assert(ctx->options->chip_class >= GFX8);
-
-   if (ctx->options->chip_class <= GFX9 || ctx->program->wave_size == 32) {
+   if (ctx->options->chip_class <= GFX7) {
+      /* GFX6-7: there is no bpermute instruction */
+      unreachable("Not implemented yet on GFX6-7"); /* TODO */
+   } else if (ctx->options->chip_class >= GFX10 && ctx->program->wave_size == 64) {
+      /* GFX10 wave64 mode: emulate full-wave bpermute */
+      if (!ctx->has_gfx10_wave64_bpermute) {
+         ctx->has_gfx10_wave64_bpermute = true;
+         ctx->program->config->num_shared_vgprs = 8; /* Shared VGPRs are allocated in groups of 8 */
+         ctx->program->vgpr_limit -= 4; /* We allocate 8 shared VGPRs, so we'll have 4 fewer normal VGPRs */
+      }
+
+      Temp index_is_lo = bld.vopc(aco_opcode::v_cmp_ge_u32, bld.def(bld.lm), Operand(31u), index);
+      Builder::Result index_is_lo_split = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), index_is_lo);
+      Temp index_is_lo_n1 = bld.sop1(aco_opcode::s_not_b32, bld.def(s1), bld.def(s1, scc), index_is_lo_split.def(1).getTemp());
+      Operand same_half = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), index_is_lo_split.def(0).getTemp(), index_is_lo_n1);
+      Operand index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), index);
+      Operand input_data(data);
+
+      index_x4.setLateKill(true);
+      input_data.setLateKill(true);
+      same_half.setLateKill(true);
+
+      return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(s2), bld.def(s1, scc), index_x4, input_data, same_half);
+   } else {
+      /* GFX8-9 or GFX10 wave32: bpermute works normally */
+      Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), index);
       return bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), index_x4, data);
    }
-
-   /* GFX10, wave64 mode:
-    * The bpermute instruction is limited to half-wave operation, which means that it can't
-    * properly support subgroup shuffle like older generations (or wave32 mode), so we
-    * emulate it here.
-    */
-   if (!ctx->has_gfx10_wave64_bpermute) {
-      ctx->has_gfx10_wave64_bpermute = true;
-      ctx->program->config->num_shared_vgprs = 8; /* Shared VGPRs are allocated in groups of 8 */
-      ctx->program->vgpr_limit -= 4; /* We allocate 8 shared VGPRs, so we'll have 4 fewer normal VGPRs */
-   }
-
-   Temp lane_id = emit_mbcnt(ctx, bld.def(v1));
-   Temp lane_is_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x20u), lane_id);
-   Temp index_is_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x20u), index);
-   Temp cmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, vcc), lane_is_hi, index_is_hi);
-
-   return bld.reduction(aco_opcode::p_wave64_bpermute, bld.def(v1), bld.def(s2), bld.def(s1, scc),
-                        bld.vcc(cmp), Operand(v2.as_linear()), index_x4, data, gfx10_wave64_bpermute);
 }
 
 Temp as_vgpr(isel_context *ctx, Temp val)
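Not part of the patch, but the mask arithmetic of the v_cmp_ge_u32 /
p_split_vector / s_not_b32 / p_create_vector sequence above may be
easier to follow as plain integer code. The function name and the flat
64-bit view of the lane mask are assumptions for illustration:

   #include <cstdint>

   /* same_half gets bit i set when lane i's index stays within lane
    * i's own 32-lane half, i.e. when that lane needs no cross-half
    * swap: lanes 0-31 keep their own half's data iff their index_is_lo
    * bit is set (index <= 31), lanes 32-63 iff it is clear -- hence
    * the s_not_b32 on the high half before reassembly. */
   uint64_t same_half_ref(uint64_t index_is_lo)
   {
      uint32_t lo = (uint32_t)index_is_lo;             /* p_split_vector, def 0 */
      uint32_t hi_n1 = ~(uint32_t)(index_is_lo >> 32); /* s_not_b32 of def 1 */
      return ((uint64_t)hi_n1 << 32) | lo;             /* p_create_vector */
   }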
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index 1529f78cef7..3921cad89ab 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -1138,7 +1138,6 @@ enum ReduceOp : uint16_t {
    iand8, iand16, iand32, iand64,
    ior8, ior16, ior32, ior64,
    ixor8, ixor16, ixor32, ixor64,
-   gfx10_wave64_bpermute
 };
 
 /**
diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp
index 1788f90b4c6..c023cfe7c3e 100644
--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp
+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp
@@ -784,6 +784,75 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
    }
 }
 
+void emit_gfx10_wave64_bpermute(Program *program, aco_ptr<Instruction> &instr, Builder &bld)
+{
+   /* Emulates proper bpermute on GFX10 in wave64 mode.
+    *
+    * This is necessary because on GFX10 the bpermute instruction only works
+    * on half waves (you can think of it as having a cluster size of 32), so we
+    * manually swap the data between the two halves using two shared VGPRs.
+    */
+
+   assert(program->chip_class >= GFX10);
+   assert(program->info->wave_size == 64);
+
+   unsigned shared_vgpr_reg_0 = align(program->config->num_vgprs, 4) + 256;
+   Definition dst = instr->definitions[0];
+   Definition tmp_exec = instr->definitions[1];
+   Definition clobber_scc = instr->definitions[2];
+   Operand index_x4 = instr->operands[0];
+   Operand input_data = instr->operands[1];
+   Operand same_half = instr->operands[2];
+
+   assert(dst.regClass() == v1);
+   assert(tmp_exec.regClass() == bld.lm);
+   assert(clobber_scc.isFixed() && clobber_scc.physReg() == scc);
+   assert(same_half.regClass() == bld.lm);
+   assert(index_x4.regClass() == v1);
+   assert(input_data.regClass().type() == RegType::vgpr);
+   assert(input_data.bytes() <= 4);
+   assert(dst.physReg() != index_x4.physReg());
+   assert(dst.physReg() != input_data.physReg());
+   assert(tmp_exec.physReg() != same_half.physReg());
+
+   PhysReg shared_vgpr_lo(shared_vgpr_reg_0);
+   PhysReg shared_vgpr_hi(shared_vgpr_reg_0 + 1);
+
+   /* Permute the input within the same half-wave */
+   bld.ds(aco_opcode::ds_bpermute_b32, dst, index_x4, input_data);
+
+   /* HI: Copy data from high lanes 32-63 to shared vgpr */
+   bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(shared_vgpr_hi, v1), input_data, dpp_quad_perm(0, 1, 2, 3), 0xc, 0xf, false);
+   /* Save EXEC */
+   bld.sop1(aco_opcode::s_mov_b64, tmp_exec, Operand(exec, s2));
+   /* Set EXEC to enable LO lanes only */
+   bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), Operand(32u), Operand(0u));
+   /* LO: Copy data from low lanes 0-31 to shared vgpr */
+   bld.vop1(aco_opcode::v_mov_b32, Definition(shared_vgpr_lo, v1), input_data);
+   /* LO: bpermute shared vgpr (high lanes' data) */
+   bld.ds(aco_opcode::ds_bpermute_b32, Definition(shared_vgpr_hi, v1), index_x4, Operand(shared_vgpr_hi, v1));
+   /* Set EXEC to enable HI lanes only */
+   bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), Operand(32u), Operand(32u));
+   /* HI: bpermute shared vgpr (low lanes' data) */
+   bld.ds(aco_opcode::ds_bpermute_b32, Definition(shared_vgpr_lo, v1), index_x4, Operand(shared_vgpr_lo, v1));
+
+   /* Only enable lanes which use the other half's data */
+   bld.sop2(aco_opcode::s_andn2_b64, Definition(exec, s2), clobber_scc, Operand(tmp_exec.physReg(), s2), same_half);
+   /* LO: Copy shared vgpr (high lanes' bpermuted data) to output vgpr */
+   bld.vop1_dpp(aco_opcode::v_mov_b32, dst, Operand(shared_vgpr_hi, v1), dpp_quad_perm(0, 1, 2, 3), 0x3, 0xf, false);
+   /* HI: Copy shared vgpr (low lanes' bpermuted data) to output vgpr */
+   bld.vop1_dpp(aco_opcode::v_mov_b32, dst, Operand(shared_vgpr_lo, v1), dpp_quad_perm(0, 1, 2, 3), 0xc, 0xf, false);
+
+   /* Restore saved EXEC */
+   bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(tmp_exec.physReg(), s2));
+
+   /* RA assumes that the result is always in the low part of the register, so we have to shift if it's not there already */
+   if (input_data.physReg().byte()) {
+      unsigned right_shift = input_data.physReg().byte() * 8;
+      bld.vop2(aco_opcode::v_lshrrev_b32, dst, Operand(right_shift), Operand(dst.physReg(), v1));
+   }
+}
+
 struct copy_operation {
    Operand op;
    Definition def;
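The EXEC juggling above switches between the two half-waves with
s_bfm_b64 (bitfield mask). As a sketch of its semantics (the helper
name is illustrative, and the hardware masks both source operands to
6 bits):

   #include <cstdint>

   /* s_bfm_b64 dst, width, offset  ==>  dst = ((1 << width) - 1) << offset
    * s_bfm_b64(32, 0)  -> 0x00000000ffffffff: execute lanes 0-31 only
    * s_bfm_b64(32, 32) -> 0xffffffff00000000: execute lanes 32-63 only */
   uint64_t s_bfm_b64_ref(unsigned width, unsigned offset)
   {
      return ((UINT64_C(1) << (width & 0x3fu)) - 1) << (offset & 0x3fu);
   }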
p_bpermute."); + } default: break; } @@ -1525,63 +1603,12 @@ void lower_to_hw_instr(Program* program) } else if (instr->format == Format::PSEUDO_REDUCTION) { Pseudo_reduction_instruction* reduce = static_cast(instr.get()); - if (reduce->reduce_op == gfx10_wave64_bpermute) { - /* Only makes sense on GFX10 wave64 */ - assert(program->chip_class >= GFX10); - assert(program->info->wave_size == 64); - assert(instr->definitions[0].regClass() == v1); /* Destination */ - assert(instr->definitions[1].regClass() == s2); /* Temp EXEC */ - assert(instr->definitions[1].physReg() != vcc); - assert(instr->definitions[2].physReg() == scc); /* SCC clobber */ - assert(instr->operands[0].physReg() == vcc); /* Compare */ - assert(instr->operands[1].regClass() == v2.as_linear()); /* Temp VGPR pair */ - assert(instr->operands[2].regClass() == v1); /* Indices x4 */ - assert(instr->operands[3].bytes() <= 4); /* Indices x4 */ - - PhysReg shared_vgpr_reg_lo = PhysReg(align(program->config->num_vgprs, 4) + 256); - PhysReg shared_vgpr_reg_hi = PhysReg(shared_vgpr_reg_lo + 1); - Operand compare = instr->operands[0]; - Operand tmp1(instr->operands[1].physReg(), v1); - Operand tmp2(PhysReg(instr->operands[1].physReg() + 1), v1); - Operand index_x4 = instr->operands[2]; - Operand input_data = instr->operands[3]; - Definition shared_vgpr_lo(shared_vgpr_reg_lo, v1); - Definition shared_vgpr_hi(shared_vgpr_reg_hi, v1); - Definition def_temp1(tmp1.physReg(), v1); - Definition def_temp2(tmp2.physReg(), v1); - - /* Save EXEC and set it for all lanes */ - bld.sop1(aco_opcode::s_or_saveexec_b64, instr->definitions[1], instr->definitions[2], - Definition(exec, s2), Operand((uint64_t)-1), Operand(exec, s2)); - - /* HI: Copy data from high lanes 32-63 to shared vgpr */ - bld.vop1_dpp(aco_opcode::v_mov_b32, shared_vgpr_hi, input_data, dpp_quad_perm(0, 1, 2, 3), 0xc, 0xf, false); - - /* LO: Copy data from low lanes 0-31 to shared vgpr */ - bld.vop1_dpp(aco_opcode::v_mov_b32, shared_vgpr_lo, input_data, dpp_quad_perm(0, 1, 2, 3), 0x3, 0xf, false); - /* LO: Copy shared vgpr (high lanes' data) to output vgpr */ - bld.vop1_dpp(aco_opcode::v_mov_b32, def_temp1, Operand(shared_vgpr_reg_hi, v1), dpp_quad_perm(0, 1, 2, 3), 0x3, 0xf, false); - - /* HI: Copy shared vgpr (low lanes' data) to output vgpr */ - bld.vop1_dpp(aco_opcode::v_mov_b32, def_temp1, Operand(shared_vgpr_reg_lo, v1), dpp_quad_perm(0, 1, 2, 3), 0xc, 0xf, false); - - /* Permute the original input */ - bld.ds(aco_opcode::ds_bpermute_b32, def_temp2, index_x4, input_data); - /* Permute the swapped input */ - bld.ds(aco_opcode::ds_bpermute_b32, def_temp1, index_x4, tmp1); - - /* Restore saved EXEC */ - bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(instr->definitions[1].physReg(), s2)); - /* Choose whether to use the original or swapped */ - bld.vop2(aco_opcode::v_cndmask_b32, instr->definitions[0], tmp1, tmp2, compare); - } else { - emit_reduction(&ctx, reduce->opcode, reduce->reduce_op, reduce->cluster_size, - reduce->operands[1].physReg(), // tmp - reduce->definitions[1].physReg(), // stmp - reduce->operands[2].physReg(), // vtmp - reduce->definitions[2].physReg(), // sitmp - reduce->operands[0], reduce->definitions[0]); - } + emit_reduction(&ctx, reduce->opcode, reduce->reduce_op, reduce->cluster_size, + reduce->operands[1].physReg(), // tmp + reduce->definitions[1].physReg(), // stmp + reduce->operands[2].physReg(), // vtmp + reduce->definitions[2].physReg(), // sitmp + reduce->operands[0], reduce->definitions[0]); } else { 
diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py
index 30656976fee..564a8309ccf 100644
--- a/src/amd/compiler/aco_opcodes.py
+++ b/src/amd/compiler/aco_opcodes.py
@@ -221,8 +221,6 @@ opcode("p_reduce", format=Format.PSEUDO_REDUCTION)
 opcode("p_inclusive_scan", format=Format.PSEUDO_REDUCTION)
 # e.g. subgroupExclusiveMin()
 opcode("p_exclusive_scan", format=Format.PSEUDO_REDUCTION)
-# simulates proper bpermute behavior on GFX10 wave64
-opcode("p_wave64_bpermute", format=Format.PSEUDO_REDUCTION)
 
 opcode("p_branch", format=Format.PSEUDO_BRANCH)
 opcode("p_cbranch", format=Format.PSEUDO_BRANCH)
@@ -253,6 +251,8 @@ opcode("p_exit_early_if")
 
 opcode("p_fs_buffer_store_smem", format=Format.SMEM)
 
+# simulates proper bpermute behavior when it's unsupported, e.g. GFX10 wave64
+opcode("p_bpermute")
 
 # SOP2 instructions: 2 scalar inputs, 1 scalar output (+optional scc)
 SOP2 = {
diff --git a/src/amd/compiler/aco_print_ir.cpp b/src/amd/compiler/aco_print_ir.cpp
index 2b18daef154..545dc9f553c 100644
--- a/src/amd/compiler/aco_print_ir.cpp
+++ b/src/amd/compiler/aco_print_ir.cpp
@@ -55,7 +55,6 @@ static const char *reduce_ops[] = {
    [ixor16] = "ixor16",
    [ixor32] = "ixor32",
    [ixor64] = "ixor64",
-   [gfx10_wave64_bpermute] = "gfx10_wave64_bpermute",
 };
 
 static void print_reg_class(const RegClass rc, FILE *output)
diff --git a/src/amd/compiler/aco_reduce_assign.cpp b/src/amd/compiler/aco_reduce_assign.cpp
index 708f401d5fe..7bf7a6c3b68 100644
--- a/src/amd/compiler/aco_reduce_assign.cpp
+++ b/src/amd/compiler/aco_reduce_assign.cpp
@@ -114,11 +114,6 @@ void setup_reduce_temp(Program* program)
       }
    }
 
-   if (op == gfx10_wave64_bpermute) {
-      instr->operands[1] = Operand(reduceTmp);
-      continue;
-   }
-
    /* same as before, except for the vector temporary instead of the reduce temporary */
    unsigned cluster_size = static_cast<Pseudo_reduction_instruction*>(instr)->cluster_size;
    bool need_vtmp = op == imul32 || op == fadd64 || op == fmul64 ||
-- 
2.30.2