From 6a586a60067ccc7337a3bb047e21ecc2384cc56a Mon Sep 17 00:00:00 2001
From: Daniel Schürmann
Date: Thu, 7 Nov 2019 18:02:33 +0100
Subject: [PATCH] aco: split read/writelane opcode into VOP2/VOP3 version for
 SI/CI

Reviewed-by: Rhys Perry
---
 src/amd/compiler/aco_builder_h.py             | 13 +++++++
 src/amd/compiler/aco_insert_NOPs.cpp          |  8 +++--
 src/amd/compiler/aco_insert_exec_mask.cpp     |  4 ++-
 .../compiler/aco_instruction_selection.cpp    |  8 ++---
 src/amd/compiler/aco_lower_to_hw_instr.cpp    | 34 +++++++++----------
 src/amd/compiler/aco_opcodes.py               |  6 ++--
 src/amd/compiler/aco_optimizer.cpp            | 16 ++++++---
 src/amd/compiler/aco_register_allocation.cpp  |  3 +-
 src/amd/compiler/aco_validate.cpp             | 15 ++++++--
 9 files changed, 72 insertions(+), 35 deletions(-)

diff --git a/src/amd/compiler/aco_builder_h.py b/src/amd/compiler/aco_builder_h.py
index ada0806f6a9..d215c7b198f 100644
--- a/src/amd/compiler/aco_builder_h.py
+++ b/src/amd/compiler/aco_builder_h.py
@@ -418,6 +418,19 @@ public:
       return insert(std::move(sub));
    }
 
+   Result readlane(Definition dst, Op vsrc, Op lane)
+   {
+      if (program->chip_class >= GFX8)
+         return vop3(aco_opcode::v_readlane_b32_e64, dst, vsrc, lane);
+      else
+         return vop2(aco_opcode::v_readlane_b32, dst, vsrc, lane);
+   }
+   Result writelane(Definition dst, Op val, Op lane, Op vsrc) {
+      if (program->chip_class >= GFX8)
+         return vop3(aco_opcode::v_writelane_b32_e64, dst, val, lane, vsrc);
+      else
+         return vop2(aco_opcode::v_writelane_b32, dst, val, lane, vsrc);
+   }
 <%
 import itertools
 formats = [("pseudo", [Format.PSEUDO], 'Pseudo_instruction', list(itertools.product(range(5), range(5))) + [(8, 1), (1, 8)]),
diff --git a/src/amd/compiler/aco_insert_NOPs.cpp b/src/amd/compiler/aco_insert_NOPs.cpp
index 29bc8375ffc..1ead0c04da5 100644
--- a/src/amd/compiler/aco_insert_NOPs.cpp
+++ b/src/amd/compiler/aco_insert_NOPs.cpp
@@ -110,7 +110,9 @@ bool VALU_writes_sgpr(aco_ptr<Instruction>& instr)
       return true;
    if (instr->isVOP3() && instr->definitions.size() == 2)
       return true;
-   if (instr->opcode == aco_opcode::v_readfirstlane_b32 || instr->opcode == aco_opcode::v_readlane_b32)
+   if (instr->opcode == aco_opcode::v_readfirstlane_b32 ||
+       instr->opcode == aco_opcode::v_readlane_b32 ||
+       instr->opcode == aco_opcode::v_readlane_b32_e64)
       return true;
    return false;
 }
@@ -285,7 +287,9 @@ int handle_instruction_gfx8_9(NOP_ctx_gfx8_9& ctx, aco_ptr<Instruction>& instr,
 
    switch (instr->opcode) {
       case aco_opcode::v_readlane_b32:
-      case aco_opcode::v_writelane_b32: {
+      case aco_opcode::v_readlane_b32_e64:
+      case aco_opcode::v_writelane_b32:
+      case aco_opcode::v_writelane_b32_e64: {
          if (ctx.VALU_wrsgpr + 4 < new_idx)
             break;
          PhysReg reg = instr->operands[1].physReg();
diff --git a/src/amd/compiler/aco_insert_exec_mask.cpp b/src/amd/compiler/aco_insert_exec_mask.cpp
index cbc0698096b..607b4f52793 100644
--- a/src/amd/compiler/aco_insert_exec_mask.cpp
+++ b/src/amd/compiler/aco_insert_exec_mask.cpp
@@ -118,7 +118,9 @@ bool pred_by_exec_mask(aco_ptr<Instruction>& instr) {
    }
 
    if (instr->opcode == aco_opcode::v_readlane_b32 ||
-       instr->opcode == aco_opcode::v_writelane_b32)
+       instr->opcode == aco_opcode::v_readlane_b32_e64 ||
+       instr->opcode == aco_opcode::v_writelane_b32 ||
+       instr->opcode == aco_opcode::v_writelane_b32_e64)
       return false;
 
    return true;
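
Note: the core of the change is the chip-class dispatch added to the Builder above. The standalone sketch below uses simplified stand-in enums (not ACO's actual Builder, opcode table or chip_class type) to illustrate the rule the helpers encode: SI/CI (GFX6/GFX7) only have the VOP2 encoding of v_readlane_b32/v_writelane_b32, while GFX8 and later only expose the VOP3 "_e64" encoding.

// Standalone illustration of the encoding choice made by the new Builder helpers.
// The enum values and helper names are illustrative stand-ins, not ACO's API.
#include <cstdio>

enum chip_class { GFX6, GFX7, GFX8, GFX9, GFX10 };
enum class opcode { v_readlane_b32, v_readlane_b32_e64,
                    v_writelane_b32, v_writelane_b32_e64 };

// SI/CI (GFX6/GFX7) only provide the VOP2 encoding of readlane/writelane;
// GFX8+ dropped it in favour of the VOP3 ("_e64") encoding.
opcode pick_readlane(chip_class cc)
{
   return cc >= GFX8 ? opcode::v_readlane_b32_e64 : opcode::v_readlane_b32;
}
opcode pick_writelane(chip_class cc)
{
   return cc >= GFX8 ? opcode::v_writelane_b32_e64 : opcode::v_writelane_b32;
}

int main()
{
   std::printf("GFX7 readlane uses VOP2: %d\n", pick_readlane(GFX7) == opcode::v_readlane_b32);
   std::printf("GFX9 readlane uses VOP3: %d\n", pick_readlane(GFX9) == opcode::v_readlane_b32_e64);
}
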
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index f05c1df9d03..1222a11f2b4 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -163,7 +163,7 @@ Temp emit_wqm(isel_context *ctx, Temp src, Temp dst=Temp(0, s1), bool program_ne
 
 static Temp emit_bpermute(isel_context *ctx, Builder &bld, Temp index, Temp data)
 {
    if (index.regClass() == s1)
-      return bld.vop3(aco_opcode::v_readlane_b32, bld.def(s1), data, index);
+      return bld.readlane(bld.def(s1), data, index);
 
    Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), index);
@@ -6098,14 +6098,14 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
       if (dst.regClass() == v1) {
          /* src2 is ignored for writelane. RA assigns the same reg for dst */
-         emit_wqm(ctx, bld.vop3(aco_opcode::v_writelane_b32, bld.def(v1), val, lane, src), dst);
+         emit_wqm(ctx, bld.writelane(bld.def(v1), val, lane, src), dst);
       } else if (dst.regClass() == v2) {
          Temp src_lo = bld.tmp(v1), src_hi = bld.tmp(v1);
          Temp val_lo = bld.tmp(s1), val_hi = bld.tmp(s1);
          bld.pseudo(aco_opcode::p_split_vector, Definition(src_lo), Definition(src_hi), src);
          bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
-         Temp lo = emit_wqm(ctx, bld.vop3(aco_opcode::v_writelane_b32, bld.def(v1), val_lo, lane, src_hi));
-         Temp hi = emit_wqm(ctx, bld.vop3(aco_opcode::v_writelane_b32, bld.def(v1), val_hi, lane, src_hi));
+         Temp lo = emit_wqm(ctx, bld.writelane(bld.def(v1), val_lo, lane, src_hi));
+         Temp hi = emit_wqm(ctx, bld.writelane(bld.def(v1), val_hi, lane, src_hi));
          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
          emit_split_vector(ctx, dst, 2);
       } else {
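
Note: because one opcode becomes two, every pass that special-cases lane instructions (the NOP-insertion, exec-mask and validation changes in this patch) now has to match both encodings. A minimal sketch of that pattern, using a stand-in opcode enum rather than ACO's generated opcode list:

// Sketch: predicate that treats the VOP2 and VOP3 forms identically, the way
// aco_insert_exec_mask.cpp and aco_insert_NOPs.cpp must after this split.
// Lane instructions read/write SGPRs and are not predicated by EXEC.
#include <cstdio>

enum class opcode { v_readlane_b32, v_readlane_b32_e64,
                    v_writelane_b32, v_writelane_b32_e64, v_add_f32 };

bool is_lane_instruction(opcode op)
{
   switch (op) {
   case opcode::v_readlane_b32:
   case opcode::v_readlane_b32_e64:
   case opcode::v_writelane_b32:
   case opcode::v_writelane_b32_e64:
      return true;
   default:
      return false;
   }
}

int main()
{
   std::printf("%d %d\n", is_lane_instruction(opcode::v_writelane_b32_e64),
                          is_lane_instruction(opcode::v_add_f32));
}
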
diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp
index 19e0f598074..38e6030687b 100644
--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp
+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp
@@ -481,7 +481,7 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
 
          if (cluster_size == 64) {
             for (unsigned i = 0; i < src.size(); i++)
-               bld.vop3(aco_opcode::v_readlane_b32, Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
+               bld.readlane(Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
             emit_op(ctx, tmp, sitmp, tmp, vtmp, reduce_op, src.size());
          }
       } else if (cluster_size == 32) {
@@ -519,8 +519,8 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
       if (ctx->program->wave_size == 64) {
          /* fill in the gap in row 2 */
          for (unsigned i = 0; i < src.size(); i++) {
-            bld.vop3(aco_opcode::v_readlane_b32, Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
-            bld.vop3(aco_opcode::v_writelane_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{sitmp+i}, s1), Operand(32u));
+            bld.readlane(Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
+            bld.writelane(Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{sitmp+i}, s1), Operand(32u), Operand(PhysReg{vtmp+i}, v1));
          }
       }
       std::swap(tmp, vtmp);
@@ -531,8 +531,7 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
       if (!identity[i].isConstant() || identity[i].constantValue()) { /* bound_ctrl should take case of this overwise */
          if (ctx->program->chip_class < GFX10)
             assert((identity[i].isConstant() && !identity[i].isLiteral()) || identity[i].physReg() == PhysReg{sitmp+i});
-         bld.vop3(aco_opcode::v_writelane_b32, Definition(PhysReg{tmp+i}, v1),
-                  identity[i], Operand(0u));
+         bld.writelane(Definition(PhysReg{tmp+i}, v1), identity[i], Operand(0u), Operand(PhysReg{tmp+i}, v1));
       }
    }
    /* fall through */
@@ -562,7 +561,7 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
          bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0u));
          bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(0xffffffffu));
          for (unsigned i = 0; i < src.size(); i++)
-            bld.vop3(aco_opcode::v_readlane_b32, Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
+            bld.readlane(Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
          emit_op(ctx, tmp, sitmp, tmp, vtmp, reduce_op, src.size());
       }
    } else {
@@ -581,8 +580,8 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
 
    if (op == aco_opcode::p_reduce && dst.regClass().type() == RegType::sgpr) {
       for (unsigned k = 0; k < src.size(); k++) {
-         bld.vop3(aco_opcode::v_readlane_b32, Definition(PhysReg{dst.physReg() + k}, s1),
-                  Operand(PhysReg{tmp + k}, v1), Operand(ctx->program->wave_size - 1));
+         bld.readlane(Definition(PhysReg{dst.physReg() + k}, s1),
+                      Operand(PhysReg{tmp + k}, v1), Operand(ctx->program->wave_size - 1));
       }
    } else if (!(dst.physReg() == tmp) && !dst_written) {
       for (unsigned k = 0; k < src.size(); k++) {
@@ -911,21 +910,20 @@ void lower_to_hw_instr(Program* program)
 
          case aco_opcode::p_spill: {
             assert(instr->operands[0].regClass() == v1.as_linear());
-            for (unsigned i = 0; i < instr->operands[2].size(); i++) {
-               bld.vop3(aco_opcode::v_writelane_b32, bld.def(v1, instr->operands[0].physReg()),
-                        Operand(PhysReg{instr->operands[2].physReg() + i}, s1),
-                        Operand(instr->operands[1].constantValue() + i));
-            }
+            for (unsigned i = 0; i < instr->operands[2].size(); i++)
+               bld.writelane(bld.def(v1, instr->operands[0].physReg()),
+                             Operand(PhysReg{instr->operands[2].physReg() + i}, s1),
+                             Operand(instr->operands[1].constantValue() + i),
+                             instr->operands[0]);
            break;
          }
 
         case aco_opcode::p_reload: {
            assert(instr->operands[0].regClass() == v1.as_linear());
-           for (unsigned i = 0; i < instr->definitions[0].size(); i++) {
-              bld.vop3(aco_opcode::v_readlane_b32,
-                       bld.def(s1, PhysReg{instr->definitions[0].physReg() + i}),
-                       instr->operands[0], Operand(instr->operands[1].constantValue() + i));
-           }
+           for (unsigned i = 0; i < instr->definitions[0].size(); i++)
+              bld.readlane(bld.def(s1, PhysReg{instr->definitions[0].physReg() + i}),
+                           instr->operands[0],
+                           Operand(instr->operands[1].constantValue() + i));
            break;
         }
         case aco_opcode::p_as_uniform:
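
Note: the p_spill/p_reload lowering above parks SGPR values in individual lanes of a VGPR. The toy model below (plain C++, one dword per lane, not ACO code) shows that round trip and why the old VGPR value has to stay available: v_writelane_b32 replaces a single lane and leaves the rest intact, which is why the builder's writelane helper is now passed the destination VGPR as an extra source operand.

// CPU-side model of the p_spill/p_reload lowering: an SGPR value is parked in
// one lane of a VGPR with v_writelane_b32 and fetched back with v_readlane_b32.
// The Vgpr struct is a toy model, not an ACO type.
#include <array>
#include <cinttypes>
#include <cstdint>
#include <cstdio>

struct Vgpr {
   std::array<uint32_t, 64> lane{};   // wave64: one dword per lane
};

// v_writelane_b32 overwrites a single lane and leaves all other lanes intact,
// so the register allocator must keep dst and the source VGPR in the same register.
void writelane(Vgpr &dst, uint32_t sgpr_val, unsigned lane) { dst.lane[lane] = sgpr_val; }
uint32_t readlane(const Vgpr &src, unsigned lane) { return src.lane[lane]; }

int main()
{
   Vgpr spill_slot;
   uint32_t sgpr[2] = {0xdeadbeefu, 0x12345678u};

   // p_spill: write each dword of the SGPR pair to consecutive lanes
   for (unsigned i = 0; i < 2; i++)
      writelane(spill_slot, sgpr[i], i);

   // p_reload: read the dwords back into SGPRs
   uint32_t reloaded[2];
   for (unsigned i = 0; i < 2; i++)
      reloaded[i] = readlane(spill_slot, i);

   std::printf("%08" PRIx32 " %08" PRIx32 "\n", reloaded[0], reloaded[1]);
}
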
diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py
index f9697420ae0..65e739b0644 100644
--- a/src/amd/compiler/aco_opcodes.py
+++ b/src/amd/compiler/aco_opcodes.py
@@ -592,6 +592,8 @@ for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in SMEM:
 VOP2 = {
   # GFX6, GFX7, GFX8, GFX9, GFX10, name, input/output modifiers
    (0x00, 0x00, 0x00, 0x00, 0x01, "v_cndmask_b32", False),
+   (0x01, 0x01,   -1,   -1,   -1, "v_readlane_b32", False),
+   (0x02, 0x02,   -1,   -1,   -1, "v_writelane_b32", False),
    (0x03, 0x03, 0x01, 0x01, 0x03, "v_add_f32", True),
    (0x04, 0x04, 0x02, 0x02, 0x04, "v_sub_f32", True),
    (0x05, 0x05, 0x03, 0x03, 0x05, "v_subrev_f32", True),
@@ -984,8 +986,8 @@ VOP3 = {
    (   -1,    -1, 0x276, 0x276,    -1, "v_interp_p2_legacy_f16", True, True),
    (   -1,    -1,    -1, 0x277, 0x35a, "v_interp_p2_f16", True, True),
    (0x12b, 0x12b, 0x288, 0x288, 0x362, "v_ldexp_f32", False, True),
-   (0x101, 0x101, 0x289, 0x289, 0x360, "v_readlane_b32", False, False),
-   (0x102, 0x102, 0x28a, 0x28a, 0x361, "v_writelane_b32", False, False),
+   (   -1,    -1, 0x289, 0x289, 0x360, "v_readlane_b32_e64", False, False),
+   (   -1,    -1, 0x28a, 0x28a, 0x361, "v_writelane_b32_e64", False, False),
    (0x122, 0x122, 0x28b, 0x28b, 0x364, "v_bcnt_u32_b32", False, False),
    (0x123, 0x123, 0x28c, 0x28c, 0x365, "v_mbcnt_lo_u32_b32", False, False),
    (0x124, 0x124, 0x28d, 0x28d, 0x366, "v_mbcnt_hi_u32_b32", False, False),
diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp
index e92531030c8..cea466e3819 100644
--- a/src/amd/compiler/aco_optimizer.cpp
+++ b/src/amd/compiler/aco_optimizer.cpp
@@ -421,14 +421,19 @@ bool can_use_VOP3(aco_ptr<Instruction>& instr)
    return instr->opcode != aco_opcode::v_madmk_f32 &&
           instr->opcode != aco_opcode::v_madak_f32 &&
           instr->opcode != aco_opcode::v_madmk_f16 &&
-          instr->opcode != aco_opcode::v_madak_f16;
+          instr->opcode != aco_opcode::v_madak_f16 &&
+          instr->opcode != aco_opcode::v_readlane_b32 &&
+          instr->opcode != aco_opcode::v_writelane_b32 &&
+          instr->opcode != aco_opcode::v_readfirstlane_b32;
 }
 
 bool can_apply_sgprs(aco_ptr<Instruction>& instr)
 {
    return instr->opcode != aco_opcode::v_readfirstlane_b32 &&
           instr->opcode != aco_opcode::v_readlane_b32 &&
-          instr->opcode != aco_opcode::v_writelane_b32;
+          instr->opcode != aco_opcode::v_readlane_b32_e64 &&
+          instr->opcode != aco_opcode::v_writelane_b32 &&
+          instr->opcode != aco_opcode::v_writelane_b32_e64;
 }
 
 void to_VOP3(opt_ctx& ctx, aco_ptr<Instruction>& instr)
@@ -458,6 +463,7 @@ bool can_accept_constant(aco_ptr<Instruction>& instr, unsigned operand)
    case aco_opcode::v_interp_p2_f32:
    case aco_opcode::v_mac_f32:
    case aco_opcode::v_writelane_b32:
+   case aco_opcode::v_writelane_b32_e64:
    case aco_opcode::v_cndmask_b32:
       return operand != 2;
    case aco_opcode::s_addk_i32:
@@ -466,6 +472,7 @@ bool can_accept_constant(aco_ptr<Instruction>& instr, unsigned operand)
    case aco_opcode::p_extract_vector:
    case aco_opcode::p_split_vector:
    case aco_opcode::v_readlane_b32:
+   case aco_opcode::v_readlane_b32_e64:
    case aco_opcode::v_readfirstlane_b32:
       return operand != 0;
    default:
@@ -494,7 +501,8 @@ bool valu_can_accept_literal(opt_ctx& ctx, aco_ptr<Instruction>& instr, unsigned
 
 bool valu_can_accept_vgpr(aco_ptr<Instruction>& instr, unsigned operand)
 {
-   if (instr->opcode == aco_opcode::v_readlane_b32 || instr->opcode == aco_opcode::v_writelane_b32)
+   if (instr->opcode == aco_opcode::v_readlane_b32 || instr->opcode == aco_opcode::v_readlane_b32_e64 ||
+       instr->opcode == aco_opcode::v_writelane_b32 || instr->opcode == aco_opcode::v_writelane_b32_e64)
       return operand != 1;
    return true;
 }
@@ -633,7 +641,7 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
       }
       if (info.is_constant() && can_accept_constant(instr, i)) {
          perfwarn(instr->opcode == aco_opcode::v_cndmask_b32 && i == 2, "v_cndmask_b32 with a constant selector", instr.get());
-         if (i == 0) {
+         if (i == 0 || instr->opcode == aco_opcode::v_readlane_b32 || instr->opcode == aco_opcode::v_writelane_b32) {
             instr->operands[i] = Operand(info.val);
             continue;
          } else if (!instr->isVOP3() && can_swap_operands(instr)) {
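
Note: the register-allocation hunk below fixes the definition of v_writelane_b32_e64 to the register of operand 2, the same tied-operand rule already applied to v_mac_f32. A small standalone illustration of that constraint, with simplified structs rather than ACO's Instruction/Definition classes:

// Sketch of the tied-operand rule enforced in aco_register_allocation.cpp:
// v_writelane_b32(_e64) only modifies one lane, so its VGPR definition must
// land in the same register as the VGPR source (operand 2 in ACO's form).
#include <cassert>
#include <vector>

struct PhysReg { unsigned reg; };

struct Instr {
   const char *opcode;
   std::vector<PhysReg> operands;     // val, lane, tied VGPR
   std::vector<PhysReg> definitions;
};

void fix_tied_definition(Instr &instr)
{
   // mirrors: instr->definitions[0].setFixed(instr->operands[2].physReg());
   instr.definitions[0] = instr.operands[2];
}

int main()
{
   Instr wl{"v_writelane_b32_e64", {{/*val*/101}, {/*lane*/5}, {/*vgpr in*/260}}, {{0}}};
   fix_tied_definition(wl);
   assert(wl.definitions[0].reg == 260);   // the definition reuses the source VGPR
}
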
diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp
index 504ad015746..834f06d2b94 100644
--- a/src/amd/compiler/aco_register_allocation.cpp
+++ b/src/amd/compiler/aco_register_allocation.cpp
@@ -1509,7 +1509,8 @@ void register_allocation(Program *program, std::vector<std::set<Temp>> live_out_
          /* handle definitions which must have the same register as an operand */
          if (instr->opcode == aco_opcode::v_interp_p2_f32 ||
              instr->opcode == aco_opcode::v_mac_f32 ||
-             instr->opcode == aco_opcode::v_writelane_b32) {
+             instr->opcode == aco_opcode::v_writelane_b32 ||
+             instr->opcode == aco_opcode::v_writelane_b32_e64) {
             instr->definitions[0].setFixed(instr->operands[2].physReg());
          }
          else if (instr->opcode == aco_opcode::s_addk_i32 || instr->opcode == aco_opcode::s_mulk_i32) {
diff --git a/src/amd/compiler/aco_validate.cpp b/src/amd/compiler/aco_validate.cpp
index 8282d7e27e3..655fb0c3bce 100644
--- a/src/amd/compiler/aco_validate.cpp
+++ b/src/amd/compiler/aco_validate.cpp
@@ -132,12 +132,21 @@ void validate(Program* program, FILE * output)
             check(instr->definitions[0].getTemp().type() == RegType::vgpr ||
                   (int) instr->format & (int) Format::VOPC ||
                   instr->opcode == aco_opcode::v_readfirstlane_b32 ||
-                  instr->opcode == aco_opcode::v_readlane_b32,
+                  instr->opcode == aco_opcode::v_readlane_b32 ||
+                  instr->opcode == aco_opcode::v_readlane_b32_e64,
                   "Wrong Definition type for VALU instruction", instr.get());
             unsigned num_sgpr = 0;
             unsigned sgpr_idx = instr->operands.size();
-            for (unsigned i = 0; i < instr->operands.size(); i++)
-            {
+            for (unsigned i = 0; i < instr->operands.size(); i++) {
+               if (instr->opcode == aco_opcode::v_readfirstlane_b32 ||
+                   instr->opcode == aco_opcode::v_readlane_b32 ||
+                   instr->opcode == aco_opcode::v_readlane_b32_e64 ||
+                   instr->opcode == aco_opcode::v_writelane_b32 ||
+                   instr->opcode == aco_opcode::v_writelane_b32_e64) {
+                  check(!instr->operands[i].isLiteral(), "No literal allowed on VALU instruction", instr.get());
+                  check(i == 1 || (instr->operands[i].isTemp() && instr->operands[i].regClass() == v1),
+                        "Wrong Operand type for VALU instruction", instr.get());
+                  continue;
+               }
                if (instr->operands[i].isTemp() && instr->operands[i].regClass().type() == RegType::sgpr) {
                   check(i != 1 || (int) instr->format & (int) Format::VOP3A, "Wrong source position for SGPR argument", instr.get());
-- 
2.30.2
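
Note: the validator change above rejects literals on all lane instructions, so the optimizer may only fold constants that the hardware can encode inline. A simplified, standalone check of that range (integer inline constants only; the special float encodings are omitted for brevity):

// Sketch: readlane/writelane never accept a 32-bit literal, so constant folding
// is limited to GCN's inline constants. Integer inline constants cover -16..64.
#include <cstdint>
#include <cstdio>

bool is_inline_integer_constant(int64_t v)
{
   // values in -16..64 are encoded directly in the source field, no literal dword
   return v >= -16 && v <= 64;
}

int main()
{
   std::printf("lane index 31 foldable: %d\n", is_inline_integer_constant(31));
   std::printf("0x12345678 needs a literal: %d\n", !is_inline_integer_constant(0x12345678));
}
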