From e0bcefc3a0a15a8c7ec00cfa53fd8fffcc07342a Mon Sep 17 00:00:00 2001
From: =?utf8?q?Timur=20Krist=C3=B3f?=
Date: Wed, 27 Nov 2019 11:04:47 +0100
Subject: [PATCH] aco/wave32: Use lane mask regclass for exec/vcc.
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Currently all usages of exec and vcc are hardcoded to use s2 regclass.
This commit makes it possible to use s1 in wave32 mode and s2 in wave64
mode.

Signed-off-by: Timur Kristóf
Reviewed-by: Daniel Schürmann
---
 src/amd/compiler/aco_builder_h.py             |  21 +-
 src/amd/compiler/aco_insert_exec_mask.cpp     |  72 ++---
 .../compiler/aco_instruction_selection.cpp    | 249 ++++++++++--------
 .../aco_instruction_selection_setup.cpp       |  26 +-
 src/amd/compiler/aco_ir.h                     |   1 +
 src/amd/compiler/aco_live_var_analysis.cpp    |  12 +-
 src/amd/compiler/aco_lower_bool_phis.cpp      |  30 ++-
 src/amd/compiler/aco_lower_to_hw_instr.cpp    |  42 +--
 src/amd/compiler/aco_reduce_assign.cpp        |   2 +-
 src/amd/compiler/aco_register_allocation.cpp  |   1 +
 src/amd/compiler/aco_ssa_elimination.cpp      |   1 +
 src/amd/compiler/aco_validate.cpp             |   2 +-
 12 files changed, 250 insertions(+), 209 deletions(-)

diff --git a/src/amd/compiler/aco_builder_h.py b/src/amd/compiler/aco_builder_h.py
index e70d9317b3f..ada0806f6a9 100644
--- a/src/amd/compiler/aco_builder_h.py
+++ b/src/amd/compiler/aco_builder_h.py
@@ -135,13 +135,14 @@ public:
    Program *program;
    bool use_iterator;
    bool start; // only when use_iterator == false
+   RegClass lm;
 
    std::vector<aco_ptr<Instruction>> *instructions;
    std::vector<aco_ptr<Instruction>>::iterator it;
 
-   Builder(Program *pgm) : program(pgm), use_iterator(false), start(false), instructions(NULL) {}
-   Builder(Program *pgm, Block *block) : program(pgm), use_iterator(false), start(false), instructions(&block->instructions) {}
-   Builder(Program *pgm, std::vector<aco_ptr<Instruction>> *instrs) : program(pgm), use_iterator(false), start(false), instructions(instrs) {}
+   Builder(Program *pgm) : program(pgm), use_iterator(false), start(false), lm(pgm->lane_mask), instructions(NULL) {}
+   Builder(Program *pgm, Block *block) : program(pgm), use_iterator(false), start(false), lm(pgm ? pgm->lane_mask : s2), instructions(&block->instructions) {}
+   Builder(Program *pgm, std::vector<aco_ptr<Instruction>> *instrs) : program(pgm), use_iterator(false), start(false), lm(pgm ? pgm->lane_mask : s2), instructions(instrs) {}
 
    void moveEnd(Block *block) {
      instructions = &block->instructions;
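
The new lm member mirrors Program::lane_mask, which a later hunk
(aco_instruction_selection_setup.cpp) sets from the wave size. As a minimal
sketch of the idea, not code taken from this patch, the choice boils down to
picking the SGPR regclass that holds one bit per lane:

   /* Sketch only: one mask bit per lane -> 1 SGPR (wave32) or 2 SGPRs (wave64).
    * This mirrors the lane_mask assignment added in setup_isel_context(). */
   RegClass lane_mask_regclass(unsigned wave_size)
   {
      assert(wave_size == 32 || wave_size == 64);
      return wave_size == 32 ? s1 : s2;
   }
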
@@ -265,17 +266,26 @@ public:
 
 % for fixed in ['m0', 'vcc', 'exec', 'scc']:
    Operand ${fixed}(Temp tmp) {
+      % if fixed == 'vcc' or fixed == 'exec':
+      assert(tmp.regClass() == lm);
+      % endif
       Operand op(tmp);
       op.setFixed(aco::${fixed});
       return op;
    }
 
    Definition ${fixed}(Definition def) {
+      % if fixed == 'vcc' or fixed == 'exec':
+      assert(def.regClass() == lm);
+      % endif
       def.setFixed(aco::${fixed});
       return def;
    }
 
    Definition hint_${fixed}(Definition def) {
+      % if fixed == 'vcc' or fixed == 'exec':
+      assert(def.regClass() == lm);
+      % endif
       def.setHint(aco::${fixed});
       return def;
    }
@@ -350,11 +360,11 @@ public:
       assert((post_ra || b.op.hasRegClass()) && b.op.regClass().type() == RegType::vgpr);
 
       if (!carry_in.op.isUndefined())
-         return vop2(aco_opcode::v_addc_co_u32, Definition(dst), hint_vcc(def(s2)), a, b, carry_in);
+         return vop2(aco_opcode::v_addc_co_u32, Definition(dst), hint_vcc(def(lm)), a, b, carry_in);
       else if (program->chip_class >= GFX10 && carry_out)
          return vop3(aco_opcode::v_add_co_u32_e64, Definition(dst), def(s2), a, b);
       else if (program->chip_class < GFX9 || carry_out)
-         return vop2(aco_opcode::v_add_co_u32, Definition(dst), hint_vcc(def(s2)), a, b);
+         return vop2(aco_opcode::v_add_co_u32, Definition(dst), hint_vcc(def(lm)), a, b);
       else
          return vop2(aco_opcode::v_add_u32, Definition(dst), a, b);
    }
@@ -407,6 +417,7 @@ public:
       }
       return insert(std::move(sub));
    }
+
 <%
 import itertools
 formats = [("pseudo", [Format.PSEUDO], 'Pseudo_instruction', list(itertools.product(range(5), range(5))) + [(8, 1), (1, 8)]),
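
The C++ files below stop spelling out the *_b64 opcodes and instead pass
wave-specific Builder aliases (Builder::s_wqm, Builder::s_and, Builder::s_andn2,
Builder::s_and_saveexec, ...) or go through bld.w64or32(). The resolution of
those aliases is generated by aco_builder_h.py and is not part of this patch;
roughly, and with the opcode table abridged, it behaves like this sketch:

   /* Sketch only (abridged): resolve a wave-specific alias to the concrete
    * SALU opcode, based on the lane-mask regclass held in lm. */
   aco_opcode w64or32(WaveSpecificOpcode opcode) const {
      bool wave64 = (lm == s2);
      switch (opcode) {
      case s_mov: return wave64 ? aco_opcode::s_mov_b64 : aco_opcode::s_mov_b32;
      case s_and: return wave64 ? aco_opcode::s_and_b64 : aco_opcode::s_and_b32;
      case s_wqm: return wave64 ? aco_opcode::s_wqm_b64 : aco_opcode::s_wqm_b32;
      case s_and_saveexec:
         return wave64 ? aco_opcode::s_and_saveexec_b64 : aco_opcode::s_and_saveexec_b32;
      default:
         unreachable("Not a wave-specific opcode.");
      }
   }
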
diff --git a/src/amd/compiler/aco_insert_exec_mask.cpp b/src/amd/compiler/aco_insert_exec_mask.cpp
index 31ae5ca658c..cbc0698096b 100644
--- a/src/amd/compiler/aco_insert_exec_mask.cpp
+++ b/src/amd/compiler/aco_insert_exec_mask.cpp
@@ -302,14 +302,15 @@ void transition_to_WQM(exec_ctx& ctx, Builder bld, unsigned idx)
       return;
    if (ctx.info[idx].exec.back().second & mask_type_global) {
       Temp exec_mask = ctx.info[idx].exec.back().first;
-      exec_mask = bld.sop1(aco_opcode::s_wqm_b64, bld.def(s2, exec), bld.def(s1, scc), exec_mask);
+      exec_mask = bld.sop1(Builder::s_wqm, bld.def(bld.lm, exec), bld.def(s1, scc), exec_mask);
       ctx.info[idx].exec.emplace_back(exec_mask, mask_type_global | mask_type_wqm);
       return;
    }
    /* otherwise, the WQM mask should be one below the current mask */
    ctx.info[idx].exec.pop_back();
    assert(ctx.info[idx].exec.back().second & mask_type_wqm);
-   ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(s2, exec),
+   assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
+   ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec),
                                                 ctx.info[idx].exec.back().first);
 }
 
@@ -324,14 +325,15 @@ void transition_to_Exact(exec_ctx& ctx, Builder bld, unsigned idx)
       !(ctx.info[idx].exec.back().second & mask_type_loop)) {
       ctx.info[idx].exec.pop_back();
       assert(ctx.info[idx].exec.back().second & mask_type_exact);
-      ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(s2, exec),
+      assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
+      ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec),
                                                    ctx.info[idx].exec.back().first);
       return;
    }
    /* otherwise, we create an exact mask and push to the stack */
    Temp wqm = ctx.info[idx].exec.back().first;
-   Temp exact = bld.tmp(s2);
-   wqm = bld.sop1(aco_opcode::s_and_saveexec_b64, bld.def(s2), bld.def(s1, scc),
+   Temp exact = bld.tmp(bld.lm);
+   wqm = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc),
                   bld.exec(Definition(exact)), ctx.info[idx].exec[0].first, bld.exec(wqm));
    ctx.info[idx].exec.back().first = wqm;
    ctx.info[idx].exec.emplace_back(exact, mask_type_exact);
@@ -359,7 +361,7 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
       } else {
          uint8_t mask = mask_type_global;
          if (ctx.program->needs_wqm) {
-            exec_mask = bld.sop1(aco_opcode::s_wqm_b64, bld.def(s2, exec), bld.def(s1, scc), bld.exec(exec_mask));
+            exec_mask = bld.sop1(Builder::s_wqm, bld.def(bld.lm, exec), bld.def(s1, scc), bld.exec(exec_mask));
             mask |= mask_type_wqm;
          } else {
             mask |= mask_type_exact;
@@ -383,7 +385,7 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
       aco_ptr<Pseudo_instruction> phi;
       for (int i = 0; i < info.num_exec_masks - 1; i++) {
          phi.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1));
-         phi->definitions[0] = bld.def(s2);
+         phi->definitions[0] = bld.def(bld.lm);
          phi->operands[0] = Operand(ctx.info[preds[0]].exec[i].first);
          ctx.info[idx].exec[i].first = bld.insert(std::move(phi));
       }
@@ -393,7 +395,7 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
       if (info.has_divergent_break) {
          /* this phi might be trivial but ensures a parallelcopy on the loop header */
          aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
-         phi->definitions[0] = bld.def(s2);
+         phi->definitions[0] = bld.def(bld.lm);
         phi->operands[0] = Operand(ctx.info[preds[0]].exec[info.num_exec_masks - 1].first);
         ctx.info[idx].exec.back().first = bld.insert(std::move(phi));
      }
@@ -401,9 +403,9 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
      /* create ssa name for loop active mask */
      aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
      if (info.has_divergent_continue)
-        phi->definitions[0] = bld.def(s2);
+        phi->definitions[0] = bld.def(bld.lm);
      else
-        phi->definitions[0] = bld.def(s2, exec);
+        phi->definitions[0] = bld.def(bld.lm, exec);
      phi->operands[0] = Operand(ctx.info[preds[0]].exec.back().first);
      Temp loop_active = bld.insert(std::move(phi));
 
@@ -423,7 +425,8 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
            i++;
         }
         uint8_t mask_type = ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact);
-        ctx.info[idx].exec.emplace_back(bld.pseudo(aco_opcode::p_parallelcopy, bld.def(s2, exec),
+        assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
+        ctx.info[idx].exec.emplace_back(bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec),
                                                    ctx.info[idx].exec.back().first),
                                         mask_type);
      }
@@ -480,7 +483,7 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
         } else {
            /* create phi for loop footer */
            aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
-           phi->definitions[0] = bld.def(s2);
+           phi->definitions[0] = bld.def(bld.lm);
            for (unsigned i = 0; i < phi->operands.size(); i++)
               phi->operands[i] = Operand(ctx.info[preds[i]].exec[k].first);
            ctx.info[idx].exec.emplace_back(bld.insert(std::move(phi)), type);
@@ -510,7 +513,8 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
            transition_to_Exact(ctx, bld, idx);
      }
 
-     ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(s2, exec),
+     assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
+     ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec),
                                                   ctx.info[idx].exec.back().first);
 
      ctx.loop.pop_back();
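
In transition_to_Exact() above, the single s_and_saveexec both saves the
current (WQM) exec and replaces exec with its intersection with the outermost
exact mask. Conceptually the sequence is equivalent to the sketch below, which
is illustrative only and follows the naming of the surrounding code:

   /* Sketch only: what the s_and_saveexec in transition_to_Exact() computes. */
   Temp saved_wqm = bld.tmp(bld.lm);
   bld.sop1(Builder::s_mov, Definition(saved_wqm), Operand(exec, bld.lm)); /* save old exec */
   bld.sop2(Builder::s_and, bld.def(bld.lm, exec), bld.def(s1, scc),       /* exec = exact & old exec */
            ctx.info[idx].exec[0].first, Operand(saved_wqm));
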
@@ -536,7 +540,7 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
            continue;
         }
 
-        Temp phi = bld.pseudo(aco_opcode::p_linear_phi, in_exec ? bld.def(s2, exec) : bld.def(s2),
+        Temp phi = bld.pseudo(aco_opcode::p_linear_phi, in_exec ? bld.def(bld.lm, exec) : bld.def(bld.lm),
                               ctx.info[preds[0]].exec[i].first, ctx.info[preds[1]].exec[i].first);
 
         uint8_t mask_type = ctx.info[preds[0]].exec[i].second & ctx.info[preds[1]].exec[i].second;
@@ -578,7 +582,8 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
 
    if (block->kind & block_kind_merge) {
       Temp restore = ctx.info[idx].exec.back().first;
-      ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(s2, exec), restore);
+      assert(restore.size() == bld.lm.size());
+      ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec), restore);
    }
 
    return i;
@@ -589,7 +594,7 @@ void lower_fs_buffer_store_smem(Builder& bld, bool need_check, aco_ptr
    Operand offset = instr->operands[1];
    if (need_check) {
       /* if exec is zero, then use UINT32_MAX as an offset and make this store a no-op */
-      Temp nonempty = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), cur_exec, Operand(0u));
+      Temp nonempty = bld.sopc(Builder::s_cmp_lg, bld.def(s1, scc), cur_exec, Operand(0u));
 
       if (offset.isLiteral())
          offset = bld.sop1(aco_opcode::s_mov_b32, bld.def(s1), offset);
@@ -665,7 +670,7 @@ void process_instructions(exec_ctx& ctx, Block* block,
          assert(num);
          Operand cond = instr->operands[0];
          for (int i = num - 1; i >= 0; i--) {
-            Instruction *andn2 = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc),
+            Instruction *andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc),
                                           ctx.info[block->index].exec[i].first, cond);
             if (i == num - 1) {
                andn2->operands[0].setFixed(exec);
@@ -689,8 +694,9 @@ void process_instructions(exec_ctx& ctx, Block* block,
       if (instr->opcode == aco_opcode::p_is_helper ||
           instr->opcode == aco_opcode::p_load_helper) {
          Definition dst = instr->definitions[0];
+         assert(dst.size() == bld.lm.size());
         if (state == Exact) {
-            instr.reset(create_instruction<SOP1_instruction>(aco_opcode::s_mov_b64, Format::SOP1, 1, 1));
+            instr.reset(create_instruction<SOP1_instruction>(bld.w64or32(Builder::s_mov), Format::SOP1, 1, 1));
            instr->operands[0] = Operand(0u);
            instr->definitions[0] = dst;
         } else {
@@ -710,7 +716,7 @@ void process_instructions(exec_ctx& ctx, Block* block,
            assert(instr->opcode == aco_opcode::p_is_helper ||
                   exact_mask.second & mask_type_initial);
            assert(exact_mask.second & mask_type_exact);
-            instr.reset(create_instruction<SOP2_instruction>(aco_opcode::s_andn2_b64, Format::SOP2, 2, 2));
+            instr.reset(create_instruction<SOP2_instruction>(bld.w64or32(Builder::s_andn2), Format::SOP2, 2, 2));
            instr->operands[0] = Operand(ctx.info[block->index].exec.back().first); /* current exec */
            instr->operands[1] = Operand(exact_mask.first);
            instr->definitions[0] = dst;
@@ -726,8 +732,8 @@ void process_instructions(exec_ctx& ctx, Block* block,
         if (instr->operands.empty()) {
            /* transition to exact and set exec to zero */
            Temp old_exec = ctx.info[block->index].exec.back().first;
-            Temp new_exec = bld.tmp(s2);
-            cond = bld.sop1(aco_opcode::s_and_saveexec_b64, bld.def(s2), bld.def(s1, scc),
+            Temp new_exec = bld.tmp(bld.lm);
+            cond = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc),
                            bld.exec(Definition(new_exec)), Operand(0u), bld.exec(old_exec));
            if (ctx.info[block->index].exec.back().second & mask_type_exact) {
               ctx.info[block->index].exec.back().first = new_exec;
@@ -746,7 +752,7 @@ void process_instructions(exec_ctx& ctx, Block* block,
            num +=
ctx.info[block->index].exec.size() - 1; for (int i = num - 1; i >= 0; i--) { if (ctx.info[block->index].exec[i].second & mask_type_exact) { - Instruction *andn2 = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), + Instruction *andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), ctx.info[block->index].exec[i].first, cond); if (i == num - 1) { andn2->operands[0].setFixed(exec); @@ -878,13 +884,13 @@ void add_branch_code(exec_ctx& ctx, Block* block) } Temp old_exec = ctx.info[idx].exec.back().first; - Temp new_exec = bld.tmp(s2); - Temp cond = bld.sop1(aco_opcode::s_and_saveexec_b64, bld.def(s2), bld.def(s1, scc), + Temp new_exec = bld.tmp(bld.lm); + Temp cond = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc), bld.exec(Definition(new_exec)), Operand(0u), bld.exec(old_exec)); ctx.info[idx].exec.back().first = new_exec; for (int i = num - 1; i >= 0; i--) { - Instruction *andn2 = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), + Instruction *andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), ctx.info[block->index].exec[i].first, cond); if (i == (int)ctx.info[idx].exec.size() - 1) andn2->definitions[0].setFixed(exec); @@ -912,7 +918,7 @@ void add_branch_code(exec_ctx& ctx, Block* block) Temp cond = Temp(); for (int exec_idx = ctx.info[idx].exec.size() - 1; exec_idx >= 0; exec_idx--) { if (ctx.info[idx].exec[exec_idx].second & mask_type_loop) { - cond = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), ctx.info[idx].exec[exec_idx].first, Operand(0u)); + cond = bld.sopc(Builder::s_cmp_lg, bld.def(s1, scc), ctx.info[idx].exec[exec_idx].first, Operand(0u)); break; } } @@ -957,8 +963,8 @@ void add_branch_code(exec_ctx& ctx, Block* block) Temp current_exec = ctx.info[idx].exec.back().first; uint8_t mask_type = ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact); - Temp then_mask = bld.tmp(s2); - Temp old_exec = bld.sop1(aco_opcode::s_and_saveexec_b64, bld.def(s2), bld.def(s1, scc), + Temp then_mask = bld.tmp(bld.lm); + Temp old_exec = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc), bld.exec(Definition(then_mask)), cond, bld.exec(current_exec)); ctx.info[idx].exec.back().first = old_exec; @@ -978,7 +984,7 @@ void add_branch_code(exec_ctx& ctx, Block* block) uint8_t mask_type = ctx.info[idx].exec.back().second; ctx.info[idx].exec.pop_back(); Temp orig_exec = ctx.info[idx].exec.back().first; - Temp else_mask = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2, exec), + Temp else_mask = bld.sop2(Builder::s_andn2, bld.def(bld.lm, exec), bld.def(s1, scc), orig_exec, bld.exec(then_mask)); /* add next current exec to the stack */ @@ -998,7 +1004,7 @@ void add_branch_code(exec_ctx& ctx, Block* block) for (int exec_idx = ctx.info[idx].exec.size() - 2; exec_idx >= 0; exec_idx--) { cond = bld.tmp(s1); Temp exec_mask = ctx.info[idx].exec[exec_idx].first; - exec_mask = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.scc(Definition(cond)), + exec_mask = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.scc(Definition(cond)), exec_mask, current_exec); ctx.info[idx].exec[exec_idx].first = exec_mask; if (ctx.info[idx].exec[exec_idx].second & mask_type_loop) @@ -1010,7 +1016,7 @@ void add_branch_code(exec_ctx& ctx, Block* block) unsigned succ_idx = ctx.program->blocks[block->linear_succs[1]].linear_succs[0]; Block& succ = ctx.program->blocks[succ_idx]; if (!(succ.kind & block_kind_invert || succ.kind & block_kind_merge)) { - ctx.info[idx].exec.back().first = bld.sop1(aco_opcode::s_mov_b64, 
bld.def(s2, exec), Operand(0u)); + ctx.info[idx].exec.back().first = bld.sop1(Builder::s_mov, bld.def(bld.lm, exec), Operand(0u)); } bld.branch(aco_opcode::p_cbranch_nz, bld.scc(cond), block->linear_succs[1], block->linear_succs[0]); @@ -1028,7 +1034,7 @@ void add_branch_code(exec_ctx& ctx, Block* block) break; cond = bld.tmp(s1); Temp exec_mask = ctx.info[idx].exec[exec_idx].first; - exec_mask = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.scc(Definition(cond)), + exec_mask = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.scc(Definition(cond)), exec_mask, bld.exec(current_exec)); ctx.info[idx].exec[exec_idx].first = exec_mask; } @@ -1039,7 +1045,7 @@ void add_branch_code(exec_ctx& ctx, Block* block) unsigned succ_idx = ctx.program->blocks[block->linear_succs[1]].linear_succs[0]; Block& succ = ctx.program->blocks[succ_idx]; if (!(succ.kind & block_kind_invert || succ.kind & block_kind_merge)) { - ctx.info[idx].exec.back().first = bld.sop1(aco_opcode::s_mov_b64, bld.def(s2, exec), Operand(0u)); + ctx.info[idx].exec.back().first = bld.sop1(Builder::s_mov, bld.def(bld.lm, exec), Operand(0u)); } bld.branch(aco_opcode::p_cbranch_nz, bld.scc(cond), block->linear_succs[1], block->linear_succs[0]); diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index a2b2c21170c..9de9d5dec14 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -351,12 +351,12 @@ Temp bool_to_vector_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s2 { Builder bld(ctx->program, ctx->block); if (!dst.id()) - dst = bld.tmp(s2); + dst = bld.tmp(bld.lm); assert(val.regClass() == s1); - assert(dst.regClass() == s2); + assert(dst.regClass() == bld.lm); - return bld.sop2(aco_opcode::s_cselect_b64, bld.hint_vcc(Definition(dst)), Operand((uint32_t) -1), Operand(0u), bld.scc(val)); + return bld.sop2(Builder::s_cselect, bld.hint_vcc(Definition(dst)), Operand((uint32_t) -1), Operand(0u), bld.scc(val)); } Temp bool_to_scalar_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s1)) @@ -365,12 +365,12 @@ Temp bool_to_scalar_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s1 if (!dst.id()) dst = bld.tmp(s1); - assert(val.regClass() == s2); + assert(val.regClass() == bld.lm); assert(dst.regClass() == s1); /* if we're currently in WQM mode, ensure that the source is also computed in WQM */ Temp tmp = bld.tmp(s1); - bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.scc(Definition(tmp)), val, Operand(exec, s2)); + bld.sop2(Builder::s_and, bld.def(bld.lm), bld.scc(Definition(tmp)), val, Operand(exec, bld.lm)); return emit_wqm(ctx, tmp, dst); } @@ -489,6 +489,8 @@ void emit_vopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode o { Temp src0 = get_alu_src(ctx, instr->src[0]); Temp src1 = get_alu_src(ctx, instr->src[1]); + assert(src0.size() == src1.size()); + aco_ptr vopc; if (src1.type() == RegType::sgpr) { if (src0.type() == RegType::vgpr) { @@ -549,12 +551,13 @@ void emit_sopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode o { Temp src0 = get_alu_src(ctx, instr->src[0]); Temp src1 = get_alu_src(ctx, instr->src[1]); + Builder bld(ctx->program, ctx->block); - assert(dst.regClass() == s2); + assert(dst.regClass() == bld.lm); assert(src0.type() == RegType::sgpr); assert(src1.type() == RegType::sgpr); + assert(src0.regClass() == src1.regClass()); - Builder bld(ctx->program, ctx->block); /* Emit the SALU comparison instruction */ Temp cmp = bld.sopc(op, 
bld.scc(bld.def(s1)), src0, src1); /* Turn the result into a per-lane bool */ @@ -580,17 +583,17 @@ void emit_comparison(isel_context *ctx, nir_alu_instr *instr, Temp dst, emit_sopc_instruction(ctx, instr, op, dst); } -void emit_boolean_logic(isel_context *ctx, nir_alu_instr *instr, aco_opcode op64, Temp dst) +void emit_boolean_logic(isel_context *ctx, nir_alu_instr *instr, Builder::WaveSpecificOpcode op, Temp dst) { Builder bld(ctx->program, ctx->block); Temp src0 = get_alu_src(ctx, instr->src[0]); Temp src1 = get_alu_src(ctx, instr->src[1]); - assert(dst.regClass() == s2); - assert(src0.regClass() == s2); - assert(src1.regClass() == s2); + assert(dst.regClass() == bld.lm); + assert(src0.regClass() == bld.lm); + assert(src1.regClass() == bld.lm); - bld.sop2(op64, Definition(dst), bld.def(s1, scc), src0, src1); + bld.sop2(op, Definition(dst), bld.def(s1, scc), src0, src1); } void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst) @@ -600,7 +603,7 @@ void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst) Temp then = get_alu_src(ctx, instr->src[1]); Temp els = get_alu_src(ctx, instr->src[2]); - assert(cond.regClass() == s2); + assert(cond.regClass() == bld.lm); if (dst.type() == RegType::vgpr) { aco_ptr bcsel; @@ -628,14 +631,15 @@ void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst) } if (instr->dest.dest.ssa.bit_size == 1) { - assert(dst.regClass() == s2); - assert(then.regClass() == s2); - assert(els.regClass() == s2); + assert(dst.regClass() == bld.lm); + assert(then.regClass() == bld.lm); + assert(els.regClass() == bld.lm); } if (!ctx->divergent_vals[instr->src[0].src.ssa->index]) { /* uniform condition and values in sgpr */ if (dst.regClass() == s1 || dst.regClass() == s2) { assert((then.regClass() == s1 || then.regClass() == s2) && els.regClass() == then.regClass()); + assert(dst.size() == then.size()); aco_opcode op = dst.regClass() == s1 ? 
aco_opcode::s_cselect_b32 : aco_opcode::s_cselect_b64; bld.sop2(op, Definition(dst), then, els, bld.scc(bool_to_scalar_condition(ctx, cond))); } else { @@ -652,20 +656,20 @@ void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst) assert(instr->dest.dest.ssa.bit_size == 1); if (cond.id() != then.id()) - then = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), cond, then); + then = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), cond, then); if (cond.id() == els.id()) - bld.sop1(aco_opcode::s_mov_b64, Definition(dst), then); + bld.sop1(Builder::s_mov, Definition(dst), then); else - bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), then, - bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), els, cond)); + bld.sop2(Builder::s_or, Definition(dst), bld.def(s1, scc), then, + bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), els, cond)); } void emit_scaled_op(isel_context *ctx, Builder& bld, Definition dst, Temp val, aco_opcode op, uint32_t undo) { /* multiply by 16777216 to handle denormals */ - Temp is_denormal = bld.vopc(aco_opcode::v_cmp_class_f32, bld.hint_vcc(bld.def(s2)), + Temp is_denormal = bld.vopc(aco_opcode::v_cmp_class_f32, bld.hint_vcc(bld.def(bld.lm)), as_vgpr(ctx, val), bld.copy(bld.def(v1), Operand((1u << 7) | (1u << 4)))); Temp scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x4b800000u), val); scaled = bld.vop1(op, bld.def(v1), scaled); @@ -766,9 +770,9 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_inot: { Temp src = get_alu_src(ctx, instr->src[0]); if (instr->dest.dest.ssa.bit_size == 1) { - assert(src.regClass() == s2); - assert(dst.regClass() == s2); - bld.sop2(aco_opcode::s_andn2_b64, Definition(dst), bld.def(s1, scc), Operand(exec, s2), src); + assert(src.regClass() == bld.lm); + assert(dst.regClass() == bld.lm); + bld.sop2(Builder::s_andn2, Definition(dst), bld.def(s1, scc), Operand(exec, bld.lm), src); } else if (dst.regClass() == v1) { emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst); } else if (dst.type() == RegType::sgpr) { @@ -835,12 +839,12 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), neg, neqz); } else if (dst.regClass() == v1) { Temp tmp = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src); - Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(s2)), Operand(0u), src); + Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src); bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(1u), tmp, gtz); } else if (dst.regClass() == v2) { Temp upper = emit_extract_vector(ctx, src, 1, v1); Temp neg = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), upper); - Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i64, bld.hint_vcc(bld.def(s2)), Operand(0u), src); + Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src); Temp lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(1u), neg, gtz); upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), neg, gtz); bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); @@ -901,7 +905,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } case nir_op_ior: { if (instr->dest.dest.ssa.bit_size == 1) { - emit_boolean_logic(ctx, instr, aco_opcode::s_or_b64, dst); + emit_boolean_logic(ctx, instr, Builder::s_or, dst); } else if (dst.regClass() == v1) { 
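
// Sketch, not part of the patch: with 1-bit NIR values carried in lane masks
// (regclass bld.lm), the divergent bcsel in emit_bcsel() above is plain mask
// arithmetic rather than a branch:
//    dst = (cond & then) | (els & ~cond)
// and the boolean ALU ops (ior/iand/ixor) likewise map to a single
// wave-specific s_or/s_and/s_xor over the whole mask, one result bit per lane.
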
emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true); } else if (dst.regClass() == s1) { @@ -917,7 +921,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } case nir_op_iand: { if (instr->dest.dest.ssa.bit_size == 1) { - emit_boolean_logic(ctx, instr, aco_opcode::s_and_b64, dst); + emit_boolean_logic(ctx, instr, Builder::s_and, dst); } else if (dst.regClass() == v1) { emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true); } else if (dst.regClass() == s1) { @@ -933,7 +937,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } case nir_op_ixor: { if (instr->dest.dest.ssa.bit_size == 1) { - emit_boolean_logic(ctx, instr, aco_opcode::s_xor_b64, dst); + emit_boolean_logic(ctx, instr, Builder::s_xor, dst); } else if (dst.regClass() == v1) { emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, dst, true); } else if (dst.regClass() == s1) { @@ -1709,16 +1713,16 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_fsign: { Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0])); if (dst.size() == 1) { - Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(s2)), Operand(0u), src); + Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src); src = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0x3f800000u), src, cond); - cond = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(s2)), Operand(0u), src); + cond = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src); bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0xbf800000u), src, cond); } else if (dst.size() == 2) { - Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.hint_vcc(bld.def(s2)), Operand(0u), src); + Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src); Temp tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0x3FF00000u)); Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, emit_extract_vector(ctx, src, 1, v1), cond); - cond = bld.vopc(aco_opcode::v_cmp_le_f64, bld.hint_vcc(bld.def(s2)), Operand(0u), src); + cond = bld.vopc(aco_opcode::v_cmp_le_f64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src); tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0xBFF00000u)); upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, upper, cond); @@ -1922,7 +1926,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp src = get_alu_src(ctx, instr->src[0]); if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::vgpr) { Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src); - Temp exponent_in_range = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(s2)), Operand(64u), exponent); + Temp exponent_in_range = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(bld.lm)), Operand(64u), exponent); exponent = bld.vop2(aco_opcode::v_max_i32, bld.def(v1), Operand(0x0u), exponent); Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffu), src); mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(0x800000u), mantissa); @@ -1986,7 +1990,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } case nir_op_b2f32: { Temp src = get_alu_src(ctx, instr->src[0]); - assert(src.regClass() == s2); + assert(src.regClass() == bld.lm); if (dst.regClass() == s1) { src = bool_to_scalar_condition(ctx, src); @@ -2000,7 +2004,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } case 
nir_op_b2f64: { Temp src = get_alu_src(ctx, instr->src[0]); - assert(src.regClass() == s2); + assert(src.regClass() == bld.lm); if (dst.regClass() == s2) { src = bool_to_scalar_condition(ctx, src); @@ -2073,7 +2077,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } case nir_op_b2i32: { Temp src = get_alu_src(ctx, instr->src[0]); - assert(src.regClass() == s2); + assert(src.regClass() == bld.lm); if (dst.regClass() == s1) { // TODO: in a post-RA optimization, we can check if src is in VCC, and directly use VCCNZ @@ -2087,7 +2091,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } case nir_op_i2b1: { Temp src = get_alu_src(ctx, instr->src[0]); - assert(dst.regClass() == s2); + assert(dst.regClass() == bld.lm); if (src.type() == RegType::vgpr) { assert(src.regClass() == v1 || src.regClass() == v2); @@ -2164,7 +2168,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp mask = bld.copy(bld.def(s1), Operand(0x36Fu)); /* value is NOT negative/positive denormal value */ - Temp cmp_res = bld.tmp(s2); + Temp cmp_res = bld.tmp(bld.lm); bld.vopc_e64(aco_opcode::v_cmp_class_f16, Definition(cmp_res), f16, mask).def(0).setHint(vcc); Temp f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16); @@ -2338,14 +2342,14 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } case nir_op_ieq: { if (instr->src[0].src.ssa->bit_size == 1) - emit_boolean_logic(ctx, instr, aco_opcode::s_xnor_b64, dst); + emit_boolean_logic(ctx, instr, Builder::s_xnor, dst); else emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_i32, aco_opcode::v_cmp_eq_i64, aco_opcode::s_cmp_eq_i32, aco_opcode::s_cmp_eq_u64); break; } case nir_op_ine: { if (instr->src[0].src.ssa->bit_size == 1) - emit_boolean_logic(ctx, instr, aco_opcode::s_xor_b64, dst); + emit_boolean_logic(ctx, instr, Builder::s_xor, dst); else emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lg_i32, aco_opcode::v_cmp_lg_i64, aco_opcode::s_cmp_lg_i32, aco_opcode::s_cmp_lg_u64); break; @@ -2405,8 +2409,10 @@ void visit_load_const(isel_context *ctx, nir_load_const_instr *instr) Builder bld(ctx->program, ctx->block); if (instr->def.bit_size == 1) { - assert(dst.regClass() == s2); - bld.sop1(aco_opcode::s_mov_b64, Definition(dst), Operand((uint64_t)(instr->value[0].b ? -1 : 0))); + assert(dst.regClass() == bld.lm); + int val = instr->value[0].b ? -1 : 0; + Operand op = bld.lm.size() == 1 ? Operand((uint32_t) val) : Operand((uint64_t) val); + bld.sop1(Builder::s_mov, Definition(dst), op); } else if (dst.size() == 1) { bld.copy(Definition(dst), Operand(instr->value[0].u32)); } else { @@ -3033,7 +3039,7 @@ Temp adjust_vertex_fetch_alpha(isel_context *ctx, unsigned adjustment, Temp alph /* Convert back to the right type. 
*/ if (adjustment == RADV_ALPHA_ADJUST_SNORM) { alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha); - Temp clamp = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(s2)), Operand(0xbf800000u), alpha); + Temp clamp = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0xbf800000u), alpha); alpha = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xbf800000u), alpha, clamp); } else if (adjustment == RADV_ALPHA_ADJUST_SSCALED) { alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha); @@ -3599,8 +3605,8 @@ void visit_discard_if(isel_context *ctx, nir_intrinsic_instr *instr) // TODO: optimize uniform conditions Builder bld(ctx->program, ctx->block); Temp src = get_ssa_temp(ctx, instr->src[0].ssa); - assert(src.regClass() == s2); - src = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2)); + assert(src.regClass() == bld.lm); + src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)); bld.pseudo(aco_opcode::p_discard_if, src); ctx->block->kind |= block_kind_uses_discard_if; return; @@ -3663,7 +3669,7 @@ void visit_discard(isel_context* ctx, nir_intrinsic_instr *instr) ctx->program->needs_exact = true; /* save exec somewhere temporarily so that it doesn't get * overwritten before the discard from outer exec masks */ - Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), Operand(0xFFFFFFFF), Operand(exec, s2)); + Temp cond = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), Operand(0xFFFFFFFF), Operand(exec, bld.lm)); bld.pseudo(aco_opcode::p_discard_if, cond); ctx->block->kind |= block_kind_uses_discard_if; return; @@ -3950,7 +3956,7 @@ static Temp adjust_sample_index_using_fmask(isel_context *ctx, bool da, Temp coo /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK * resource descriptor is 0 (invalid), */ - Temp compare = bld.tmp(s2); + Temp compare = bld.tmp(bld.lm); bld.vopc_e64(aco_opcode::v_cmp_lg_u32, Definition(compare), Operand(0u), emit_extract_vector(ctx, fmask_desc_ptr, 1, s1)).def(0).setHint(vcc); @@ -4739,12 +4745,12 @@ void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr) if (offset > 0 && ctx->options->chip_class < GFX9) { Temp addr0 = bld.tmp(v1), addr1 = bld.tmp(v1); Temp new_addr0 = bld.tmp(v1), new_addr1 = bld.tmp(v1); - Temp carry = bld.tmp(s2); + Temp carry = bld.tmp(bld.lm); bld.pseudo(aco_opcode::p_split_vector, Definition(addr0), Definition(addr1), addr); bld.vop2(aco_opcode::v_add_co_u32, Definition(new_addr0), bld.hint_vcc(Definition(carry)), Operand(offset), addr0); - bld.vop2(aco_opcode::v_addc_co_u32, Definition(new_addr1), bld.def(s2), + bld.vop2(aco_opcode::v_addc_co_u32, Definition(new_addr1), bld.def(bld.lm), Operand(0u), addr1, carry).def(1).setHint(vcc); @@ -5219,25 +5225,25 @@ Temp emit_boolean_reduce(isel_context *ctx, nir_op op, unsigned cluster_size, Te return src; } if (op == nir_op_iand && cluster_size == 4) { //subgroupClusteredAnd(val, 4) -> ~wqm(exec & ~val) - Temp tmp = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), Operand(exec, s2), src); - return bld.sop1(aco_opcode::s_not_b64, bld.def(s2), bld.def(s1, scc), - bld.sop1(aco_opcode::s_wqm_b64, bld.def(s2), bld.def(s1, scc), tmp)); + Temp tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src); + return bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), + bld.sop1(Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc), tmp)); } else if (op == 
nir_op_ior && cluster_size == 4) { //subgroupClusteredOr(val, 4) -> wqm(val & exec) - return bld.sop1(aco_opcode::s_wqm_b64, bld.def(s2), bld.def(s1, scc), - bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2))); + return bld.sop1(Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc), + bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm))); } else if (op == nir_op_iand && cluster_size == 64) { //subgroupAnd(val) -> (exec & ~val) == 0 - Temp tmp = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), Operand(exec, s2), src).def(1).getTemp(); - return bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), Operand(0u), Operand(-1u), bld.scc(tmp)); + Temp tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src).def(1).getTemp(); + return bld.sop2(Builder::s_cselect, bld.def(bld.lm), Operand(0u), Operand(-1u), bld.scc(tmp)); } else if (op == nir_op_ior && cluster_size == 64) { //subgroupOr(val) -> (val & exec) != 0 - Temp tmp = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2)).def(1).getTemp(); + Temp tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)).def(1).getTemp(); return bool_to_vector_condition(ctx, tmp); } else if (op == nir_op_ixor && cluster_size == 64) { //subgroupXor(val) -> s_bcnt1_i32_b64(val & exec) & 1 - Temp tmp = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2)); - tmp = bld.sop1(aco_opcode::s_bcnt1_i32_b64, bld.def(s1), bld.def(s1, scc), tmp); + Temp tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)); + tmp = bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), tmp); tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), tmp, Operand(1u)).def(1).getTemp(); return bool_to_vector_condition(ctx, tmp); } else { @@ -5256,25 +5262,28 @@ Temp emit_boolean_reduce(isel_context *ctx, nir_op op, unsigned cluster_size, Te Temp tmp; if (op == nir_op_iand) - tmp = bld.sop2(aco_opcode::s_orn2_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2)); + tmp = bld.sop2(Builder::s_orn2, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)); else - tmp = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2)); + tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)); uint32_t cluster_mask = cluster_size == 32 ? 
-1 : (1u << cluster_size) - 1u; - tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), cluster_offset, tmp); + if (ctx->program->wave_size == 64) + tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), cluster_offset, tmp); + else + tmp = bld.vop2_e64(aco_opcode::v_lshrrev_b32, bld.def(v1), cluster_offset, tmp); tmp = emit_extract_vector(ctx, tmp, 0, v1); if (cluster_mask != 0xffffffff) tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(cluster_mask), tmp); Definition cmp_def = Definition(); if (op == nir_op_iand) { - cmp_def = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(s2), Operand(cluster_mask), tmp).def(0); + cmp_def = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm), Operand(cluster_mask), tmp).def(0); } else if (op == nir_op_ior) { - cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), tmp).def(0); + cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), tmp).def(0); } else if (op == nir_op_ixor) { tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), tmp, Operand(0u))); - cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), tmp).def(0); + cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), tmp).def(0); } cmp_def.setHint(vcc); return cmp_def.getTemp(); @@ -5290,9 +5299,9 @@ Temp emit_boolean_exclusive_scan(isel_context *ctx, nir_op op, Temp src) //subgroupExclusiveXor(val) -> mbcnt(val & exec) & 1 != 0 Temp tmp; if (op == nir_op_iand) - tmp = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), Operand(exec, s2), src); + tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src); else - tmp = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2)); + tmp = bld.sop2(Builder::s_and, bld.def(s2), bld.def(s1, scc), src, Operand(exec, bld.lm)); Builder::Result lohi = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), tmp); Temp lo = lohi.def(0).getTemp(); @@ -5301,11 +5310,11 @@ Temp emit_boolean_exclusive_scan(isel_context *ctx, nir_op op, Temp src) Definition cmp_def = Definition(); if (op == nir_op_iand) - cmp_def = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(s2), Operand(0u), mbcnt).def(0); + cmp_def = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm), Operand(0u), mbcnt).def(0); else if (op == nir_op_ior) - cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), mbcnt).def(0); + cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), mbcnt).def(0); else if (op == nir_op_ixor) - cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), + cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), mbcnt)).def(0); cmp_def.setHint(vcc); return cmp_def.getTemp(); @@ -5320,11 +5329,11 @@ Temp emit_boolean_inclusive_scan(isel_context *ctx, nir_op op, Temp src) //subgroupInclusiveXor(val) -> subgroupExclusiveXor(val) ^^ val Temp tmp = emit_boolean_exclusive_scan(ctx, op, src); if (op == nir_op_iand) - return bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), tmp, src); + return bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), tmp, src); else if (op == nir_op_ior) - return bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), tmp, src); + return bld.sop2(Builder::s_or, bld.def(bld.lm), bld.def(s1, scc), tmp, src); else if (op == nir_op_ixor) - return bld.sop2(aco_opcode::s_xor_b64, bld.def(s2), 
bld.def(s1, scc), tmp, src); + return bld.sop2(Builder::s_xor, bld.def(bld.lm), bld.def(s1, scc), tmp, src); assert(false); return Temp(); @@ -5453,7 +5462,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) Temp pck0 = bld.tmp(v1); Temp carry = bld.vadd32(Definition(pck0), tmp0, addr, true).def(1).getTemp(); tmp1 = as_vgpr(ctx, tmp1); - Temp pck1 = bld.vop2_e64(aco_opcode::v_addc_co_u32, bld.def(v1), bld.hint_vcc(bld.def(s2)), tmp1, Operand(0u), carry); + Temp pck1 = bld.vop2_e64(aco_opcode::v_addc_co_u32, bld.def(v1), bld.hint_vcc(bld.def(bld.lm)), tmp1, Operand(0u), carry); addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), pck0, pck1); /* sample_pos = flat_load_dwordx2 addr */ @@ -5685,11 +5694,12 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) break; } case nir_intrinsic_ballot: { - Definition tmp = bld.def(s2); Temp src = get_ssa_temp(ctx, instr->src[0].ssa); + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + Definition tmp = bld.def(dst.regClass()); if (instr->src[0].ssa->bit_size == 1) { - assert(src.regClass() == s2); - bld.sop2(aco_opcode::s_and_b64, tmp, bld.def(s1, scc), Operand(exec, s2), src); + assert(src.regClass() == bld.lm); + bld.sop2(Builder::s_and, tmp, bld.def(s1, scc), Operand(exec, bld.lm), src); } else if (instr->src[0].ssa->bit_size == 32 && src.regClass() == v1) { bld.vopc(aco_opcode::v_cmp_lg_u32, tmp, Operand(0u), src); } else if (instr->src[0].ssa->bit_size == 64 && src.regClass() == v2) { @@ -5699,7 +5709,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) nir_print_instr(&instr->instr, stderr); fprintf(stderr, "\n"); } - emit_wqm(ctx, tmp.getTemp(), get_ssa_temp(ctx, &instr->dest.ssa)); + emit_wqm(ctx, tmp.getTemp(), dst); break; } case nir_intrinsic_shuffle: @@ -5722,15 +5732,19 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi); emit_split_vector(ctx, dst, 2); } else if (instr->dest.ssa.bit_size == 1 && tid.regClass() == s1) { - assert(src.regClass() == s2); - Temp tmp = bld.sopc(aco_opcode::s_bitcmp1_b64, bld.def(s1, scc), src, tid); + assert(src.regClass() == bld.lm); + Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src, tid); bool_to_vector_condition(ctx, emit_wqm(ctx, tmp), dst); } else if (instr->dest.ssa.bit_size == 1 && tid.regClass() == v1) { - assert(src.regClass() == s2); - Temp tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), tid, src); + assert(src.regClass() == bld.lm); + Temp tmp; + if (ctx->program->wave_size == 64) + tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), tid, src); + else + tmp = bld.vop2_e64(aco_opcode::v_lshrrev_b32, bld.def(v1), tid, src); tmp = emit_extract_vector(ctx, tmp, 0, v1); tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), tmp); - emit_wqm(ctx, bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), tmp), dst); + emit_wqm(ctx, bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), tmp), dst); } else { fprintf(stderr, "Unimplemented NIR instr bit size: "); nir_print_instr(&instr->instr, stderr); @@ -5763,9 +5777,9 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi); emit_split_vector(ctx, dst, 2); } else if (instr->dest.ssa.bit_size == 1) { - assert(src.regClass() == s2); - Temp tmp = bld.sopc(aco_opcode::s_bitcmp1_b64, bld.def(s1, scc), src, - bld.sop1(aco_opcode::s_ff1_i32_b64, bld.def(s1), Operand(exec, s2))); + 
assert(src.regClass() == bld.lm); + Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src, + bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm))); bool_to_vector_condition(ctx, emit_wqm(ctx, tmp), dst); } else if (src.regClass() == s1) { bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src); @@ -5781,22 +5795,22 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) case nir_intrinsic_vote_all: { Temp src = get_ssa_temp(ctx, instr->src[0].ssa); Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); - assert(src.regClass() == s2); - assert(dst.regClass() == s2); + assert(src.regClass() == bld.lm); + assert(dst.regClass() == bld.lm); - Temp tmp = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), Operand(exec, s2), src).def(1).getTemp(); - Temp val = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), Operand(0u), Operand(-1u), bld.scc(tmp)); + Temp tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src).def(1).getTemp(); + Temp val = bld.sop2(Builder::s_cselect, bld.def(bld.lm), Operand(0u), Operand(-1u), bld.scc(tmp)); emit_wqm(ctx, val, dst); break; } case nir_intrinsic_vote_any: { Temp src = get_ssa_temp(ctx, instr->src[0].ssa); Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); - assert(src.regClass() == s2); - assert(dst.regClass() == s2); + assert(src.regClass() == bld.lm); + assert(dst.regClass() == bld.lm); - Temp tmp = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), Operand(exec, s2), src).def(1).getTemp(); - Temp val = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), Operand(-1u), Operand(0u), bld.scc(tmp)); + Temp tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src).def(1).getTemp(); + Temp val = bld.sop2(Builder::s_cselect, bld.def(bld.lm), Operand(-1u), Operand(0u), bld.scc(tmp)); emit_wqm(ctx, val, dst); break; } @@ -5879,7 +5893,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) Temp tmp_dst = bld.tmp(dst.regClass()); reduce->definitions[0] = Definition(tmp_dst); - reduce->definitions[1] = bld.def(s2); // used internally + reduce->definitions[1] = bld.def(ctx->program->lane_mask); // used internally reduce->definitions[2] = Definition(); reduce->definitions[3] = Definition(scc, s1); reduce->definitions[4] = Definition(); @@ -5899,13 +5913,14 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); unsigned lane = nir_src_as_const_value(instr->src[1])->u32; if (instr->dest.ssa.bit_size == 1) { - assert(src.regClass() == s2); + assert(src.regClass() == bld.lm); + assert(dst.regClass() == bld.lm); uint32_t half_mask = 0x11111111u << lane; Temp mask_tmp = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(half_mask), Operand(half_mask)); - Temp tmp = bld.tmp(s2); - bld.sop1(aco_opcode::s_wqm_b64, Definition(tmp), - bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), mask_tmp, - bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2)))); + Temp tmp = bld.tmp(bld.lm); + bld.sop1(Builder::s_wqm, Definition(tmp), + bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), mask_tmp, + bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)))); emit_wqm(ctx, tmp, dst); } else if (instr->dest.ssa.bit_size == 32) { emit_wqm(ctx, @@ -5957,10 +5972,10 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); if 
(instr->dest.ssa.bit_size == 1) { - assert(src.regClass() == s2); + assert(src.regClass() == bld.lm); src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand((uint32_t)-1), src); src = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl); - Temp tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), src); + Temp tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), src); emit_wqm(ctx, tmp, dst); } else if (instr->dest.ssa.bit_size == 32) { Temp tmp = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl); @@ -6060,15 +6075,15 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) break; case nir_intrinsic_demote_if: { Temp src = get_ssa_temp(ctx, instr->src[0].ssa); - assert(src.regClass() == s2); - Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2)); + assert(src.regClass() == bld.lm); + Temp cond = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)); bld.pseudo(aco_opcode::p_demote_to_helper, cond); ctx->block->kind |= block_kind_uses_demote; ctx->program->needs_exact = true; break; } case nir_intrinsic_first_invocation: { - emit_wqm(ctx, bld.sop1(aco_opcode::s_ff1_i32_b64, bld.def(s1), Operand(exec, s2)), + emit_wqm(ctx, bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)), get_ssa_temp(ctx, &instr->dest.ssa)); break; } @@ -6180,14 +6195,14 @@ void build_cube_select(isel_context *ctx, Temp ma, Temp id, Temp deriv, Operand two(0x40000000u); Operand four(0x40800000u); - Temp is_ma_positive = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(s2)), Operand(0u), ma); + Temp is_ma_positive = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), ma); Temp sgn_ma = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, one, is_ma_positive); Temp neg_sgn_ma = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand(0u), sgn_ma); - Temp is_ma_z = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(s2)), four, id); + Temp is_ma_z = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), four, id); Temp is_ma_y = bld.vopc(aco_opcode::v_cmp_le_f32, bld.def(s2), two, id); - is_ma_y = bld.sop2(aco_opcode::s_andn2_b64, bld.hint_vcc(bld.def(s2)), is_ma_y, is_ma_z); - Temp is_not_ma_x = bld.sop2(aco_opcode::s_or_b64, bld.hint_vcc(bld.def(s2)), bld.def(s1, scc), is_ma_z, is_ma_y); + is_ma_y = bld.sop2(Builder::s_andn2, bld.hint_vcc(bld.def(bld.lm)), is_ma_y, is_ma_z); + Temp is_not_ma_x = bld.sop2(aco_opcode::s_or_b64, bld.hint_vcc(bld.def(bld.lm)), bld.def(s1, scc), is_ma_z, is_ma_y); // select sc Temp tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_z, deriv_x, is_not_ma_x); @@ -6667,7 +6682,7 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_SINT), bld.scc(compare_cube_wa)); } - tg4_compare_cube_wa64 = bld.tmp(s2); + tg4_compare_cube_wa64 = bld.tmp(bld.lm); bool_to_vector_condition(ctx, compare_cube_wa, tg4_compare_cube_wa64); nfmt = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), nfmt, Operand(26u)); @@ -6800,7 +6815,7 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) assert(dmask == 1 && dst.regClass() == v1); assert(dst.id() != tmp_dst.id()); - Temp tmp = bld.tmp(s2); + Temp tmp = bld.tmp(bld.lm); bld.vopc(aco_opcode::v_cmp_eq_u32, Definition(tmp), Operand(0u), tmp_dst).def(0).setHint(vcc); bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand((uint32_t)-1), 
tmp); @@ -6921,7 +6936,7 @@ void visit_phi(isel_context *ctx, nir_phi_instr *instr) { aco_ptr phi; Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); - assert(instr->dest.ssa.bit_size != 1 || dst.regClass() == s2); + assert(instr->dest.ssa.bit_size != 1 || dst.regClass() == ctx->program->lane_mask); bool logical = !dst.is_linear() || ctx->divergent_vals[instr->dest.ssa.index]; logical |= ctx->block->kind & block_kind_merge; @@ -7295,7 +7310,7 @@ static void begin_divergent_if_then(isel_context *ctx, if_context *ic, Temp cond ctx->block->kind |= block_kind_branch; /* branch to linear then block */ - assert(cond.regClass() == s2); + assert(cond.regClass() == ctx->program->lane_mask); aco_ptr branch; branch.reset(create_instruction(aco_opcode::p_cbranch_z, Format::PSEUDO_BRANCH, 1, 0)); branch->operands[0] = Operand(cond); @@ -7439,7 +7454,7 @@ static void visit_if(isel_context *ctx, nir_if *if_stmt) ctx->block->kind |= block_kind_uniform; /* emit branch */ - assert(cond.regClass() == s2); + assert(cond.regClass() == bld.lm); // TODO: in a post-RA optimizer, we could check if the condition is in VCC and omit this instruction cond = bool_to_scalar_condition(ctx, cond); @@ -7825,7 +7840,7 @@ void handle_bc_optimize(isel_context *ctx) ctx->persp_centroid = get_arg(ctx, ctx->args->ac.persp_centroid); ctx->linear_centroid = get_arg(ctx, ctx->args->ac.linear_centroid); if (uses_center && uses_centroid) { - Temp sel = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(s2)), + Temp sel = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(bld.lm)), get_arg(ctx, ctx->args->ac.prim_mask), Operand(0u)); if (G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena)) { @@ -7934,7 +7949,7 @@ void select_program(Program *program, Builder bld(ctx.program, ctx.block); Temp count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), ctx.merged_wave_info, Operand((8u << 16) | (i * 8u))); Temp thread_id = emit_mbcnt(&ctx, bld.def(v1)); - Temp cond = bld.vopc(aco_opcode::v_cmp_gt_u32, bld.hint_vcc(bld.def(s2)), count, thread_id); + Temp cond = bld.vopc(aco_opcode::v_cmp_gt_u32, bld.hint_vcc(bld.def(bld.lm)), count, thread_id); begin_divergent_if_then(&ctx, &ic, cond); } diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp index ab96a4507cf..a7446c6c058 100644 --- a/src/amd/compiler/aco_instruction_selection_setup.cpp +++ b/src/amd/compiler/aco_instruction_selection_setup.cpp @@ -126,6 +126,7 @@ unsigned get_interp_input(nir_intrinsic_op intrin, enum glsl_interp_mode interp) void init_context(isel_context *ctx, nir_shader *shader) { nir_function_impl *impl = nir_shader_get_entrypoint(shader); + unsigned lane_mask_size = ctx->program->lane_mask.size(); ctx->shader = shader; ctx->divergent_vals = nir_divergence_analysis(shader, nir_divergence_view_index_uniform); @@ -207,7 +208,7 @@ void init_context(isel_context *ctx, nir_shader *shader) case nir_op_ieq: case nir_op_ine: case nir_op_i2b1: - size = 2; + size = lane_mask_size; break; case nir_op_f2i64: case nir_op_f2u64: @@ -219,7 +220,7 @@ void init_context(isel_context *ctx, nir_shader *shader) break; case nir_op_bcsel: if (alu_instr->dest.dest.ssa.bit_size == 1) { - size = 2; + size = lane_mask_size; } else { if (ctx->divergent_vals[alu_instr->dest.dest.ssa.index]) { type = RegType::vgpr; @@ -237,14 +238,14 @@ void init_context(isel_context *ctx, nir_shader *shader) break; case nir_op_mov: if (alu_instr->dest.dest.ssa.bit_size == 1) { - size = 2; + size = lane_mask_size; } else { type 
= ctx->divergent_vals[alu_instr->dest.dest.ssa.index] ? RegType::vgpr : RegType::sgpr; } break; default: if (alu_instr->dest.dest.ssa.bit_size == 1) { - size = 2; + size = lane_mask_size; } else { for (unsigned i = 0; i < nir_op_infos[alu_instr->op].num_inputs; i++) { if (allocated[alu_instr->src[i].src.ssa->index].type() == RegType::vgpr) @@ -261,7 +262,7 @@ void init_context(isel_context *ctx, nir_shader *shader) if (nir_instr_as_load_const(instr)->def.bit_size == 64) size *= 2; else if (nir_instr_as_load_const(instr)->def.bit_size == 1) - size *= 2; + size *= lane_mask_size; allocated[nir_instr_as_load_const(instr)->def.index] = Temp(0, RegClass(RegType::sgpr, size)); break; } @@ -289,11 +290,11 @@ void init_context(isel_context *ctx, nir_shader *shader) case nir_intrinsic_first_invocation: type = RegType::sgpr; if (intrinsic->dest.ssa.bit_size == 1) - size = 2; + size = lane_mask_size; break; case nir_intrinsic_ballot: type = RegType::sgpr; - size = 2; + size = lane_mask_size; break; case nir_intrinsic_load_sample_id: case nir_intrinsic_load_sample_mask_in: @@ -369,7 +370,7 @@ void init_context(isel_context *ctx, nir_shader *shader) case nir_intrinsic_inclusive_scan: case nir_intrinsic_exclusive_scan: if (intrinsic->dest.ssa.bit_size == 1) { - size = 2; + size = lane_mask_size; type = RegType::sgpr; } else if (!ctx->divergent_vals[intrinsic->dest.ssa.index]) { type = RegType::sgpr; @@ -384,11 +385,11 @@ void init_context(isel_context *ctx, nir_shader *shader) case nir_intrinsic_load_helper_invocation: case nir_intrinsic_is_helper_invocation: type = RegType::sgpr; - size = 2; + size = lane_mask_size; break; case nir_intrinsic_reduce: if (intrinsic->dest.ssa.bit_size == 1) { - size = 2; + size = lane_mask_size; type = RegType::sgpr; } else if (nir_intrinsic_cluster_size(intrinsic) == 0 || !ctx->divergent_vals[intrinsic->dest.ssa.index]) { @@ -489,7 +490,7 @@ void init_context(isel_context *ctx, nir_shader *shader) if (phi->dest.ssa.bit_size == 1) { assert(size == 1 && "multiple components not yet supported on boolean phis."); type = RegType::sgpr; - size *= 2; + size *= lane_mask_size; allocated[phi->dest.ssa.index] = Temp(0, RegClass(type, size)); break; } @@ -590,7 +591,7 @@ Pseudo_instruction *add_startpgm(struct isel_context *ctx) startpgm->definitions[arg].setFixed(PhysReg{file == AC_ARG_SGPR ? reg : reg + 256}); arg++; } - startpgm->definitions[arg_count] = Definition{ctx->program->allocateId(), exec, s2}; + startpgm->definitions[arg_count] = Definition{ctx->program->allocateId(), exec, ctx->program->lane_mask}; Pseudo_instruction *instr = startpgm.get(); ctx->block->instructions.push_back(std::move(startpgm)); @@ -796,6 +797,7 @@ setup_isel_context(Program* program, program->chip_class = args->options->chip_class; program->family = args->options->family; program->wave_size = args->shader_info->wave_size; + program->lane_mask = program->wave_size == 32 ? s1 : s2; program->lds_alloc_granule = args->options->chip_class >= GFX7 ? 512 : 256; program->lds_limit = args->options->chip_class >= GFX7 ? 
65536 : 32768; diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index 4073086662a..1f4721f5ffd 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -1149,6 +1149,7 @@ public: enum chip_class chip_class; enum radeon_family family; unsigned wave_size; + RegClass lane_mask; Stage stage; /* Stage */ bool needs_exact = false; /* there exists an instruction with disable_wqm = true */ bool needs_wqm = false; /* there exists a p_wqm instruction */ diff --git a/src/amd/compiler/aco_live_var_analysis.cpp b/src/amd/compiler/aco_live_var_analysis.cpp index 05ddb7bc68a..4255d56173b 100644 --- a/src/amd/compiler/aco_live_var_analysis.cpp +++ b/src/amd/compiler/aco_live_var_analysis.cpp @@ -54,7 +54,7 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block, bool exec_live = false; if (block->live_out_exec != Temp()) { live_sgprs.insert(block->live_out_exec); - new_demand.sgpr += 2; + new_demand.sgpr += program->lane_mask.size(); exec_live = true; } @@ -77,10 +77,10 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block, if (is_phi(insn)) break; - /* substract the 2 sgprs from exec */ + /* subtract the 1 or 2 sgprs from exec */ if (exec_live) - assert(new_demand.sgpr >= 2); - register_demand[idx] = RegisterDemand(new_demand.vgpr, new_demand.sgpr - (exec_live ? 2 : 0)); + assert(new_demand.sgpr >= (int16_t) program->lane_mask.size()); + register_demand[idx] = RegisterDemand(new_demand.vgpr, new_demand.sgpr - (exec_live ? program->lane_mask.size() : 0)); /* KILL */ for (Definition& definition : insn->definitions) { @@ -144,8 +144,8 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block, /* update block's register demand for a last time */ if (exec_live) - assert(new_demand.sgpr >= 2); - new_demand.sgpr -= exec_live ? 2 : 0; + assert(new_demand.sgpr >= (int16_t) program->lane_mask.size()); + new_demand.sgpr -= exec_live ?
program->lane_mask.size() : 0; block->register_demand.update(new_demand); /* handle phi definitions */ diff --git a/src/amd/compiler/aco_lower_bool_phis.cpp b/src/amd/compiler/aco_lower_bool_phis.cpp index dc64f0133b5..988f753c82d 100644 --- a/src/amd/compiler/aco_lower_bool_phis.cpp +++ b/src/amd/compiler/aco_lower_bool_phis.cpp @@ -54,12 +54,12 @@ Operand get_ssa(Program *program, unsigned block_idx, ssa_state *state) while (true) { auto pos = state->latest.find(block_idx); if (pos != state->latest.end()) - return Operand({pos->second, s2}); + return Operand({pos->second, program->lane_mask}); Block& block = program->blocks[block_idx]; size_t pred = block.linear_preds.size(); if (pred == 0) { - return Operand(s2); + return Operand(program->lane_mask); } else if (pred == 1) { block_idx = block.linear_preds[0]; continue; @@ -75,10 +75,10 @@ Operand get_ssa(Program *program, unsigned block_idx, ssa_state *state) state->phis[phi->operands[i].tempId()][(phi_use){&block, res}] |= (uint64_t)1 << i; } } - phi->definitions[0] = Definition(Temp{res, s2}); + phi->definitions[0] = Definition(Temp{res, program->lane_mask}); block.instructions.emplace(block.instructions.begin(), std::move(phi)); - return Operand({res, s2}); + return Operand({res, program->lane_mask}); } } } @@ -118,7 +118,7 @@ Temp write_ssa(Program *program, Block *block, ssa_state *state, unsigned previo update_phi(program, state, phi.first.block, phi.first.phi_def, phi.second); } - return {id, s2}; + return {id, program->lane_mask}; } void insert_before_logical_end(Block *block, aco_ptr instr) @@ -150,23 +150,25 @@ void lower_divergent_bool_phi(Program *program, Block *block, aco_ptroperands[i].isTemp()); Temp phi_src = phi->operands[i].getTemp(); - assert(phi_src.regClass() == s2); + assert(phi_src.regClass() == bld.lm); Operand cur = get_ssa(program, pred->index, &state); + assert(cur.regClass() == bld.lm); Temp new_cur = write_ssa(program, pred, &state, cur.isTemp() ? cur.tempId() : 0); + assert(new_cur.regClass() == bld.lm); if (cur.isUndefined()) { insert_before_logical_end(pred, bld.sop1(aco_opcode::s_mov_b64, Definition(new_cur), phi_src).get_ptr()); } else { - Temp tmp1 = bld.tmp(s2), tmp2 = bld.tmp(s2); + Temp tmp1 = bld.tmp(bld.lm), tmp2 = bld.tmp(bld.lm); insert_before_logical_end(pred, - bld.sop2(aco_opcode::s_andn2_b64, Definition(tmp1), bld.def(s1, scc), - cur, Operand(exec, s2)).get_ptr()); + bld.sop2(Builder::s_andn2, Definition(tmp1), bld.def(s1, scc), + cur, Operand(exec, bld.lm)).get_ptr()); insert_before_logical_end(pred, - bld.sop2(aco_opcode::s_and_b64, Definition(tmp2), bld.def(s1, scc), - phi_src, Operand(exec, s2)).get_ptr()); + bld.sop2(Builder::s_and, Definition(tmp2), bld.def(s1, scc), + phi_src, Operand(exec, bld.lm)).get_ptr()); insert_before_logical_end(pred, - bld.sop2(aco_opcode::s_or_b64, Definition(new_cur), bld.def(s1, scc), + bld.sop2(Builder::s_or, Definition(new_cur), bld.def(s1, scc), tmp1, tmp2).get_ptr()); } } @@ -192,8 +194,8 @@ void lower_bool_phis(Program* program) for (Block& block : program->blocks) { for (aco_ptr& phi : block.instructions) { if (phi->opcode == aco_opcode::p_phi) { - assert(phi->definitions[0].regClass() != s1); - if (phi->definitions[0].regClass() == s2) + assert(program->wave_size == 64 ? 
phi->definitions[0].regClass() != s1 : phi->definitions[0].regClass() != s2); + if (phi->definitions[0].regClass() == program->lane_mask) lower_divergent_bool_phi(program, &block, phi); } else if (!is_phi(phi)) { break; diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp index cbb3b55179c..e9c2d66d823 100644 --- a/src/amd/compiler/aco_lower_to_hw_instr.cpp +++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -76,8 +76,10 @@ aco_opcode get_reduce_opcode(chip_class chip, ReduceOp op) { void emit_vadd32(Builder& bld, Definition def, Operand src0, Operand src1) { Instruction *instr = bld.vadd32(def, src0, src1, false, Operand(s2), true); - if (instr->definitions.size() >= 2) + if (instr->definitions.size() >= 2) { + assert(instr->definitions[1].regClass() == bld.lm); instr->definitions[1].setFixed(vcc); + } } void emit_int64_dpp_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg, @@ -99,12 +101,12 @@ void emit_int64_dpp_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, Ph bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]); bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask, bound_ctrl); - bld.vop3(aco_opcode::v_add_co_u32_e64, dst[0], bld.def(s2, vcc), vtmp_op[0], src1[0]); + bld.vop3(aco_opcode::v_add_co_u32_e64, dst[0], bld.def(bld.lm, vcc), vtmp_op[0], src1[0]); } else { - bld.vop2_dpp(aco_opcode::v_add_co_u32, dst[0], bld.def(s2, vcc), src0[0], src1[0], + bld.vop2_dpp(aco_opcode::v_add_co_u32, dst[0], bld.def(bld.lm, vcc), src0[0], src1[0], dpp_ctrl, row_mask, bank_mask, bound_ctrl); } - bld.vop2_dpp(aco_opcode::v_addc_co_u32, dst[1], bld.def(s2, vcc), src0[1], src1[1], Operand(vcc, s2), + bld.vop2_dpp(aco_opcode::v_addc_co_u32, dst[1], bld.def(bld.lm, vcc), src0[1], src1[1], Operand(vcc, bld.lm), dpp_ctrl, row_mask, bank_mask, bound_ctrl); } else if (op == iand64) { bld.vop2_dpp(aco_opcode::v_and_b32, dst[0], src0[0], src1[0], @@ -149,9 +151,9 @@ void emit_int64_dpp_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, Ph bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[1], src0[1], dpp_ctrl, row_mask, bank_mask, bound_ctrl); - bld.vopc(cmp, bld.def(s2, vcc), vtmp_op64, src1_64); - bld.vop2(aco_opcode::v_cndmask_b32, dst[0], vtmp_op[0], src1[0], Operand(vcc, s2)); - bld.vop2(aco_opcode::v_cndmask_b32, dst[1], vtmp_op[1], src1[1], Operand(vcc, s2)); + bld.vopc(cmp, bld.def(bld.lm, vcc), vtmp_op64, src1_64); + bld.vop2(aco_opcode::v_cndmask_b32, dst[0], vtmp_op[0], src1[0], Operand(vcc, bld.lm)); + bld.vop2(aco_opcode::v_cndmask_b32, dst[1], vtmp_op[1], src1[1], Operand(vcc, bld.lm)); } else if (op == imul64) { /* t4 = dpp(x_hi) * t1 = umul_lo(t4, y_lo) @@ -216,11 +218,11 @@ void emit_int64_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysRe if (op == iadd64) { if (ctx->program->chip_class >= GFX10) { - bld.vop3(aco_opcode::v_add_co_u32_e64, dst[0], bld.def(s2, vcc), src0[0], src1[0]); + bld.vop3(aco_opcode::v_add_co_u32_e64, dst[0], bld.def(bld.lm, vcc), src0[0], src1[0]); } else { - bld.vop2(aco_opcode::v_add_co_u32, dst[0], bld.def(s2, vcc), src0[0], src1[0]); + bld.vop2(aco_opcode::v_add_co_u32, dst[0], bld.def(bld.lm, vcc), src0[0], src1[0]); } - bld.vop2(aco_opcode::v_addc_co_u32, dst[1], bld.def(s2, vcc), src0[1], src1[1], Operand(vcc, s2)); + bld.vop2(aco_opcode::v_addc_co_u32, dst[1], bld.def(bld.lm, vcc), src0[1], src1[1], Operand(vcc, bld.lm)); } else if (op == iand64) { bld.vop2(aco_opcode::v_and_b32, dst[0], src0[0], src1[0]); 
bld.vop2(aco_opcode::v_and_b32, dst[1], src0[1], src1[1]); @@ -249,9 +251,9 @@ void emit_int64_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysRe break; } - bld.vopc(cmp, bld.def(s2, vcc), src0_64, src1_64); - bld.vop2(aco_opcode::v_cndmask_b32, dst[0], src0[0], src1[0], Operand(vcc, s2)); - bld.vop2(aco_opcode::v_cndmask_b32, dst[1], src0[1], src1[1], Operand(vcc, s2)); + bld.vopc(cmp, bld.def(bld.lm, vcc), src0_64, src1_64); + bld.vop2(aco_opcode::v_cndmask_b32, dst[0], src0[0], src1[0], Operand(vcc, bld.lm)); + bld.vop2(aco_opcode::v_cndmask_b32, dst[1], src0[1], src1[1], Operand(vcc, bld.lm)); } else if (op == imul64) { if (src1_reg == dst_reg) { /* it's fine if src0==dst but not if src1==dst */ @@ -298,7 +300,7 @@ void emit_dpp_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg if (!vop3) { if (opcode == aco_opcode::v_add_co_u32) - bld.vop2_dpp(opcode, dst, bld.def(s2, vcc), src0, src1, dpp_ctrl, row_mask, bank_mask, bound_ctrl); + bld.vop2_dpp(opcode, dst, bld.def(bld.lm, vcc), src0, src1, dpp_ctrl, row_mask, bank_mask, bound_ctrl); else bld.vop2_dpp(opcode, dst, src0, src1, dpp_ctrl, row_mask, bank_mask, bound_ctrl); return; @@ -342,7 +344,7 @@ void emit_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1 if (vop3) { bld.vop3(opcode, dst, src0, src1); } else if (opcode == aco_opcode::v_add_co_u32) { - bld.vop2(opcode, dst, bld.def(s2, vcc), src0, src1); + bld.vop2(opcode, dst, bld.def(bld.lm, vcc), src0, src1); } else { bld.vop2(opcode, dst, src0, src1); } @@ -420,7 +422,7 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig Operand vcndmask_identity[2] = {identity[0], identity[1]}; /* First, copy the source to tmp and set inactive lanes to the identity */ - bld.sop1(aco_opcode::s_or_saveexec_b64, Definition(stmp, s2), Definition(scc, s1), Definition(exec, s2), Operand(UINT64_MAX), Operand(exec, s2)); + bld.sop1(Builder::s_or_saveexec, Definition(stmp, bld.lm), Definition(scc, s1), Definition(exec, bld.lm), Operand(UINT64_MAX), Operand(exec, bld.lm)); for (unsigned i = 0; i < src.size(); i++) { /* p_exclusive_scan needs it to be a sgpr or inline constant for the v_writelane_b32 @@ -440,7 +442,7 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig for (unsigned i = 0; i < src.size(); i++) { bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg{tmp + i}, v1), vcndmask_identity[i], Operand(PhysReg{src.physReg() + i}, v1), - Operand(stmp, s2)); + Operand(stmp, bld.lm)); } bool exec_restored = false; @@ -463,7 +465,7 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig if (cluster_size == 32) { for (unsigned i = 0; i < src.size(); i++) bld.ds(aco_opcode::ds_swizzle_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{tmp+i}, s1), ds_pattern_bitmode(0x1f, 0, 0x10)); - bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(stmp, s2)); + bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(stmp, bld.lm)); exec_restored = true; emit_op(ctx, dst.physReg(), vtmp, tmp, PhysReg{0}, reduce_op, src.size()); dst_written = true; @@ -500,7 +502,7 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig Operand(0xffffffffu), Operand(0xffffffffu)).instr; static_cast(perm)->opsel[0] = true; /* FI (Fetch Inactive) */ } - bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(UINT64_MAX)); + bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(UINT64_MAX)); /* fill in the gap in row 2 */ for (unsigned i = 0; 
i < src.size(); i++) { @@ -559,7 +561,7 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig } if (!exec_restored) - bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(stmp, s2)); + bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(stmp, bld.lm)); if (op == aco_opcode::p_reduce && cluster_size == 64) { for (unsigned k = 0; k < src.size(); k++) { diff --git a/src/amd/compiler/aco_reduce_assign.cpp b/src/amd/compiler/aco_reduce_assign.cpp index 28a779580a2..68a0dc15761 100644 --- a/src/amd/compiler/aco_reduce_assign.cpp +++ b/src/amd/compiler/aco_reduce_assign.cpp @@ -172,7 +172,7 @@ void setup_reduce_temp(Program* program) clobber_vcc = true; if (clobber_vcc) - instr->definitions[4] = Definition(vcc, s2); + instr->definitions[4] = Definition(vcc, bld.lm); } } } diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp index c4144cc42f0..504ad015746 100644 --- a/src/amd/compiler/aco_register_allocation.cpp +++ b/src/amd/compiler/aco_register_allocation.cpp @@ -1719,6 +1719,7 @@ void register_allocation(Program *program, std::vector> live_out_ pc->operands[i] = parallelcopy[i].first; pc->definitions[i] = parallelcopy[i].second; + assert(pc->operands[i].size() == pc->definitions[i].size()); /* it might happen that the operand is already renamed. we have to restore the original name. */ std::map::iterator it = ctx.orig_names.find(pc->operands[i].tempId()); diff --git a/src/amd/compiler/aco_ssa_elimination.cpp b/src/amd/compiler/aco_ssa_elimination.cpp index 3d76dcd8867..54e691ba476 100644 --- a/src/amd/compiler/aco_ssa_elimination.cpp +++ b/src/amd/compiler/aco_ssa_elimination.cpp @@ -58,6 +58,7 @@ void collect_phi_info(ssa_elimination_ctx& ctx) std::vector& preds = phi->opcode == aco_opcode::p_phi ? block.logical_preds : block.linear_preds; phi_info& info = phi->opcode == aco_opcode::p_phi ? ctx.logical_phi_info : ctx.linear_phi_info; const auto result = info.emplace(preds[i], std::vector>()); + assert(phi->definitions[0].size() == phi->operands[i].size()); result.first->second.emplace_back(phi->definitions[0], phi->operands[i]); ctx.empty_blocks[preds[i]] = false; } diff --git a/src/amd/compiler/aco_validate.cpp b/src/amd/compiler/aco_validate.cpp index 8d2bf8449db..8282d7e27e3 100644 --- a/src/amd/compiler/aco_validate.cpp +++ b/src/amd/compiler/aco_validate.cpp @@ -190,7 +190,7 @@ void validate(Program* program, FILE * output) } } else if (instr->opcode == aco_opcode::p_phi) { check(instr->operands.size() == block.logical_preds.size(), "Number of Operands does not match number of predecessors", instr.get()); - check(instr->definitions[0].getTemp().type() == RegType::vgpr || instr->definitions[0].getTemp().regClass() == s2, "Logical Phi Definition must be vgpr or divergent boolean", instr.get()); + check(instr->definitions[0].getTemp().type() == RegType::vgpr || instr->definitions[0].getTemp().regClass() == program->lane_mask, "Logical Phi Definition must be vgpr or divergent boolean", instr.get()); } else if (instr->opcode == aco_opcode::p_linear_phi) { for (const Operand& op : instr->operands) check(!op.isTemp() || op.getTemp().is_linear(), "Wrong Operand type", instr.get()); -- 2.30.2