From: Timur Kristóf
Date: Wed, 27 Nov 2019 10:04:47 +0000 (+0100)
Subject: aco/wave32: Use lane mask regclass for exec/vcc.
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=e0bcefc3a0a15a8c7ec00cfa53fd8fffcc07342a;p=mesa.git

aco/wave32: Use lane mask regclass for exec/vcc.

Currently all usages of exec and vcc are hardcoded to use s2 regclass.
This commit makes it possible to use s1 in wave32 mode and s2 in wave64 mode.

Signed-off-by: Timur Kristóf
Reviewed-by: Daniel Schürmann
---

diff --git a/src/amd/compiler/aco_builder_h.py b/src/amd/compiler/aco_builder_h.py
index e70d9317b3f..ada0806f6a9 100644
--- a/src/amd/compiler/aco_builder_h.py
+++ b/src/amd/compiler/aco_builder_h.py
@@ -135,13 +135,14 @@ public:
    Program *program;
    bool use_iterator;
    bool start; // only when use_iterator == false
+   RegClass lm;

    std::vector> *instructions;
    std::vector>::iterator it;

-   Builder(Program *pgm) : program(pgm), use_iterator(false), start(false), instructions(NULL) {}
-   Builder(Program *pgm, Block *block) : program(pgm), use_iterator(false), start(false), instructions(&block->instructions) {}
-   Builder(Program *pgm, std::vector> *instrs) : program(pgm), use_iterator(false), start(false), instructions(instrs) {}
+   Builder(Program *pgm) : program(pgm), use_iterator(false), start(false), lm(pgm->lane_mask), instructions(NULL) {}
+   Builder(Program *pgm, Block *block) : program(pgm), use_iterator(false), start(false), lm(pgm ? pgm->lane_mask : s2), instructions(&block->instructions) {}
+   Builder(Program *pgm, std::vector> *instrs) : program(pgm), use_iterator(false), start(false), lm(pgm ? pgm->lane_mask : s2), instructions(instrs) {}

    void moveEnd(Block *block) {
       instructions = &block->instructions;
@@ -265,17 +266,26 @@ public:

 % for fixed in ['m0', 'vcc', 'exec', 'scc']:
    Operand ${fixed}(Temp tmp) {
+      % if fixed == 'vcc' or fixed == 'exec':
+      assert(tmp.regClass() == lm);
+      % endif
       Operand op(tmp);
       op.setFixed(aco::${fixed});
       return op;
    }

    Definition ${fixed}(Definition def) {
+      % if fixed == 'vcc' or fixed == 'exec':
+      assert(def.regClass() == lm);
+      % endif
       def.setFixed(aco::${fixed});
       return def;
    }

    Definition hint_${fixed}(Definition def) {
+      % if fixed == 'vcc' or fixed == 'exec':
+      assert(def.regClass() == lm);
+      % endif
       def.setHint(aco::${fixed});
       return def;
    }
@@ -350,11 +360,11 @@ public:
       assert((post_ra || b.op.hasRegClass()) && b.op.regClass().type() == RegType::vgpr);

       if (!carry_in.op.isUndefined())
-         return vop2(aco_opcode::v_addc_co_u32, Definition(dst), hint_vcc(def(s2)), a, b, carry_in);
+         return vop2(aco_opcode::v_addc_co_u32, Definition(dst), hint_vcc(def(lm)), a, b, carry_in);
       else if (program->chip_class >= GFX10 && carry_out)
          return vop3(aco_opcode::v_add_co_u32_e64, Definition(dst), def(s2), a, b);
       else if (program->chip_class < GFX9 || carry_out)
-         return vop2(aco_opcode::v_add_co_u32, Definition(dst), hint_vcc(def(s2)), a, b);
+         return vop2(aco_opcode::v_add_co_u32, Definition(dst), hint_vcc(def(lm)), a, b);
       else
          return vop2(aco_opcode::v_add_u32, Definition(dst), a, b);
    }
@@ -407,6 +417,7 @@ public:
       }
       return insert(std::move(sub));
    }
+
 <%
 import itertools
 formats = [("pseudo", [Format.PSEUDO], 'Pseudo_instruction', list(itertools.product(range(5), range(5))) + [(8, 1), (1, 8)]),

diff --git a/src/amd/compiler/aco_insert_exec_mask.cpp b/src/amd/compiler/aco_insert_exec_mask.cpp
index 31ae5ca658c..cbc0698096b 100644
--- a/src/amd/compiler/aco_insert_exec_mask.cpp
+++ b/src/amd/compiler/aco_insert_exec_mask.cpp
@@ -302,14 +302,15 @@ void transition_to_WQM(exec_ctx& ctx, Builder
bld, unsigned idx) return; if (ctx.info[idx].exec.back().second & mask_type_global) { Temp exec_mask = ctx.info[idx].exec.back().first; - exec_mask = bld.sop1(aco_opcode::s_wqm_b64, bld.def(s2, exec), bld.def(s1, scc), exec_mask); + exec_mask = bld.sop1(Builder::s_wqm, bld.def(bld.lm, exec), bld.def(s1, scc), exec_mask); ctx.info[idx].exec.emplace_back(exec_mask, mask_type_global | mask_type_wqm); return; } /* otherwise, the WQM mask should be one below the current mask */ ctx.info[idx].exec.pop_back(); assert(ctx.info[idx].exec.back().second & mask_type_wqm); - ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(s2, exec), + assert(ctx.info[idx].exec.back().first.size() == bld.lm.size()); + ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec), ctx.info[idx].exec.back().first); } @@ -324,14 +325,15 @@ void transition_to_Exact(exec_ctx& ctx, Builder bld, unsigned idx) !(ctx.info[idx].exec.back().second & mask_type_loop)) { ctx.info[idx].exec.pop_back(); assert(ctx.info[idx].exec.back().second & mask_type_exact); - ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(s2, exec), + assert(ctx.info[idx].exec.back().first.size() == bld.lm.size()); + ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec), ctx.info[idx].exec.back().first); return; } /* otherwise, we create an exact mask and push to the stack */ Temp wqm = ctx.info[idx].exec.back().first; - Temp exact = bld.tmp(s2); - wqm = bld.sop1(aco_opcode::s_and_saveexec_b64, bld.def(s2), bld.def(s1, scc), + Temp exact = bld.tmp(bld.lm); + wqm = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc), bld.exec(Definition(exact)), ctx.info[idx].exec[0].first, bld.exec(wqm)); ctx.info[idx].exec.back().first = wqm; ctx.info[idx].exec.emplace_back(exact, mask_type_exact); @@ -359,7 +361,7 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block, } else { uint8_t mask = mask_type_global; if (ctx.program->needs_wqm) { - exec_mask = bld.sop1(aco_opcode::s_wqm_b64, bld.def(s2, exec), bld.def(s1, scc), bld.exec(exec_mask)); + exec_mask = bld.sop1(Builder::s_wqm, bld.def(bld.lm, exec), bld.def(s1, scc), bld.exec(exec_mask)); mask |= mask_type_wqm; } else { mask |= mask_type_exact; @@ -383,7 +385,7 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block, aco_ptr phi; for (int i = 0; i < info.num_exec_masks - 1; i++) { phi.reset(create_instruction(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)); - phi->definitions[0] = bld.def(s2); + phi->definitions[0] = bld.def(bld.lm); phi->operands[0] = Operand(ctx.info[preds[0]].exec[i].first); ctx.info[idx].exec[i].first = bld.insert(std::move(phi)); } @@ -393,7 +395,7 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block, if (info.has_divergent_break) { /* this phi might be trivial but ensures a parallelcopy on the loop header */ aco_ptr phi{create_instruction(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)}; - phi->definitions[0] = bld.def(s2); + phi->definitions[0] = bld.def(bld.lm); phi->operands[0] = Operand(ctx.info[preds[0]].exec[info.num_exec_masks - 1].first); ctx.info[idx].exec.back().first = bld.insert(std::move(phi)); } @@ -401,9 +403,9 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block, /* create ssa name for loop active mask */ aco_ptr phi{create_instruction(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)}; if (info.has_divergent_continue) - phi->definitions[0] = bld.def(s2); + phi->definitions[0] = 
bld.def(bld.lm); else - phi->definitions[0] = bld.def(s2, exec); + phi->definitions[0] = bld.def(bld.lm, exec); phi->operands[0] = Operand(ctx.info[preds[0]].exec.back().first); Temp loop_active = bld.insert(std::move(phi)); @@ -423,7 +425,8 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block, i++; } uint8_t mask_type = ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact); - ctx.info[idx].exec.emplace_back(bld.pseudo(aco_opcode::p_parallelcopy, bld.def(s2, exec), + assert(ctx.info[idx].exec.back().first.size() == bld.lm.size()); + ctx.info[idx].exec.emplace_back(bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec), ctx.info[idx].exec.back().first), mask_type); } @@ -480,7 +483,7 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block, } else { /* create phi for loop footer */ aco_ptr phi{create_instruction(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)}; - phi->definitions[0] = bld.def(s2); + phi->definitions[0] = bld.def(bld.lm); for (unsigned i = 0; i < phi->operands.size(); i++) phi->operands[i] = Operand(ctx.info[preds[i]].exec[k].first); ctx.info[idx].exec.emplace_back(bld.insert(std::move(phi)), type); @@ -510,7 +513,8 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block, transition_to_Exact(ctx, bld, idx); } - ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(s2, exec), + assert(ctx.info[idx].exec.back().first.size() == bld.lm.size()); + ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec), ctx.info[idx].exec.back().first); ctx.loop.pop_back(); @@ -536,7 +540,7 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block, continue; } - Temp phi = bld.pseudo(aco_opcode::p_linear_phi, in_exec ? bld.def(s2, exec) : bld.def(s2), + Temp phi = bld.pseudo(aco_opcode::p_linear_phi, in_exec ? 
bld.def(bld.lm, exec) : bld.def(bld.lm), ctx.info[preds[0]].exec[i].first, ctx.info[preds[1]].exec[i].first); uint8_t mask_type = ctx.info[preds[0]].exec[i].second & ctx.info[preds[1]].exec[i].second; @@ -578,7 +582,8 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block, if (block->kind & block_kind_merge) { Temp restore = ctx.info[idx].exec.back().first; - ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(s2, exec), restore); + assert(restore.size() == bld.lm.size()); + ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec), restore); } return i; @@ -589,7 +594,7 @@ void lower_fs_buffer_store_smem(Builder& bld, bool need_check, aco_ptroperands[1]; if (need_check) { /* if exec is zero, then use UINT32_MAX as an offset and make this store a no-op */ - Temp nonempty = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), cur_exec, Operand(0u)); + Temp nonempty = bld.sopc(Builder::s_cmp_lg, bld.def(s1, scc), cur_exec, Operand(0u)); if (offset.isLiteral()) offset = bld.sop1(aco_opcode::s_mov_b32, bld.def(s1), offset); @@ -665,7 +670,7 @@ void process_instructions(exec_ctx& ctx, Block* block, assert(num); Operand cond = instr->operands[0]; for (int i = num - 1; i >= 0; i--) { - Instruction *andn2 = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), + Instruction *andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), ctx.info[block->index].exec[i].first, cond); if (i == num - 1) { andn2->operands[0].setFixed(exec); @@ -689,8 +694,9 @@ void process_instructions(exec_ctx& ctx, Block* block, if (instr->opcode == aco_opcode::p_is_helper || instr->opcode == aco_opcode::p_load_helper) { Definition dst = instr->definitions[0]; + assert(dst.size() == bld.lm.size()); if (state == Exact) { - instr.reset(create_instruction(aco_opcode::s_mov_b64, Format::SOP1, 1, 1)); + instr.reset(create_instruction(bld.w64or32(Builder::s_mov), Format::SOP1, 1, 1)); instr->operands[0] = Operand(0u); instr->definitions[0] = dst; } else { @@ -710,7 +716,7 @@ void process_instructions(exec_ctx& ctx, Block* block, assert(instr->opcode == aco_opcode::p_is_helper || exact_mask.second & mask_type_initial); assert(exact_mask.second & mask_type_exact); - instr.reset(create_instruction(aco_opcode::s_andn2_b64, Format::SOP2, 2, 2)); + instr.reset(create_instruction(bld.w64or32(Builder::s_andn2), Format::SOP2, 2, 2)); instr->operands[0] = Operand(ctx.info[block->index].exec.back().first); /* current exec */ instr->operands[1] = Operand(exact_mask.first); instr->definitions[0] = dst; @@ -726,8 +732,8 @@ void process_instructions(exec_ctx& ctx, Block* block, if (instr->operands.empty()) { /* transition to exact and set exec to zero */ Temp old_exec = ctx.info[block->index].exec.back().first; - Temp new_exec = bld.tmp(s2); - cond = bld.sop1(aco_opcode::s_and_saveexec_b64, bld.def(s2), bld.def(s1, scc), + Temp new_exec = bld.tmp(bld.lm); + cond = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc), bld.exec(Definition(new_exec)), Operand(0u), bld.exec(old_exec)); if (ctx.info[block->index].exec.back().second & mask_type_exact) { ctx.info[block->index].exec.back().first = new_exec; @@ -746,7 +752,7 @@ void process_instructions(exec_ctx& ctx, Block* block, num += ctx.info[block->index].exec.size() - 1; for (int i = num - 1; i >= 0; i--) { if (ctx.info[block->index].exec[i].second & mask_type_exact) { - Instruction *andn2 = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), + Instruction *andn2 = 
bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), ctx.info[block->index].exec[i].first, cond); if (i == num - 1) { andn2->operands[0].setFixed(exec); @@ -878,13 +884,13 @@ void add_branch_code(exec_ctx& ctx, Block* block) } Temp old_exec = ctx.info[idx].exec.back().first; - Temp new_exec = bld.tmp(s2); - Temp cond = bld.sop1(aco_opcode::s_and_saveexec_b64, bld.def(s2), bld.def(s1, scc), + Temp new_exec = bld.tmp(bld.lm); + Temp cond = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc), bld.exec(Definition(new_exec)), Operand(0u), bld.exec(old_exec)); ctx.info[idx].exec.back().first = new_exec; for (int i = num - 1; i >= 0; i--) { - Instruction *andn2 = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), + Instruction *andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), ctx.info[block->index].exec[i].first, cond); if (i == (int)ctx.info[idx].exec.size() - 1) andn2->definitions[0].setFixed(exec); @@ -912,7 +918,7 @@ void add_branch_code(exec_ctx& ctx, Block* block) Temp cond = Temp(); for (int exec_idx = ctx.info[idx].exec.size() - 1; exec_idx >= 0; exec_idx--) { if (ctx.info[idx].exec[exec_idx].second & mask_type_loop) { - cond = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), ctx.info[idx].exec[exec_idx].first, Operand(0u)); + cond = bld.sopc(Builder::s_cmp_lg, bld.def(s1, scc), ctx.info[idx].exec[exec_idx].first, Operand(0u)); break; } } @@ -957,8 +963,8 @@ void add_branch_code(exec_ctx& ctx, Block* block) Temp current_exec = ctx.info[idx].exec.back().first; uint8_t mask_type = ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact); - Temp then_mask = bld.tmp(s2); - Temp old_exec = bld.sop1(aco_opcode::s_and_saveexec_b64, bld.def(s2), bld.def(s1, scc), + Temp then_mask = bld.tmp(bld.lm); + Temp old_exec = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc), bld.exec(Definition(then_mask)), cond, bld.exec(current_exec)); ctx.info[idx].exec.back().first = old_exec; @@ -978,7 +984,7 @@ void add_branch_code(exec_ctx& ctx, Block* block) uint8_t mask_type = ctx.info[idx].exec.back().second; ctx.info[idx].exec.pop_back(); Temp orig_exec = ctx.info[idx].exec.back().first; - Temp else_mask = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2, exec), + Temp else_mask = bld.sop2(Builder::s_andn2, bld.def(bld.lm, exec), bld.def(s1, scc), orig_exec, bld.exec(then_mask)); /* add next current exec to the stack */ @@ -998,7 +1004,7 @@ void add_branch_code(exec_ctx& ctx, Block* block) for (int exec_idx = ctx.info[idx].exec.size() - 2; exec_idx >= 0; exec_idx--) { cond = bld.tmp(s1); Temp exec_mask = ctx.info[idx].exec[exec_idx].first; - exec_mask = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.scc(Definition(cond)), + exec_mask = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.scc(Definition(cond)), exec_mask, current_exec); ctx.info[idx].exec[exec_idx].first = exec_mask; if (ctx.info[idx].exec[exec_idx].second & mask_type_loop) @@ -1010,7 +1016,7 @@ void add_branch_code(exec_ctx& ctx, Block* block) unsigned succ_idx = ctx.program->blocks[block->linear_succs[1]].linear_succs[0]; Block& succ = ctx.program->blocks[succ_idx]; if (!(succ.kind & block_kind_invert || succ.kind & block_kind_merge)) { - ctx.info[idx].exec.back().first = bld.sop1(aco_opcode::s_mov_b64, bld.def(s2, exec), Operand(0u)); + ctx.info[idx].exec.back().first = bld.sop1(Builder::s_mov, bld.def(bld.lm, exec), Operand(0u)); } bld.branch(aco_opcode::p_cbranch_nz, bld.scc(cond), block->linear_succs[1], block->linear_succs[0]); @@ -1028,7 +1034,7 @@ 
void add_branch_code(exec_ctx& ctx, Block* block) break; cond = bld.tmp(s1); Temp exec_mask = ctx.info[idx].exec[exec_idx].first; - exec_mask = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.scc(Definition(cond)), + exec_mask = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.scc(Definition(cond)), exec_mask, bld.exec(current_exec)); ctx.info[idx].exec[exec_idx].first = exec_mask; } @@ -1039,7 +1045,7 @@ void add_branch_code(exec_ctx& ctx, Block* block) unsigned succ_idx = ctx.program->blocks[block->linear_succs[1]].linear_succs[0]; Block& succ = ctx.program->blocks[succ_idx]; if (!(succ.kind & block_kind_invert || succ.kind & block_kind_merge)) { - ctx.info[idx].exec.back().first = bld.sop1(aco_opcode::s_mov_b64, bld.def(s2, exec), Operand(0u)); + ctx.info[idx].exec.back().first = bld.sop1(Builder::s_mov, bld.def(bld.lm, exec), Operand(0u)); } bld.branch(aco_opcode::p_cbranch_nz, bld.scc(cond), block->linear_succs[1], block->linear_succs[0]); diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index a2b2c21170c..9de9d5dec14 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -351,12 +351,12 @@ Temp bool_to_vector_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s2 { Builder bld(ctx->program, ctx->block); if (!dst.id()) - dst = bld.tmp(s2); + dst = bld.tmp(bld.lm); assert(val.regClass() == s1); - assert(dst.regClass() == s2); + assert(dst.regClass() == bld.lm); - return bld.sop2(aco_opcode::s_cselect_b64, bld.hint_vcc(Definition(dst)), Operand((uint32_t) -1), Operand(0u), bld.scc(val)); + return bld.sop2(Builder::s_cselect, bld.hint_vcc(Definition(dst)), Operand((uint32_t) -1), Operand(0u), bld.scc(val)); } Temp bool_to_scalar_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s1)) @@ -365,12 +365,12 @@ Temp bool_to_scalar_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s1 if (!dst.id()) dst = bld.tmp(s1); - assert(val.regClass() == s2); + assert(val.regClass() == bld.lm); assert(dst.regClass() == s1); /* if we're currently in WQM mode, ensure that the source is also computed in WQM */ Temp tmp = bld.tmp(s1); - bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.scc(Definition(tmp)), val, Operand(exec, s2)); + bld.sop2(Builder::s_and, bld.def(bld.lm), bld.scc(Definition(tmp)), val, Operand(exec, bld.lm)); return emit_wqm(ctx, tmp, dst); } @@ -489,6 +489,8 @@ void emit_vopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode o { Temp src0 = get_alu_src(ctx, instr->src[0]); Temp src1 = get_alu_src(ctx, instr->src[1]); + assert(src0.size() == src1.size()); + aco_ptr vopc; if (src1.type() == RegType::sgpr) { if (src0.type() == RegType::vgpr) { @@ -549,12 +551,13 @@ void emit_sopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode o { Temp src0 = get_alu_src(ctx, instr->src[0]); Temp src1 = get_alu_src(ctx, instr->src[1]); + Builder bld(ctx->program, ctx->block); - assert(dst.regClass() == s2); + assert(dst.regClass() == bld.lm); assert(src0.type() == RegType::sgpr); assert(src1.type() == RegType::sgpr); + assert(src0.regClass() == src1.regClass()); - Builder bld(ctx->program, ctx->block); /* Emit the SALU comparison instruction */ Temp cmp = bld.sopc(op, bld.scc(bld.def(s1)), src0, src1); /* Turn the result into a per-lane bool */ @@ -580,17 +583,17 @@ void emit_comparison(isel_context *ctx, nir_alu_instr *instr, Temp dst, emit_sopc_instruction(ctx, instr, op, dst); } -void emit_boolean_logic(isel_context *ctx, nir_alu_instr 
*instr, aco_opcode op64, Temp dst) +void emit_boolean_logic(isel_context *ctx, nir_alu_instr *instr, Builder::WaveSpecificOpcode op, Temp dst) { Builder bld(ctx->program, ctx->block); Temp src0 = get_alu_src(ctx, instr->src[0]); Temp src1 = get_alu_src(ctx, instr->src[1]); - assert(dst.regClass() == s2); - assert(src0.regClass() == s2); - assert(src1.regClass() == s2); + assert(dst.regClass() == bld.lm); + assert(src0.regClass() == bld.lm); + assert(src1.regClass() == bld.lm); - bld.sop2(op64, Definition(dst), bld.def(s1, scc), src0, src1); + bld.sop2(op, Definition(dst), bld.def(s1, scc), src0, src1); } void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst) @@ -600,7 +603,7 @@ void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst) Temp then = get_alu_src(ctx, instr->src[1]); Temp els = get_alu_src(ctx, instr->src[2]); - assert(cond.regClass() == s2); + assert(cond.regClass() == bld.lm); if (dst.type() == RegType::vgpr) { aco_ptr bcsel; @@ -628,14 +631,15 @@ void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst) } if (instr->dest.dest.ssa.bit_size == 1) { - assert(dst.regClass() == s2); - assert(then.regClass() == s2); - assert(els.regClass() == s2); + assert(dst.regClass() == bld.lm); + assert(then.regClass() == bld.lm); + assert(els.regClass() == bld.lm); } if (!ctx->divergent_vals[instr->src[0].src.ssa->index]) { /* uniform condition and values in sgpr */ if (dst.regClass() == s1 || dst.regClass() == s2) { assert((then.regClass() == s1 || then.regClass() == s2) && els.regClass() == then.regClass()); + assert(dst.size() == then.size()); aco_opcode op = dst.regClass() == s1 ? aco_opcode::s_cselect_b32 : aco_opcode::s_cselect_b64; bld.sop2(op, Definition(dst), then, els, bld.scc(bool_to_scalar_condition(ctx, cond))); } else { @@ -652,20 +656,20 @@ void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst) assert(instr->dest.dest.ssa.bit_size == 1); if (cond.id() != then.id()) - then = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), cond, then); + then = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), cond, then); if (cond.id() == els.id()) - bld.sop1(aco_opcode::s_mov_b64, Definition(dst), then); + bld.sop1(Builder::s_mov, Definition(dst), then); else - bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), then, - bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), els, cond)); + bld.sop2(Builder::s_or, Definition(dst), bld.def(s1, scc), then, + bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), els, cond)); } void emit_scaled_op(isel_context *ctx, Builder& bld, Definition dst, Temp val, aco_opcode op, uint32_t undo) { /* multiply by 16777216 to handle denormals */ - Temp is_denormal = bld.vopc(aco_opcode::v_cmp_class_f32, bld.hint_vcc(bld.def(s2)), + Temp is_denormal = bld.vopc(aco_opcode::v_cmp_class_f32, bld.hint_vcc(bld.def(bld.lm)), as_vgpr(ctx, val), bld.copy(bld.def(v1), Operand((1u << 7) | (1u << 4)))); Temp scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x4b800000u), val); scaled = bld.vop1(op, bld.def(v1), scaled); @@ -766,9 +770,9 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_inot: { Temp src = get_alu_src(ctx, instr->src[0]); if (instr->dest.dest.ssa.bit_size == 1) { - assert(src.regClass() == s2); - assert(dst.regClass() == s2); - bld.sop2(aco_opcode::s_andn2_b64, Definition(dst), bld.def(s1, scc), Operand(exec, s2), src); + assert(src.regClass() == bld.lm); + assert(dst.regClass() == bld.lm); + bld.sop2(Builder::s_andn2, 
Definition(dst), bld.def(s1, scc), Operand(exec, bld.lm), src); } else if (dst.regClass() == v1) { emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst); } else if (dst.type() == RegType::sgpr) { @@ -835,12 +839,12 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), neg, neqz); } else if (dst.regClass() == v1) { Temp tmp = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src); - Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(s2)), Operand(0u), src); + Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src); bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(1u), tmp, gtz); } else if (dst.regClass() == v2) { Temp upper = emit_extract_vector(ctx, src, 1, v1); Temp neg = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), upper); - Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i64, bld.hint_vcc(bld.def(s2)), Operand(0u), src); + Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src); Temp lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(1u), neg, gtz); upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), neg, gtz); bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); @@ -901,7 +905,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } case nir_op_ior: { if (instr->dest.dest.ssa.bit_size == 1) { - emit_boolean_logic(ctx, instr, aco_opcode::s_or_b64, dst); + emit_boolean_logic(ctx, instr, Builder::s_or, dst); } else if (dst.regClass() == v1) { emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true); } else if (dst.regClass() == s1) { @@ -917,7 +921,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } case nir_op_iand: { if (instr->dest.dest.ssa.bit_size == 1) { - emit_boolean_logic(ctx, instr, aco_opcode::s_and_b64, dst); + emit_boolean_logic(ctx, instr, Builder::s_and, dst); } else if (dst.regClass() == v1) { emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true); } else if (dst.regClass() == s1) { @@ -933,7 +937,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } case nir_op_ixor: { if (instr->dest.dest.ssa.bit_size == 1) { - emit_boolean_logic(ctx, instr, aco_opcode::s_xor_b64, dst); + emit_boolean_logic(ctx, instr, Builder::s_xor, dst); } else if (dst.regClass() == v1) { emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, dst, true); } else if (dst.regClass() == s1) { @@ -1709,16 +1713,16 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_fsign: { Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0])); if (dst.size() == 1) { - Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(s2)), Operand(0u), src); + Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src); src = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0x3f800000u), src, cond); - cond = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(s2)), Operand(0u), src); + cond = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src); bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0xbf800000u), src, cond); } else if (dst.size() == 2) { - Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.hint_vcc(bld.def(s2)), Operand(0u), src); + Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src); Temp tmp = 
bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0x3FF00000u)); Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, emit_extract_vector(ctx, src, 1, v1), cond); - cond = bld.vopc(aco_opcode::v_cmp_le_f64, bld.hint_vcc(bld.def(s2)), Operand(0u), src); + cond = bld.vopc(aco_opcode::v_cmp_le_f64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src); tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0xBFF00000u)); upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, upper, cond); @@ -1922,7 +1926,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp src = get_alu_src(ctx, instr->src[0]); if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::vgpr) { Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src); - Temp exponent_in_range = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(s2)), Operand(64u), exponent); + Temp exponent_in_range = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(bld.lm)), Operand(64u), exponent); exponent = bld.vop2(aco_opcode::v_max_i32, bld.def(v1), Operand(0x0u), exponent); Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffu), src); mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(0x800000u), mantissa); @@ -1986,7 +1990,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } case nir_op_b2f32: { Temp src = get_alu_src(ctx, instr->src[0]); - assert(src.regClass() == s2); + assert(src.regClass() == bld.lm); if (dst.regClass() == s1) { src = bool_to_scalar_condition(ctx, src); @@ -2000,7 +2004,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } case nir_op_b2f64: { Temp src = get_alu_src(ctx, instr->src[0]); - assert(src.regClass() == s2); + assert(src.regClass() == bld.lm); if (dst.regClass() == s2) { src = bool_to_scalar_condition(ctx, src); @@ -2073,7 +2077,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } case nir_op_b2i32: { Temp src = get_alu_src(ctx, instr->src[0]); - assert(src.regClass() == s2); + assert(src.regClass() == bld.lm); if (dst.regClass() == s1) { // TODO: in a post-RA optimization, we can check if src is in VCC, and directly use VCCNZ @@ -2087,7 +2091,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } case nir_op_i2b1: { Temp src = get_alu_src(ctx, instr->src[0]); - assert(dst.regClass() == s2); + assert(dst.regClass() == bld.lm); if (src.type() == RegType::vgpr) { assert(src.regClass() == v1 || src.regClass() == v2); @@ -2164,7 +2168,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp mask = bld.copy(bld.def(s1), Operand(0x36Fu)); /* value is NOT negative/positive denormal value */ - Temp cmp_res = bld.tmp(s2); + Temp cmp_res = bld.tmp(bld.lm); bld.vopc_e64(aco_opcode::v_cmp_class_f16, Definition(cmp_res), f16, mask).def(0).setHint(vcc); Temp f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16); @@ -2338,14 +2342,14 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } case nir_op_ieq: { if (instr->src[0].src.ssa->bit_size == 1) - emit_boolean_logic(ctx, instr, aco_opcode::s_xnor_b64, dst); + emit_boolean_logic(ctx, instr, Builder::s_xnor, dst); else emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_i32, aco_opcode::v_cmp_eq_i64, aco_opcode::s_cmp_eq_i32, aco_opcode::s_cmp_eq_u64); break; } case nir_op_ine: { if (instr->src[0].src.ssa->bit_size == 1) - emit_boolean_logic(ctx, instr, aco_opcode::s_xor_b64, dst); + emit_boolean_logic(ctx, instr, Builder::s_xor, dst); else emit_comparison(ctx, instr, 
dst, aco_opcode::v_cmp_lg_i32, aco_opcode::v_cmp_lg_i64, aco_opcode::s_cmp_lg_i32, aco_opcode::s_cmp_lg_u64); break; @@ -2405,8 +2409,10 @@ void visit_load_const(isel_context *ctx, nir_load_const_instr *instr) Builder bld(ctx->program, ctx->block); if (instr->def.bit_size == 1) { - assert(dst.regClass() == s2); - bld.sop1(aco_opcode::s_mov_b64, Definition(dst), Operand((uint64_t)(instr->value[0].b ? -1 : 0))); + assert(dst.regClass() == bld.lm); + int val = instr->value[0].b ? -1 : 0; + Operand op = bld.lm.size() == 1 ? Operand((uint32_t) val) : Operand((uint64_t) val); + bld.sop1(Builder::s_mov, Definition(dst), op); } else if (dst.size() == 1) { bld.copy(Definition(dst), Operand(instr->value[0].u32)); } else { @@ -3033,7 +3039,7 @@ Temp adjust_vertex_fetch_alpha(isel_context *ctx, unsigned adjustment, Temp alph /* Convert back to the right type. */ if (adjustment == RADV_ALPHA_ADJUST_SNORM) { alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha); - Temp clamp = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(s2)), Operand(0xbf800000u), alpha); + Temp clamp = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0xbf800000u), alpha); alpha = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xbf800000u), alpha, clamp); } else if (adjustment == RADV_ALPHA_ADJUST_SSCALED) { alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha); @@ -3599,8 +3605,8 @@ void visit_discard_if(isel_context *ctx, nir_intrinsic_instr *instr) // TODO: optimize uniform conditions Builder bld(ctx->program, ctx->block); Temp src = get_ssa_temp(ctx, instr->src[0].ssa); - assert(src.regClass() == s2); - src = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2)); + assert(src.regClass() == bld.lm); + src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)); bld.pseudo(aco_opcode::p_discard_if, src); ctx->block->kind |= block_kind_uses_discard_if; return; @@ -3663,7 +3669,7 @@ void visit_discard(isel_context* ctx, nir_intrinsic_instr *instr) ctx->program->needs_exact = true; /* save exec somewhere temporarily so that it doesn't get * overwritten before the discard from outer exec masks */ - Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), Operand(0xFFFFFFFF), Operand(exec, s2)); + Temp cond = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), Operand(0xFFFFFFFF), Operand(exec, bld.lm)); bld.pseudo(aco_opcode::p_discard_if, cond); ctx->block->kind |= block_kind_uses_discard_if; return; @@ -3950,7 +3956,7 @@ static Temp adjust_sample_index_using_fmask(isel_context *ctx, bool da, Temp coo /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK * resource descriptor is 0 (invalid), */ - Temp compare = bld.tmp(s2); + Temp compare = bld.tmp(bld.lm); bld.vopc_e64(aco_opcode::v_cmp_lg_u32, Definition(compare), Operand(0u), emit_extract_vector(ctx, fmask_desc_ptr, 1, s1)).def(0).setHint(vcc); @@ -4739,12 +4745,12 @@ void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr) if (offset > 0 && ctx->options->chip_class < GFX9) { Temp addr0 = bld.tmp(v1), addr1 = bld.tmp(v1); Temp new_addr0 = bld.tmp(v1), new_addr1 = bld.tmp(v1); - Temp carry = bld.tmp(s2); + Temp carry = bld.tmp(bld.lm); bld.pseudo(aco_opcode::p_split_vector, Definition(addr0), Definition(addr1), addr); bld.vop2(aco_opcode::v_add_co_u32, Definition(new_addr0), bld.hint_vcc(Definition(carry)), Operand(offset), addr0); - bld.vop2(aco_opcode::v_addc_co_u32, Definition(new_addr1), 
bld.def(s2), + bld.vop2(aco_opcode::v_addc_co_u32, Definition(new_addr1), bld.def(bld.lm), Operand(0u), addr1, carry).def(1).setHint(vcc); @@ -5219,25 +5225,25 @@ Temp emit_boolean_reduce(isel_context *ctx, nir_op op, unsigned cluster_size, Te return src; } if (op == nir_op_iand && cluster_size == 4) { //subgroupClusteredAnd(val, 4) -> ~wqm(exec & ~val) - Temp tmp = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), Operand(exec, s2), src); - return bld.sop1(aco_opcode::s_not_b64, bld.def(s2), bld.def(s1, scc), - bld.sop1(aco_opcode::s_wqm_b64, bld.def(s2), bld.def(s1, scc), tmp)); + Temp tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src); + return bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), + bld.sop1(Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc), tmp)); } else if (op == nir_op_ior && cluster_size == 4) { //subgroupClusteredOr(val, 4) -> wqm(val & exec) - return bld.sop1(aco_opcode::s_wqm_b64, bld.def(s2), bld.def(s1, scc), - bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2))); + return bld.sop1(Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc), + bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm))); } else if (op == nir_op_iand && cluster_size == 64) { //subgroupAnd(val) -> (exec & ~val) == 0 - Temp tmp = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), Operand(exec, s2), src).def(1).getTemp(); - return bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), Operand(0u), Operand(-1u), bld.scc(tmp)); + Temp tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src).def(1).getTemp(); + return bld.sop2(Builder::s_cselect, bld.def(bld.lm), Operand(0u), Operand(-1u), bld.scc(tmp)); } else if (op == nir_op_ior && cluster_size == 64) { //subgroupOr(val) -> (val & exec) != 0 - Temp tmp = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2)).def(1).getTemp(); + Temp tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)).def(1).getTemp(); return bool_to_vector_condition(ctx, tmp); } else if (op == nir_op_ixor && cluster_size == 64) { //subgroupXor(val) -> s_bcnt1_i32_b64(val & exec) & 1 - Temp tmp = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2)); - tmp = bld.sop1(aco_opcode::s_bcnt1_i32_b64, bld.def(s1), bld.def(s1, scc), tmp); + Temp tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)); + tmp = bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), tmp); tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), tmp, Operand(1u)).def(1).getTemp(); return bool_to_vector_condition(ctx, tmp); } else { @@ -5256,25 +5262,28 @@ Temp emit_boolean_reduce(isel_context *ctx, nir_op op, unsigned cluster_size, Te Temp tmp; if (op == nir_op_iand) - tmp = bld.sop2(aco_opcode::s_orn2_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2)); + tmp = bld.sop2(Builder::s_orn2, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)); else - tmp = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2)); + tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)); uint32_t cluster_mask = cluster_size == 32 ? 
-1 : (1u << cluster_size) - 1u; - tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), cluster_offset, tmp); + if (ctx->program->wave_size == 64) + tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), cluster_offset, tmp); + else + tmp = bld.vop2_e64(aco_opcode::v_lshrrev_b32, bld.def(v1), cluster_offset, tmp); tmp = emit_extract_vector(ctx, tmp, 0, v1); if (cluster_mask != 0xffffffff) tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(cluster_mask), tmp); Definition cmp_def = Definition(); if (op == nir_op_iand) { - cmp_def = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(s2), Operand(cluster_mask), tmp).def(0); + cmp_def = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm), Operand(cluster_mask), tmp).def(0); } else if (op == nir_op_ior) { - cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), tmp).def(0); + cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), tmp).def(0); } else if (op == nir_op_ixor) { tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), tmp, Operand(0u))); - cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), tmp).def(0); + cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), tmp).def(0); } cmp_def.setHint(vcc); return cmp_def.getTemp(); @@ -5290,9 +5299,9 @@ Temp emit_boolean_exclusive_scan(isel_context *ctx, nir_op op, Temp src) //subgroupExclusiveXor(val) -> mbcnt(val & exec) & 1 != 0 Temp tmp; if (op == nir_op_iand) - tmp = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), Operand(exec, s2), src); + tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src); else - tmp = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2)); + tmp = bld.sop2(Builder::s_and, bld.def(s2), bld.def(s1, scc), src, Operand(exec, bld.lm)); Builder::Result lohi = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), tmp); Temp lo = lohi.def(0).getTemp(); @@ -5301,11 +5310,11 @@ Temp emit_boolean_exclusive_scan(isel_context *ctx, nir_op op, Temp src) Definition cmp_def = Definition(); if (op == nir_op_iand) - cmp_def = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(s2), Operand(0u), mbcnt).def(0); + cmp_def = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm), Operand(0u), mbcnt).def(0); else if (op == nir_op_ior) - cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), mbcnt).def(0); + cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), mbcnt).def(0); else if (op == nir_op_ixor) - cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), + cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), mbcnt)).def(0); cmp_def.setHint(vcc); return cmp_def.getTemp(); @@ -5320,11 +5329,11 @@ Temp emit_boolean_inclusive_scan(isel_context *ctx, nir_op op, Temp src) //subgroupInclusiveXor(val) -> subgroupExclusiveXor(val) ^^ val Temp tmp = emit_boolean_exclusive_scan(ctx, op, src); if (op == nir_op_iand) - return bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), tmp, src); + return bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), tmp, src); else if (op == nir_op_ior) - return bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), tmp, src); + return bld.sop2(Builder::s_or, bld.def(bld.lm), bld.def(s1, scc), tmp, src); else if (op == nir_op_ixor) - return bld.sop2(aco_opcode::s_xor_b64, bld.def(s2), 
bld.def(s1, scc), tmp, src); + return bld.sop2(Builder::s_xor, bld.def(bld.lm), bld.def(s1, scc), tmp, src); assert(false); return Temp(); @@ -5453,7 +5462,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) Temp pck0 = bld.tmp(v1); Temp carry = bld.vadd32(Definition(pck0), tmp0, addr, true).def(1).getTemp(); tmp1 = as_vgpr(ctx, tmp1); - Temp pck1 = bld.vop2_e64(aco_opcode::v_addc_co_u32, bld.def(v1), bld.hint_vcc(bld.def(s2)), tmp1, Operand(0u), carry); + Temp pck1 = bld.vop2_e64(aco_opcode::v_addc_co_u32, bld.def(v1), bld.hint_vcc(bld.def(bld.lm)), tmp1, Operand(0u), carry); addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), pck0, pck1); /* sample_pos = flat_load_dwordx2 addr */ @@ -5685,11 +5694,12 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) break; } case nir_intrinsic_ballot: { - Definition tmp = bld.def(s2); Temp src = get_ssa_temp(ctx, instr->src[0].ssa); + Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + Definition tmp = bld.def(dst.regClass()); if (instr->src[0].ssa->bit_size == 1) { - assert(src.regClass() == s2); - bld.sop2(aco_opcode::s_and_b64, tmp, bld.def(s1, scc), Operand(exec, s2), src); + assert(src.regClass() == bld.lm); + bld.sop2(Builder::s_and, tmp, bld.def(s1, scc), Operand(exec, bld.lm), src); } else if (instr->src[0].ssa->bit_size == 32 && src.regClass() == v1) { bld.vopc(aco_opcode::v_cmp_lg_u32, tmp, Operand(0u), src); } else if (instr->src[0].ssa->bit_size == 64 && src.regClass() == v2) { @@ -5699,7 +5709,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) nir_print_instr(&instr->instr, stderr); fprintf(stderr, "\n"); } - emit_wqm(ctx, tmp.getTemp(), get_ssa_temp(ctx, &instr->dest.ssa)); + emit_wqm(ctx, tmp.getTemp(), dst); break; } case nir_intrinsic_shuffle: @@ -5722,15 +5732,19 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi); emit_split_vector(ctx, dst, 2); } else if (instr->dest.ssa.bit_size == 1 && tid.regClass() == s1) { - assert(src.regClass() == s2); - Temp tmp = bld.sopc(aco_opcode::s_bitcmp1_b64, bld.def(s1, scc), src, tid); + assert(src.regClass() == bld.lm); + Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src, tid); bool_to_vector_condition(ctx, emit_wqm(ctx, tmp), dst); } else if (instr->dest.ssa.bit_size == 1 && tid.regClass() == v1) { - assert(src.regClass() == s2); - Temp tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), tid, src); + assert(src.regClass() == bld.lm); + Temp tmp; + if (ctx->program->wave_size == 64) + tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), tid, src); + else + tmp = bld.vop2_e64(aco_opcode::v_lshrrev_b32, bld.def(v1), tid, src); tmp = emit_extract_vector(ctx, tmp, 0, v1); tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), tmp); - emit_wqm(ctx, bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), tmp), dst); + emit_wqm(ctx, bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), tmp), dst); } else { fprintf(stderr, "Unimplemented NIR instr bit size: "); nir_print_instr(&instr->instr, stderr); @@ -5763,9 +5777,9 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi); emit_split_vector(ctx, dst, 2); } else if (instr->dest.ssa.bit_size == 1) { - assert(src.regClass() == s2); - Temp tmp = bld.sopc(aco_opcode::s_bitcmp1_b64, bld.def(s1, scc), src, - bld.sop1(aco_opcode::s_ff1_i32_b64, bld.def(s1), Operand(exec, s2))); + 
assert(src.regClass() == bld.lm); + Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src, + bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm))); bool_to_vector_condition(ctx, emit_wqm(ctx, tmp), dst); } else if (src.regClass() == s1) { bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src); @@ -5781,22 +5795,22 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) case nir_intrinsic_vote_all: { Temp src = get_ssa_temp(ctx, instr->src[0].ssa); Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); - assert(src.regClass() == s2); - assert(dst.regClass() == s2); + assert(src.regClass() == bld.lm); + assert(dst.regClass() == bld.lm); - Temp tmp = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), Operand(exec, s2), src).def(1).getTemp(); - Temp val = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), Operand(0u), Operand(-1u), bld.scc(tmp)); + Temp tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src).def(1).getTemp(); + Temp val = bld.sop2(Builder::s_cselect, bld.def(bld.lm), Operand(0u), Operand(-1u), bld.scc(tmp)); emit_wqm(ctx, val, dst); break; } case nir_intrinsic_vote_any: { Temp src = get_ssa_temp(ctx, instr->src[0].ssa); Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); - assert(src.regClass() == s2); - assert(dst.regClass() == s2); + assert(src.regClass() == bld.lm); + assert(dst.regClass() == bld.lm); - Temp tmp = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), Operand(exec, s2), src).def(1).getTemp(); - Temp val = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), Operand(-1u), Operand(0u), bld.scc(tmp)); + Temp tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src).def(1).getTemp(); + Temp val = bld.sop2(Builder::s_cselect, bld.def(bld.lm), Operand(-1u), Operand(0u), bld.scc(tmp)); emit_wqm(ctx, val, dst); break; } @@ -5879,7 +5893,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) Temp tmp_dst = bld.tmp(dst.regClass()); reduce->definitions[0] = Definition(tmp_dst); - reduce->definitions[1] = bld.def(s2); // used internally + reduce->definitions[1] = bld.def(ctx->program->lane_mask); // used internally reduce->definitions[2] = Definition(); reduce->definitions[3] = Definition(scc, s1); reduce->definitions[4] = Definition(); @@ -5899,13 +5913,14 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); unsigned lane = nir_src_as_const_value(instr->src[1])->u32; if (instr->dest.ssa.bit_size == 1) { - assert(src.regClass() == s2); + assert(src.regClass() == bld.lm); + assert(dst.regClass() == bld.lm); uint32_t half_mask = 0x11111111u << lane; Temp mask_tmp = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(half_mask), Operand(half_mask)); - Temp tmp = bld.tmp(s2); - bld.sop1(aco_opcode::s_wqm_b64, Definition(tmp), - bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), mask_tmp, - bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2)))); + Temp tmp = bld.tmp(bld.lm); + bld.sop1(Builder::s_wqm, Definition(tmp), + bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), mask_tmp, + bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)))); emit_wqm(ctx, tmp, dst); } else if (instr->dest.ssa.bit_size == 32) { emit_wqm(ctx, @@ -5957,10 +5972,10 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); if 
(instr->dest.ssa.bit_size == 1) { - assert(src.regClass() == s2); + assert(src.regClass() == bld.lm); src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand((uint32_t)-1), src); src = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl); - Temp tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), src); + Temp tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), src); emit_wqm(ctx, tmp, dst); } else if (instr->dest.ssa.bit_size == 32) { Temp tmp = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl); @@ -6060,15 +6075,15 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) break; case nir_intrinsic_demote_if: { Temp src = get_ssa_temp(ctx, instr->src[0].ssa); - assert(src.regClass() == s2); - Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2)); + assert(src.regClass() == bld.lm); + Temp cond = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)); bld.pseudo(aco_opcode::p_demote_to_helper, cond); ctx->block->kind |= block_kind_uses_demote; ctx->program->needs_exact = true; break; } case nir_intrinsic_first_invocation: { - emit_wqm(ctx, bld.sop1(aco_opcode::s_ff1_i32_b64, bld.def(s1), Operand(exec, s2)), + emit_wqm(ctx, bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)), get_ssa_temp(ctx, &instr->dest.ssa)); break; } @@ -6180,14 +6195,14 @@ void build_cube_select(isel_context *ctx, Temp ma, Temp id, Temp deriv, Operand two(0x40000000u); Operand four(0x40800000u); - Temp is_ma_positive = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(s2)), Operand(0u), ma); + Temp is_ma_positive = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), ma); Temp sgn_ma = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, one, is_ma_positive); Temp neg_sgn_ma = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand(0u), sgn_ma); - Temp is_ma_z = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(s2)), four, id); + Temp is_ma_z = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), four, id); Temp is_ma_y = bld.vopc(aco_opcode::v_cmp_le_f32, bld.def(s2), two, id); - is_ma_y = bld.sop2(aco_opcode::s_andn2_b64, bld.hint_vcc(bld.def(s2)), is_ma_y, is_ma_z); - Temp is_not_ma_x = bld.sop2(aco_opcode::s_or_b64, bld.hint_vcc(bld.def(s2)), bld.def(s1, scc), is_ma_z, is_ma_y); + is_ma_y = bld.sop2(Builder::s_andn2, bld.hint_vcc(bld.def(bld.lm)), is_ma_y, is_ma_z); + Temp is_not_ma_x = bld.sop2(aco_opcode::s_or_b64, bld.hint_vcc(bld.def(bld.lm)), bld.def(s1, scc), is_ma_z, is_ma_y); // select sc Temp tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_z, deriv_x, is_not_ma_x); @@ -6667,7 +6682,7 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_SINT), bld.scc(compare_cube_wa)); } - tg4_compare_cube_wa64 = bld.tmp(s2); + tg4_compare_cube_wa64 = bld.tmp(bld.lm); bool_to_vector_condition(ctx, compare_cube_wa, tg4_compare_cube_wa64); nfmt = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), nfmt, Operand(26u)); @@ -6800,7 +6815,7 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) assert(dmask == 1 && dst.regClass() == v1); assert(dst.id() != tmp_dst.id()); - Temp tmp = bld.tmp(s2); + Temp tmp = bld.tmp(bld.lm); bld.vopc(aco_opcode::v_cmp_eq_u32, Definition(tmp), Operand(0u), tmp_dst).def(0).setHint(vcc); bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand((uint32_t)-1), 
tmp); @@ -6921,7 +6936,7 @@ void visit_phi(isel_context *ctx, nir_phi_instr *instr) { aco_ptr phi; Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); - assert(instr->dest.ssa.bit_size != 1 || dst.regClass() == s2); + assert(instr->dest.ssa.bit_size != 1 || dst.regClass() == ctx->program->lane_mask); bool logical = !dst.is_linear() || ctx->divergent_vals[instr->dest.ssa.index]; logical |= ctx->block->kind & block_kind_merge; @@ -7295,7 +7310,7 @@ static void begin_divergent_if_then(isel_context *ctx, if_context *ic, Temp cond ctx->block->kind |= block_kind_branch; /* branch to linear then block */ - assert(cond.regClass() == s2); + assert(cond.regClass() == ctx->program->lane_mask); aco_ptr branch; branch.reset(create_instruction(aco_opcode::p_cbranch_z, Format::PSEUDO_BRANCH, 1, 0)); branch->operands[0] = Operand(cond); @@ -7439,7 +7454,7 @@ static void visit_if(isel_context *ctx, nir_if *if_stmt) ctx->block->kind |= block_kind_uniform; /* emit branch */ - assert(cond.regClass() == s2); + assert(cond.regClass() == bld.lm); // TODO: in a post-RA optimizer, we could check if the condition is in VCC and omit this instruction cond = bool_to_scalar_condition(ctx, cond); @@ -7825,7 +7840,7 @@ void handle_bc_optimize(isel_context *ctx) ctx->persp_centroid = get_arg(ctx, ctx->args->ac.persp_centroid); ctx->linear_centroid = get_arg(ctx, ctx->args->ac.linear_centroid); if (uses_center && uses_centroid) { - Temp sel = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(s2)), + Temp sel = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(bld.lm)), get_arg(ctx, ctx->args->ac.prim_mask), Operand(0u)); if (G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena)) { @@ -7934,7 +7949,7 @@ void select_program(Program *program, Builder bld(ctx.program, ctx.block); Temp count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), ctx.merged_wave_info, Operand((8u << 16) | (i * 8u))); Temp thread_id = emit_mbcnt(&ctx, bld.def(v1)); - Temp cond = bld.vopc(aco_opcode::v_cmp_gt_u32, bld.hint_vcc(bld.def(s2)), count, thread_id); + Temp cond = bld.vopc(aco_opcode::v_cmp_gt_u32, bld.hint_vcc(bld.def(bld.lm)), count, thread_id); begin_divergent_if_then(&ctx, &ic, cond); } diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp index ab96a4507cf..a7446c6c058 100644 --- a/src/amd/compiler/aco_instruction_selection_setup.cpp +++ b/src/amd/compiler/aco_instruction_selection_setup.cpp @@ -126,6 +126,7 @@ unsigned get_interp_input(nir_intrinsic_op intrin, enum glsl_interp_mode interp) void init_context(isel_context *ctx, nir_shader *shader) { nir_function_impl *impl = nir_shader_get_entrypoint(shader); + unsigned lane_mask_size = ctx->program->lane_mask.size(); ctx->shader = shader; ctx->divergent_vals = nir_divergence_analysis(shader, nir_divergence_view_index_uniform); @@ -207,7 +208,7 @@ void init_context(isel_context *ctx, nir_shader *shader) case nir_op_ieq: case nir_op_ine: case nir_op_i2b1: - size = 2; + size = lane_mask_size; break; case nir_op_f2i64: case nir_op_f2u64: @@ -219,7 +220,7 @@ void init_context(isel_context *ctx, nir_shader *shader) break; case nir_op_bcsel: if (alu_instr->dest.dest.ssa.bit_size == 1) { - size = 2; + size = lane_mask_size; } else { if (ctx->divergent_vals[alu_instr->dest.dest.ssa.index]) { type = RegType::vgpr; @@ -237,14 +238,14 @@ void init_context(isel_context *ctx, nir_shader *shader) break; case nir_op_mov: if (alu_instr->dest.dest.ssa.bit_size == 1) { - size = 2; + size = lane_mask_size; } else { type 
= ctx->divergent_vals[alu_instr->dest.dest.ssa.index] ? RegType::vgpr : RegType::sgpr; } break; default: if (alu_instr->dest.dest.ssa.bit_size == 1) { - size = 2; + size = lane_mask_size; } else { for (unsigned i = 0; i < nir_op_infos[alu_instr->op].num_inputs; i++) { if (allocated[alu_instr->src[i].src.ssa->index].type() == RegType::vgpr) @@ -261,7 +262,7 @@ void init_context(isel_context *ctx, nir_shader *shader) if (nir_instr_as_load_const(instr)->def.bit_size == 64) size *= 2; else if (nir_instr_as_load_const(instr)->def.bit_size == 1) - size *= 2; + size *= lane_mask_size; allocated[nir_instr_as_load_const(instr)->def.index] = Temp(0, RegClass(RegType::sgpr, size)); break; } @@ -289,11 +290,11 @@ void init_context(isel_context *ctx, nir_shader *shader) case nir_intrinsic_first_invocation: type = RegType::sgpr; if (intrinsic->dest.ssa.bit_size == 1) - size = 2; + size = lane_mask_size; break; case nir_intrinsic_ballot: type = RegType::sgpr; - size = 2; + size = lane_mask_size; break; case nir_intrinsic_load_sample_id: case nir_intrinsic_load_sample_mask_in: @@ -369,7 +370,7 @@ void init_context(isel_context *ctx, nir_shader *shader) case nir_intrinsic_inclusive_scan: case nir_intrinsic_exclusive_scan: if (intrinsic->dest.ssa.bit_size == 1) { - size = 2; + size = lane_mask_size; type = RegType::sgpr; } else if (!ctx->divergent_vals[intrinsic->dest.ssa.index]) { type = RegType::sgpr; @@ -384,11 +385,11 @@ void init_context(isel_context *ctx, nir_shader *shader) case nir_intrinsic_load_helper_invocation: case nir_intrinsic_is_helper_invocation: type = RegType::sgpr; - size = 2; + size = lane_mask_size; break; case nir_intrinsic_reduce: if (intrinsic->dest.ssa.bit_size == 1) { - size = 2; + size = lane_mask_size; type = RegType::sgpr; } else if (nir_intrinsic_cluster_size(intrinsic) == 0 || !ctx->divergent_vals[intrinsic->dest.ssa.index]) { @@ -489,7 +490,7 @@ void init_context(isel_context *ctx, nir_shader *shader) if (phi->dest.ssa.bit_size == 1) { assert(size == 1 && "multiple components not yet supported on boolean phis."); type = RegType::sgpr; - size *= 2; + size *= lane_mask_size; allocated[phi->dest.ssa.index] = Temp(0, RegClass(type, size)); break; } @@ -590,7 +591,7 @@ Pseudo_instruction *add_startpgm(struct isel_context *ctx) startpgm->definitions[arg].setFixed(PhysReg{file == AC_ARG_SGPR ? reg : reg + 256}); arg++; } - startpgm->definitions[arg_count] = Definition{ctx->program->allocateId(), exec, s2}; + startpgm->definitions[arg_count] = Definition{ctx->program->allocateId(), exec, ctx->program->lane_mask}; Pseudo_instruction *instr = startpgm.get(); ctx->block->instructions.push_back(std::move(startpgm)); @@ -796,6 +797,7 @@ setup_isel_context(Program* program, program->chip_class = args->options->chip_class; program->family = args->options->family; program->wave_size = args->shader_info->wave_size; + program->lane_mask = program->wave_size == 32 ? s1 : s2; program->lds_alloc_granule = args->options->chip_class >= GFX7 ? 512 : 256; program->lds_limit = args->options->chip_class >= GFX7 ? 
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index 4073086662a..1f4721f5ffd 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -1149,6 +1149,7 @@ public:
    enum chip_class chip_class;
    enum radeon_family family;
    unsigned wave_size;
+   RegClass lane_mask;
    Stage stage; /* Stage */
    bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
    bool needs_wqm = false; /* there exists a p_wqm instruction */
diff --git a/src/amd/compiler/aco_live_var_analysis.cpp b/src/amd/compiler/aco_live_var_analysis.cpp
index 05ddb7bc68a..4255d56173b 100644
--- a/src/amd/compiler/aco_live_var_analysis.cpp
+++ b/src/amd/compiler/aco_live_var_analysis.cpp
@@ -54,7 +54,7 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block,
    bool exec_live = false;
    if (block->live_out_exec != Temp()) {
       live_sgprs.insert(block->live_out_exec);
-      new_demand.sgpr += 2;
+      new_demand.sgpr += program->lane_mask.size();
       exec_live = true;
    }
 
@@ -77,10 +77,10 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block,
       if (is_phi(insn))
          break;
 
-      /* substract the 2 sgprs from exec */
+      /* substract the 1 or 2 sgprs from exec */
       if (exec_live)
-         assert(new_demand.sgpr >= 2);
-      register_demand[idx] = RegisterDemand(new_demand.vgpr, new_demand.sgpr - (exec_live ? 2 : 0));
+         assert(new_demand.sgpr >= (int16_t) program->lane_mask.size());
+      register_demand[idx] = RegisterDemand(new_demand.vgpr, new_demand.sgpr - (exec_live ? program->lane_mask.size() : 0));
 
       /* KILL */
       for (Definition& definition : insn->definitions) {
@@ -144,8 +144,8 @@ void process_live_temps_per_block(Program *program, live& lives, Block* block,
 
    /* update block's register demand for a last time */
    if (exec_live)
-      assert(new_demand.sgpr >= 2);
-   new_demand.sgpr -= exec_live ? 2 : 0;
+      assert(new_demand.sgpr >= (int16_t) program->lane_mask.size());
+   new_demand.sgpr -= exec_live ? program->lane_mask.size() : 0;
    block->register_demand.update(new_demand);
 
    /* handle phi definitions */
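The live-variable hunks above keep the SGPR bookkeeping for a live exec mask in step with the wave size: exec contributes lane_mask.size() SGPRs (1 on wave32, 2 on wave64) instead of always 2. A small sketch of that accounting, using a simplified RegisterDemand struct and a hypothetical demand_without_exec helper rather than the real ACO code:

   /* Illustrative sketch: SGPR demand contributed by a live exec mask.
    * lane_mask_size is 1 for wave32 and 2 for wave64; the struct below is a
    * stand-in for ACO's RegisterDemand, not the real type. */
   #include <cassert>
   #include <cstdint>

   struct RegisterDemand {
      int16_t vgpr = 0;
      int16_t sgpr = 0;
   };

   RegisterDemand demand_without_exec(RegisterDemand demand, bool exec_live, unsigned lane_mask_size)
   {
      if (exec_live) {
         /* a live exec mask occupies the whole lane mask worth of SGPRs */
         assert(demand.sgpr >= (int16_t)lane_mask_size);
         demand.sgpr -= (int16_t)lane_mask_size;
      }
      return demand;
   }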
diff --git a/src/amd/compiler/aco_lower_bool_phis.cpp b/src/amd/compiler/aco_lower_bool_phis.cpp
index dc64f0133b5..988f753c82d 100644
--- a/src/amd/compiler/aco_lower_bool_phis.cpp
+++ b/src/amd/compiler/aco_lower_bool_phis.cpp
@@ -54,12 +54,12 @@ Operand get_ssa(Program *program, unsigned block_idx, ssa_state *state)
    while (true) {
       auto pos = state->latest.find(block_idx);
       if (pos != state->latest.end())
-         return Operand({pos->second, s2});
+         return Operand({pos->second, program->lane_mask});
 
       Block& block = program->blocks[block_idx];
       size_t pred = block.linear_preds.size();
       if (pred == 0) {
-         return Operand(s2);
+         return Operand(program->lane_mask);
       } else if (pred == 1) {
          block_idx = block.linear_preds[0];
          continue;
@@ -75,10 +75,10 @@ Operand get_ssa(Program *program, unsigned block_idx, ssa_state *state)
               state->phis[phi->operands[i].tempId()][(phi_use){&block, res}] |= (uint64_t)1 << i;
            }
         }
-         phi->definitions[0] = Definition(Temp{res, s2});
+         phi->definitions[0] = Definition(Temp{res, program->lane_mask});
         block.instructions.emplace(block.instructions.begin(), std::move(phi));
 
-         return Operand({res, s2});
+         return Operand({res, program->lane_mask});
      }
   }
 }
@@ -118,7 +118,7 @@ Temp write_ssa(Program *program, Block *block, ssa_state *state, unsigned previo
      update_phi(program, state, phi.first.block, phi.first.phi_def, phi.second);
   }
 
-   return {id, s2};
+   return {id, program->lane_mask};
 }
 
 void insert_before_logical_end(Block *block, aco_ptr instr)
@@ -150,23 +150,25 @@ void lower_divergent_bool_phi(Program *program, Block *block, aco_ptr& phi)
      assert(phi->operands[i].isTemp());
      Temp phi_src = phi->operands[i].getTemp();
-      assert(phi_src.regClass() == s2);
+      assert(phi_src.regClass() == bld.lm);
 
      Operand cur = get_ssa(program, pred->index, &state);
+      assert(cur.regClass() == bld.lm);
      Temp new_cur = write_ssa(program, pred, &state, cur.isTemp() ? cur.tempId() : 0);
+      assert(new_cur.regClass() == bld.lm);
 
      if (cur.isUndefined()) {
         insert_before_logical_end(pred, bld.sop1(aco_opcode::s_mov_b64, Definition(new_cur), phi_src).get_ptr());
      } else {
-         Temp tmp1 = bld.tmp(s2), tmp2 = bld.tmp(s2);
+         Temp tmp1 = bld.tmp(bld.lm), tmp2 = bld.tmp(bld.lm);
         insert_before_logical_end(pred,
-                                   bld.sop2(aco_opcode::s_andn2_b64, Definition(tmp1), bld.def(s1, scc),
-                                            cur, Operand(exec, s2)).get_ptr());
+                                   bld.sop2(Builder::s_andn2, Definition(tmp1), bld.def(s1, scc),
+                                            cur, Operand(exec, bld.lm)).get_ptr());
         insert_before_logical_end(pred,
-                                   bld.sop2(aco_opcode::s_and_b64, Definition(tmp2), bld.def(s1, scc),
-                                            phi_src, Operand(exec, s2)).get_ptr());
+                                   bld.sop2(Builder::s_and, Definition(tmp2), bld.def(s1, scc),
+                                            phi_src, Operand(exec, bld.lm)).get_ptr());
         insert_before_logical_end(pred,
-                                   bld.sop2(aco_opcode::s_or_b64, Definition(new_cur), bld.def(s1, scc),
+                                   bld.sop2(Builder::s_or, Definition(new_cur), bld.def(s1, scc),
                                             tmp1, tmp2).get_ptr());
      }
   }
@@ -192,8 +194,8 @@ void lower_bool_phis(Program* program)
   for (Block& block : program->blocks) {
      for (aco_ptr& phi : block.instructions) {
         if (phi->opcode == aco_opcode::p_phi) {
-            assert(phi->definitions[0].regClass() != s1);
-            if (phi->definitions[0].regClass() == s2)
+            assert(program->wave_size == 64 ? phi->definitions[0].regClass() != s1 : phi->definitions[0].regClass() != s2);
+            if (phi->definitions[0].regClass() == program->lane_mask)
               lower_divergent_bool_phi(program, &block, phi);
         } else if (!is_phi(phi)) {
            break;
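The boolean-phi lowering above folds each predecessor's incoming value into the running SSA value with the usual lane-mask merge new = (prev & ~exec) | (incoming & exec); the change is only that the temporaries now use the lane-mask class and the wave-size-agnostic Builder aliases instead of hardcoded _b64 opcodes. The merge itself, expressed on plain integer masks (merge_bool_phi is an illustrative helper, not ACO code):

   /* Illustrative sketch: the per-predecessor merge performed by
    * lower_divergent_bool_phi, on plain lane-mask values.
    * One bit per lane: 32 significant bits on wave32, 64 on wave64. */
   #include <cstdint>

   uint64_t merge_bool_phi(uint64_t prev, uint64_t incoming, uint64_t exec)
   {
      /* inactive lanes keep their previous value, active lanes take the
       * value coming in from this predecessor */
      return (prev & ~exec) | (incoming & exec);
   }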
diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp
index cbb3b55179c..e9c2d66d823 100644
--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp
+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp
@@ -76,8 +76,10 @@ aco_opcode get_reduce_opcode(chip_class chip, ReduceOp op) {
 void emit_vadd32(Builder& bld, Definition def, Operand src0, Operand src1)
 {
    Instruction *instr = bld.vadd32(def, src0, src1, false, Operand(s2), true);
-   if (instr->definitions.size() >= 2)
+   if (instr->definitions.size() >= 2) {
+      assert(instr->definitions[1].regClass() == bld.lm);
       instr->definitions[1].setFixed(vcc);
+   }
 }
 
 void emit_int64_dpp_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg,
@@ -99,12 +101,12 @@ void emit_int64_dpp_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, Ph
          bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
          bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0],
                       dpp_ctrl, row_mask, bank_mask, bound_ctrl);
-         bld.vop3(aco_opcode::v_add_co_u32_e64, dst[0], bld.def(s2, vcc), vtmp_op[0], src1[0]);
+         bld.vop3(aco_opcode::v_add_co_u32_e64, dst[0], bld.def(bld.lm, vcc), vtmp_op[0], src1[0]);
       } else {
-         bld.vop2_dpp(aco_opcode::v_add_co_u32, dst[0], bld.def(s2, vcc), src0[0], src1[0],
+         bld.vop2_dpp(aco_opcode::v_add_co_u32, dst[0], bld.def(bld.lm, vcc), src0[0], src1[0],
                       dpp_ctrl, row_mask, bank_mask, bound_ctrl);
       }
-      bld.vop2_dpp(aco_opcode::v_addc_co_u32, dst[1], bld.def(s2, vcc), src0[1], src1[1], Operand(vcc, s2),
+      bld.vop2_dpp(aco_opcode::v_addc_co_u32, dst[1], bld.def(bld.lm, vcc), src0[1], src1[1], Operand(vcc, bld.lm),
                    dpp_ctrl, row_mask, bank_mask, bound_ctrl);
    } else if (op == iand64) {
       bld.vop2_dpp(aco_opcode::v_and_b32, dst[0], src0[0], src1[0],
@@ -149,9 +151,9 @@ void emit_int64_dpp_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, Ph
       bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[1], src0[1],
                    dpp_ctrl, row_mask, bank_mask, bound_ctrl);
 
-      bld.vopc(cmp, bld.def(s2, vcc), vtmp_op64, src1_64);
-      bld.vop2(aco_opcode::v_cndmask_b32, dst[0], vtmp_op[0], src1[0], Operand(vcc, s2));
-      bld.vop2(aco_opcode::v_cndmask_b32, dst[1], vtmp_op[1], src1[1], Operand(vcc, s2));
+      bld.vopc(cmp, bld.def(bld.lm, vcc), vtmp_op64, src1_64);
+      bld.vop2(aco_opcode::v_cndmask_b32, dst[0], vtmp_op[0], src1[0], Operand(vcc, bld.lm));
+      bld.vop2(aco_opcode::v_cndmask_b32, dst[1], vtmp_op[1], src1[1], Operand(vcc, bld.lm));
    } else if (op == imul64) {
       /* t4 = dpp(x_hi)
        * t1 = umul_lo(t4, y_lo)
@@ -216,11 +218,11 @@ void emit_int64_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysRe
 
    if (op == iadd64) {
       if (ctx->program->chip_class >= GFX10) {
-         bld.vop3(aco_opcode::v_add_co_u32_e64, dst[0], bld.def(s2, vcc), src0[0], src1[0]);
+         bld.vop3(aco_opcode::v_add_co_u32_e64, dst[0], bld.def(bld.lm, vcc), src0[0], src1[0]);
       } else {
-         bld.vop2(aco_opcode::v_add_co_u32, dst[0], bld.def(s2, vcc), src0[0], src1[0]);
+         bld.vop2(aco_opcode::v_add_co_u32, dst[0], bld.def(bld.lm, vcc), src0[0], src1[0]);
       }
-      bld.vop2(aco_opcode::v_addc_co_u32, dst[1], bld.def(s2, vcc), src0[1], src1[1], Operand(vcc, s2));
+      bld.vop2(aco_opcode::v_addc_co_u32, dst[1], bld.def(bld.lm, vcc), src0[1], src1[1], Operand(vcc, bld.lm));
    } else if (op == iand64) {
       bld.vop2(aco_opcode::v_and_b32, dst[0], src0[0], src1[0]);
       bld.vop2(aco_opcode::v_and_b32, dst[1], src0[1], src1[1]);
@@ -249,9 +251,9 @@ void emit_int64_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysRe
          break;
       }
 
-      bld.vopc(cmp, bld.def(s2, vcc), src0_64, src1_64);
-      bld.vop2(aco_opcode::v_cndmask_b32, dst[0], src0[0], src1[0], Operand(vcc, s2));
-      bld.vop2(aco_opcode::v_cndmask_b32, dst[1], src0[1], src1[1], Operand(vcc, s2));
+      bld.vopc(cmp, bld.def(bld.lm, vcc), src0_64, src1_64);
+      bld.vop2(aco_opcode::v_cndmask_b32, dst[0], src0[0], src1[0], Operand(vcc, bld.lm));
+      bld.vop2(aco_opcode::v_cndmask_b32, dst[1], src0[1], src1[1], Operand(vcc, bld.lm));
    } else if (op == imul64) {
       if (src1_reg == dst_reg) {
          /* it's fine if src0==dst but not if src1==dst */
@@ -298,7 +300,7 @@ void emit_dpp_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg
 
    if (!vop3) {
       if (opcode == aco_opcode::v_add_co_u32)
-         bld.vop2_dpp(opcode, dst, bld.def(s2, vcc), src0, src1, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
+         bld.vop2_dpp(opcode, dst, bld.def(bld.lm, vcc), src0, src1, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
       else
          bld.vop2_dpp(opcode, dst, src0, src1, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
       return;
@@ -342,7 +344,7 @@ void emit_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1
    if (vop3) {
       bld.vop3(opcode, dst, src0, src1);
    } else if (opcode == aco_opcode::v_add_co_u32) {
-      bld.vop2(opcode, dst, bld.def(s2, vcc), src0, src1);
+      bld.vop2(opcode, dst, bld.def(bld.lm, vcc), src0, src1);
    } else {
       bld.vop2(opcode, dst, src0, src1);
    }
@@ -420,7 +422,7 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
    Operand vcndmask_identity[2] = {identity[0], identity[1]};
 
    /* First, copy the source to tmp and set inactive lanes to the identity */
-   bld.sop1(aco_opcode::s_or_saveexec_b64, Definition(stmp, s2), Definition(scc, s1), Definition(exec, s2), Operand(UINT64_MAX), Operand(exec, s2));
+   bld.sop1(Builder::s_or_saveexec, Definition(stmp, bld.lm), Definition(scc, s1), Definition(exec, bld.lm), Operand(UINT64_MAX), Operand(exec, bld.lm));
 
    for (unsigned i = 0; i < src.size(); i++) {
       /* p_exclusive_scan needs it to be a sgpr or inline constant for the v_writelane_b32
@@ -440,7 +442,7 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
    for (unsigned i = 0; i < src.size(); i++) {
       bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg{tmp + i}, v1), vcndmask_identity[i],
                    Operand(PhysReg{src.physReg() + i}, v1),
-                   Operand(stmp, s2));
+                   Operand(stmp, bld.lm));
    }
 
    bool exec_restored = false;
@@ -463,7 +465,7 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
       if (cluster_size == 32) {
          for (unsigned i = 0; i < src.size(); i++)
            bld.ds(aco_opcode::ds_swizzle_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{tmp+i}, s1), ds_pattern_bitmode(0x1f, 0, 0x10));
-         bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(stmp, s2));
+         bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(stmp, bld.lm));
         exec_restored = true;
         emit_op(ctx, dst.physReg(), vtmp, tmp, PhysReg{0}, reduce_op, src.size());
         dst_written = true;
@@ -500,7 +502,7 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
                            Operand(0xffffffffu), Operand(0xffffffffu)).instr;
         static_cast(perm)->opsel[0] = true; /* FI (Fetch Inactive) */
      }
-      bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(UINT64_MAX));
+      bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(UINT64_MAX));
 
      /* fill in the gap in row 2 */
      for (unsigned i = 0; i < src.size(); i++) {
@@ -559,7 +561,7 @@ void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsig
    }
 
    if (!exec_restored)
-      bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(stmp, s2));
+      bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(stmp, bld.lm));
 
    if (op == aco_opcode::p_reduce && cluster_size == 64) {
       for (unsigned k = 0; k < src.size(); k++) {
diff --git a/src/amd/compiler/aco_reduce_assign.cpp b/src/amd/compiler/aco_reduce_assign.cpp
index 28a779580a2..68a0dc15761 100644
--- a/src/amd/compiler/aco_reduce_assign.cpp
+++ b/src/amd/compiler/aco_reduce_assign.cpp
@@ -172,7 +172,7 @@ void setup_reduce_temp(Program* program)
            clobber_vcc = true;
 
         if (clobber_vcc)
-            instr->definitions[4] = Definition(vcc, s2);
+            instr->definitions[4] = Definition(vcc, bld.lm);
      }
   }
 }
diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp
index c4144cc42f0..504ad015746 100644
--- a/src/amd/compiler/aco_register_allocation.cpp
+++ b/src/amd/compiler/aco_register_allocation.cpp
@@ -1719,6 +1719,7 @@ void register_allocation(Program *program, std::vector> live_out_
            pc->operands[i] = parallelcopy[i].first;
            pc->definitions[i] = parallelcopy[i].second;
+            assert(pc->operands[i].size() == pc->definitions[i].size());
 
            /* it might happen that the operand is already renamed. we have to restore the original name. */
            std::map::iterator it = ctx.orig_names.find(pc->operands[i].tempId());
diff --git a/src/amd/compiler/aco_ssa_elimination.cpp b/src/amd/compiler/aco_ssa_elimination.cpp
index 3d76dcd8867..54e691ba476 100644
--- a/src/amd/compiler/aco_ssa_elimination.cpp
+++ b/src/amd/compiler/aco_ssa_elimination.cpp
@@ -58,6 +58,7 @@ void collect_phi_info(ssa_elimination_ctx& ctx)
         std::vector& preds = phi->opcode == aco_opcode::p_phi ? block.logical_preds : block.linear_preds;
         phi_info& info = phi->opcode == aco_opcode::p_phi ? ctx.logical_phi_info : ctx.linear_phi_info;
         const auto result = info.emplace(preds[i], std::vector>());
+         assert(phi->definitions[0].size() == phi->operands[i].size());
         result.first->second.emplace_back(phi->definitions[0], phi->operands[i]);
         ctx.empty_blocks[preds[i]] = false;
      }
diff --git a/src/amd/compiler/aco_validate.cpp b/src/amd/compiler/aco_validate.cpp
index 8d2bf8449db..8282d7e27e3 100644
--- a/src/amd/compiler/aco_validate.cpp
+++ b/src/amd/compiler/aco_validate.cpp
@@ -190,7 +190,7 @@ void validate(Program* program, FILE * output)
            }
         } else if (instr->opcode == aco_opcode::p_phi) {
            check(instr->operands.size() == block.logical_preds.size(), "Number of Operands does not match number of predecessors", instr.get());
-            check(instr->definitions[0].getTemp().type() == RegType::vgpr || instr->definitions[0].getTemp().regClass() == s2, "Logical Phi Definition must be vgpr or divergent boolean", instr.get());
+            check(instr->definitions[0].getTemp().type() == RegType::vgpr || instr->definitions[0].getTemp().regClass() == program->lane_mask, "Logical Phi Definition must be vgpr or divergent boolean", instr.get());
         } else if (instr->opcode == aco_opcode::p_linear_phi) {
            for (const Operand& op : instr->operands)
               check(!op.isTemp() || op.getTemp().is_linear(), "Wrong Operand type", instr.get());
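The Builder::s_mov, Builder::s_and, Builder::s_andn2, Builder::s_or and Builder::s_or_saveexec aliases used in these hunks are expected to resolve to the _b32 or _b64 SALU opcode matching the program's lane mask; their definitions live in the generated builder header (aco_builder_h.py) and are not shown in this excerpt. The selection could look roughly like the following sketch (WaveSpecificOpcode and pick_wave_opcode are illustrative names, not the actual generated API):

   /* The aco_opcode names below mirror real SALU opcodes, but the enum itself
    * and pick_wave_opcode() are only a stand-in for the generated Builder code. */
   enum class aco_opcode { s_and_b32, s_and_b64, s_mov_b32, s_mov_b64 };

   struct WaveSpecificOpcode {
      aco_opcode op32;
      aco_opcode op64;
   };

   constexpr WaveSpecificOpcode s_and{aco_opcode::s_and_b32, aco_opcode::s_and_b64};
   constexpr WaveSpecificOpcode s_mov{aco_opcode::s_mov_b32, aco_opcode::s_mov_b64};

   /* lane_mask_size == 1 selects the wave32 opcode, 2 selects the wave64 one */
   constexpr aco_opcode pick_wave_opcode(WaveSpecificOpcode w, unsigned lane_mask_size)
   {
      return lane_mask_size == 1 ? w.op32 : w.op64;
   }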