X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Famd%2Fcompiler%2Faco_insert_exec_mask.cpp;h=ebd44ade4cd00730df057080d118090c105c28cb;hb=2182bbf84f0f19846a47f0438ec702f4d862731e;hp=530faab216756e664a604454fcc737ca81df98fc;hpb=655a7033493692961428cf0861477ae89f458e9b;p=mesa.git

diff --git a/src/amd/compiler/aco_insert_exec_mask.cpp b/src/amd/compiler/aco_insert_exec_mask.cpp
index 530faab2167..ebd44ade4cd 100644
--- a/src/amd/compiler/aco_insert_exec_mask.cpp
+++ b/src/amd/compiler/aco_insert_exec_mask.cpp
@@ -24,6 +24,7 @@
 
 #include "aco_ir.h"
 #include "aco_builder.h"
+#include "util/u_math.h"
 
 namespace aco {
 
@@ -83,6 +84,7 @@ struct block_info {
    std::vector<WQMState> instr_needs;
    uint8_t block_needs;
    uint8_t ever_again_needs;
+   bool logical_end_wqm;
    /* more... */
 };
 
@@ -95,6 +97,8 @@ struct exec_ctx {
 };
 
 bool pred_by_exec_mask(aco_ptr<Instruction>& instr) {
+   if (instr->isSALU())
+      return instr->reads_exec();
    if (instr->format == Format::SMEM || instr->isSALU())
       return false;
    if (instr->format == Format::PSEUDO_BARRIER)
@@ -103,10 +107,13 @@ bool pred_by_exec_mask(aco_ptr<Instruction>& instr) {
    if (instr->format == Format::PSEUDO) {
       switch (instr->opcode) {
       case aco_opcode::p_create_vector:
-         return instr->definitions[0].getTemp().type() == RegType::vgpr;
       case aco_opcode::p_extract_vector:
       case aco_opcode::p_split_vector:
-         return instr->operands[0].getTemp().type() == RegType::vgpr;
+         for (Definition def : instr->definitions) {
+            if (def.getTemp().type() == RegType::vgpr)
+               return true;
+         }
+         return false;
       case aco_opcode::p_spill:
       case aco_opcode::p_reload:
          return false;
@@ -116,7 +123,9 @@ bool pred_by_exec_mask(aco_ptr<Instruction>& instr) {
    }
 
    if (instr->opcode == aco_opcode::v_readlane_b32 ||
-       instr->opcode == aco_opcode::v_writelane_b32)
+       instr->opcode == aco_opcode::v_readlane_b32_e64 ||
+       instr->opcode == aco_opcode::v_writelane_b32 ||
+       instr->opcode == aco_opcode::v_writelane_b32_e64)
       return false;
 
    return true;
@@ -132,6 +141,9 @@ bool needs_exact(aco_ptr<Instruction>& instr) {
    } else if (instr->format == Format::MIMG) {
       MIMG_instruction *mimg = static_cast<MIMG_instruction *>(instr.get());
       return mimg->disable_wqm;
+   } else if (instr->format == Format::FLAT || instr->format == Format::GLOBAL) {
+      FLAT_instruction *flat = static_cast<FLAT_instruction *>(instr.get());
+      return flat->disable_wqm;
    } else {
       return instr->format == Format::EXP || instr->opcode == aco_opcode::p_fs_buffer_store_smem;
    }
@@ -153,12 +165,6 @@ void mark_block_wqm(wqm_ctx &ctx, unsigned block_idx)
 
    ctx.branch_wqm[block_idx] = true;
    Block& block = ctx.program->blocks[block_idx];
-   aco_ptr<Instruction>& branch = block.instructions.back();
-
-   if (branch->opcode != aco_opcode::p_branch) {
-      assert(!branch->operands.empty() && branch->operands[0].isTemp());
-      set_needs_wqm(ctx, branch->operands[0].getTemp());
-   }
 
    /* TODO: this sets more branch conditions to WQM than it needs to
     * it should be enough to stop at the "exec mask top level" */
@@ -177,11 +183,14 @@ void get_block_needs(wqm_ctx &ctx, exec_ctx &exec_ctx, Block* block)
 
    if (block->kind & block_kind_top_level) {
       if (ctx.loop && ctx.wqm) {
-         /* mark all break conditions as WQM */
          unsigned block_idx = block->index + 1;
         while (!(ctx.program->blocks[block_idx].kind & block_kind_top_level)) {
+            /* flag all break conditions as WQM:
+             * the conditions might be computed outside the nested CF */
            if (ctx.program->blocks[block_idx].kind & block_kind_break)
               mark_block_wqm(ctx, block_idx);
+            /* flag all blocks as WQM to ensure we enter all (nested) loops in WQM */
+            exec_ctx.info[block_idx].block_needs |= WQM;
            block_idx++;
         }
       } else if (ctx.loop && !ctx.wqm) {
@@ -200,8 +209,7 @@ void get_block_needs(wqm_ctx &ctx, exec_ctx &exec_ctx, Block* block)
       ctx.wqm = false;
    }
 
-   for (int i = block->instructions.size() - 1; i >= 0; --i)
-   {
+   for (int i = block->instructions.size() - 1; i >= 0; --i) {
       aco_ptr<Instruction>& instr = block->instructions[i];
 
       WQMState needs = needs_exact(instr) ? Exact : Unspecified;
@@ -219,6 +227,11 @@ void get_block_needs(wqm_ctx &ctx, exec_ctx &exec_ctx, Block* block)
          }
       }
 
+      if (instr->format == Format::PSEUDO_BRANCH && ctx.branch_wqm[block->index]) {
+         needs = WQM;
+         propagate_wqm = true;
+      }
+
       if (propagate_wqm) {
          for (const Operand& op : instr->operands) {
             if (op.isTemp()) {
@@ -231,8 +244,17 @@ void get_block_needs(wqm_ctx &ctx, exec_ctx &exec_ctx, Block* block)
 
       /* ensure the condition controlling the control flow for this phi is in WQM */
       if (needs == WQM && instr->opcode == aco_opcode::p_phi) {
-         for (unsigned pred_idx : block->logical_preds)
+         for (unsigned pred_idx : block->logical_preds) {
             mark_block_wqm(ctx, pred_idx);
+            exec_ctx.info[pred_idx].logical_end_wqm = true;
+            ctx.worklist.insert(pred_idx);
+         }
+      }
+
+      if ((instr->opcode == aco_opcode::p_logical_end && info.logical_end_wqm) ||
+          instr->opcode == aco_opcode::p_wqm) {
+         assert(needs != Exact);
+         needs = WQM;
       }
 
       instr_needs[i] = needs;
@@ -298,14 +320,22 @@ void transition_to_WQM(exec_ctx& ctx, Builder bld, unsigned idx)
       return;
    if (ctx.info[idx].exec.back().second & mask_type_global) {
       Temp exec_mask = ctx.info[idx].exec.back().first;
-      exec_mask = bld.sop1(aco_opcode::s_wqm_b64, bld.def(s2, exec), bld.def(s1, scc), exec_mask);
+      /* TODO: we might generate better code if we pass the uncopied "exec_mask"
+       * directly to the s_wqm (we still need to keep this parallelcopy for
+       * potential later uses of exec_mask though). We currently can't do this
+       * because of a RA bug. */
+      exec_mask = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm), bld.exec(exec_mask));
+      ctx.info[idx].exec.back().first = exec_mask;
+
+      exec_mask = bld.sop1(Builder::s_wqm, bld.def(bld.lm, exec), bld.def(s1, scc), exec_mask);
       ctx.info[idx].exec.emplace_back(exec_mask, mask_type_global | mask_type_wqm);
       return;
    }
    /* otherwise, the WQM mask should be one below the current mask */
    ctx.info[idx].exec.pop_back();
    assert(ctx.info[idx].exec.back().second & mask_type_wqm);
-   ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(s2, exec),
+   assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
+   ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec),
                                                 ctx.info[idx].exec.back().first);
 }
 
@@ -320,14 +350,15 @@ void transition_to_Exact(exec_ctx& ctx, Builder bld, unsigned idx)
        !(ctx.info[idx].exec.back().second & mask_type_loop)) {
       ctx.info[idx].exec.pop_back();
       assert(ctx.info[idx].exec.back().second & mask_type_exact);
-      ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(s2, exec),
+      assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
+      ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec),
                                                    ctx.info[idx].exec.back().first);
       return;
    }
    /* otherwise, we create an exact mask and push to the stack */
    Temp wqm = ctx.info[idx].exec.back().first;
-   Temp exact = bld.tmp(s2);
-   wqm = bld.sop1(aco_opcode::s_and_saveexec_b64, bld.def(s2), bld.def(s1, scc),
+   Temp exact = bld.tmp(bld.lm);
+   wqm = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc),
                   bld.exec(Definition(exact)), ctx.info[idx].exec[0].first, bld.exec(wqm));
    ctx.info[idx].exec.back().first = wqm;
    ctx.info[idx].exec.emplace_back(exact, mask_type_exact);
@@ -347,6 +378,12 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
       Temp exec_mask = startpgm->definitions.back().getTemp();
       bld.insert(std::move(startpgm));
 
+      /* exec seems to need to be manually initialized with combined shaders */
+      if (util_bitcount(ctx.program->stage & sw_mask) > 1 || (ctx.program->stage & hw_ngg_gs)) {
+         bld.sop1(Builder::s_mov, bld.exec(Definition(exec_mask)), bld.lm == s2 ? Operand(UINT64_MAX) : Operand(UINT32_MAX));
+         instructions[0]->definitions.pop_back();
+      }
+
       if (ctx.handle_wqm) {
          ctx.info[0].exec.emplace_back(exec_mask, mask_type_global | mask_type_exact | mask_type_initial);
          /* if this block only needs WQM, initialize already */
@@ -355,7 +392,7 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
       } else {
          uint8_t mask = mask_type_global;
         if (ctx.program->needs_wqm) {
-            exec_mask = bld.sop1(aco_opcode::s_wqm_b64, bld.def(s2, exec), bld.def(s1, scc), bld.exec(exec_mask));
+            exec_mask = bld.sop1(Builder::s_wqm, bld.def(bld.lm, exec), bld.def(s1, scc), bld.exec(exec_mask));
            mask |= mask_type_wqm;
         } else {
            mask |= mask_type_exact;
@@ -379,7 +416,7 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
       aco_ptr<Pseudo_instruction> phi;
       for (int i = 0; i < info.num_exec_masks - 1; i++) {
          phi.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1));
-         phi->definitions[0] = bld.def(s2);
+         phi->definitions[0] = bld.def(bld.lm);
          phi->operands[0] = Operand(ctx.info[preds[0]].exec[i].first);
          ctx.info[idx].exec[i].first = bld.insert(std::move(phi));
       }
@@ -389,7 +426,7 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
       if (info.has_divergent_break) {
         /* this phi might be trivial but ensures a parallelcopy on the loop header */
         aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
-         phi->definitions[0] = bld.def(s2);
+         phi->definitions[0] = bld.def(bld.lm);
         phi->operands[0] = Operand(ctx.info[preds[0]].exec[info.num_exec_masks - 1].first);
         ctx.info[idx].exec.back().first = bld.insert(std::move(phi));
       }
@@ -397,9 +434,9 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
       /* create ssa name for loop active mask */
       aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
       if (info.has_divergent_continue)
-         phi->definitions[0] = bld.def(s2);
+         phi->definitions[0] = bld.def(bld.lm);
       else
-         phi->definitions[0] = bld.def(s2, exec);
+         phi->definitions[0] = bld.def(bld.lm, exec);
       phi->operands[0] = Operand(ctx.info[preds[0]].exec.back().first);
       Temp loop_active = bld.insert(std::move(phi));
 
@@ -419,7 +456,8 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
             i++;
          }
          uint8_t mask_type = ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact);
-         ctx.info[idx].exec.emplace_back(bld.pseudo(aco_opcode::p_parallelcopy, bld.def(s2, exec),
+         assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
+         ctx.info[idx].exec.emplace_back(bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec),
                                                     ctx.info[idx].exec.back().first), mask_type);
       }
 
@@ -461,6 +499,7 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
       assert(!(block->kind & block_kind_top_level) || info.num_exec_masks <= 2);
 
       /* create the loop exit phis if not trivial */
+      bool need_parallelcopy = false;
       for (unsigned k = 0; k < info.num_exec_masks; k++) {
          Temp same = ctx.info[preds[0]].exec[k].first;
         uint8_t type = ctx.info[header_preds[0]].exec[k].second;
@@ -471,12 +510,31 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
               trivial = false;
         }
 
+         if (k == info.num_exec_masks - 1u) {
+            bool all_liveout_exec = true;
+            bool all_not_liveout_exec = true;
+            for (unsigned pred : preds) {
+               all_liveout_exec = all_liveout_exec && same == ctx.program->blocks[pred].live_out_exec;
+               all_not_liveout_exec = all_not_liveout_exec && same != ctx.program->blocks[pred].live_out_exec;
+            }
+            if (!all_liveout_exec && !all_not_liveout_exec)
+               trivial = false;
+            else if (all_not_liveout_exec)
+               need_parallelcopy = true;
+
+            need_parallelcopy |= !trivial;
+         }
+
         if (trivial) {
            ctx.info[idx].exec.emplace_back(same, type);
         } else {
            /* create phi for loop footer */
            aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
-            phi->definitions[0] = bld.def(s2);
+            phi->definitions[0] = bld.def(bld.lm);
+            if (k == info.num_exec_masks - 1u) {
+               phi->definitions[0].setFixed(exec);
+               need_parallelcopy = false;
+            }
            for (unsigned i = 0; i < phi->operands.size(); i++)
               phi->operands[i] = Operand(ctx.info[preds[i]].exec[k].first);
            ctx.info[idx].exec.emplace_back(bld.insert(std::move(phi)), type);
@@ -506,8 +564,16 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
            transition_to_Exact(ctx, bld, idx);
       }
 
-      ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(s2, exec),
-                                                   ctx.info[idx].exec.back().first);
+      assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
+      if (need_parallelcopy) {
+         /* only create this parallelcopy is needed, since the operand isn't
+          * fixed to exec which causes the spiller to miscalculate register demand */
+         /* TODO: Fix register_demand calculation for spilling on loop exits.
+          * The problem is only mitigated because the register demand could be
+          * higher if the exec phi doesn't get assigned to exec. */
+         ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec),
+                                                      ctx.info[idx].exec.back().first);
+      }
 
       ctx.loop.pop_back();
       return i;
@@ -532,7 +598,7 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
            continue;
         }
 
-         Temp phi = bld.pseudo(aco_opcode::p_linear_phi, in_exec ? bld.def(s2, exec) : bld.def(s2),
+         Temp phi = bld.pseudo(aco_opcode::p_linear_phi, in_exec ? bld.def(bld.lm, exec) : bld.def(bld.lm),
                                ctx.info[preds[0]].exec[i].first, ctx.info[preds[1]].exec[i].first);
 
         uint8_t mask_type = ctx.info[preds[0]].exec[i].second & ctx.info[preds[1]].exec[i].second;
@@ -574,7 +640,8 @@ unsigned add_coupling_code(exec_ctx& ctx, Block* block,
 
    if (block->kind & block_kind_merge) {
       Temp restore = ctx.info[idx].exec.back().first;
-      ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(s2, exec), restore);
+      assert(restore.size() == bld.lm.size());
+      ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec), restore);
    }
 
    return i;
@@ -585,7 +652,7 @@ void lower_fs_buffer_store_smem(Builder& bld, bool need_check, aco_ptr<Instruction>& instr, Temp cur_exec)
    Operand offset = instr->operands[1];
    if (need_check) {
       /* if exec is zero, then use UINT32_MAX as an offset and make this store a no-op */
-      Temp nonempty = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), cur_exec, Operand(0u));
+      Temp nonempty = bld.sopc(Builder::s_cmp_lg, bld.def(s1, scc), cur_exec, Operand(0u));
 
       if (offset.isLiteral())
          offset = bld.sop1(aco_opcode::s_mov_b32, bld.def(s1), offset);
@@ -661,7 +728,7 @@ void process_instructions(exec_ctx& ctx, Block* block,
         assert(num);
         Operand cond = instr->operands[0];
         for (int i = num - 1; i >= 0; i--) {
-            Instruction *andn2 = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc),
+            Instruction *andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc),
                                           ctx.info[block->index].exec[i].first, cond);
            if (i == num - 1) {
               andn2->operands[0].setFixed(exec);
@@ -685,8 +752,9 @@ void process_instructions(exec_ctx& ctx, Block* block,
       if (instr->opcode == aco_opcode::p_is_helper ||
           instr->opcode == aco_opcode::p_load_helper) {
         Definition dst = instr->definitions[0];
+         assert(dst.size() == bld.lm.size());
         if (state == Exact) {
-            instr.reset(create_instruction<SOP1_instruction>(aco_opcode::s_mov_b64, Format::SOP1, 1, 1));
+            instr.reset(create_instruction<SOP1_instruction>(bld.w64or32(Builder::s_mov), Format::SOP1, 1, 1));
            instr->operands[0] = Operand(0u);
            instr->definitions[0] = dst;
         } else {
@@ -706,7 +774,7 @@ void process_instructions(exec_ctx& ctx, Block* block,
            assert(instr->opcode == aco_opcode::p_is_helper ||
                   exact_mask.second & mask_type_initial);
            assert(exact_mask.second & mask_type_exact);
-            instr.reset(create_instruction<SOP2_instruction>(aco_opcode::s_andn2_b64, Format::SOP2, 2, 2));
+            instr.reset(create_instruction<SOP2_instruction>(bld.w64or32(Builder::s_andn2), Format::SOP2, 2, 2));
            instr->operands[0] = Operand(ctx.info[block->index].exec.back().first); /* current exec */
            instr->operands[1] = Operand(exact_mask.first);
            instr->definitions[0] = dst;
@@ -717,14 +785,18 @@ void process_instructions(exec_ctx& ctx, Block* block,
         assert((ctx.info[block->index].exec[0].second & (mask_type_exact | mask_type_global)) ==
                (mask_type_exact | mask_type_global));
         ctx.info[block->index].exec[0].second &= ~mask_type_initial;
-         int num = 0;
-         Temp cond;
-         if (instr->operands.empty()) {
+         int num;
+         Temp cond, exit_cond;
+         if (instr->operands[0].isConstant()) {
+            assert(instr->operands[0].constantValue() == -1u);
            /* transition to exact and set exec to zero */
            Temp old_exec = ctx.info[block->index].exec.back().first;
-            Temp new_exec = bld.tmp(s2);
-            cond = bld.sop1(aco_opcode::s_and_saveexec_b64, bld.def(s2), bld.def(s1, scc),
+            Temp new_exec = bld.tmp(bld.lm);
+            exit_cond = bld.tmp(s1);
+            cond = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.scc(Definition(exit_cond)),
                            bld.exec(Definition(new_exec)), Operand(0u), bld.exec(old_exec));
+
+            num = ctx.info[block->index].exec.size() - 2;
            if (ctx.info[block->index].exec.back().second & mask_type_exact) {
               ctx.info[block->index].exec.back().first = new_exec;
            } else {
@@ -736,27 +808,26 @@ void process_instructions(exec_ctx& ctx, Block* block,
            transition_to_Exact(ctx, bld, block->index);
            assert(instr->operands[0].isTemp());
            cond = instr->operands[0].getTemp();
-            num = 1;
+            num = ctx.info[block->index].exec.size() - 1;
         }
 
-         num += ctx.info[block->index].exec.size() - 1;
-         for (int i = num - 1; i >= 0; i--) {
+         for (int i = num; i >= 0; i--) {
            if (ctx.info[block->index].exec[i].second & mask_type_exact) {
-               Instruction *andn2 = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc),
+               Instruction *andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc),
                                              ctx.info[block->index].exec[i].first, cond);
-               if (i == num - 1) {
+               if (i == (int)ctx.info[block->index].exec.size() - 1) {
                  andn2->operands[0].setFixed(exec);
                  andn2->definitions[0].setFixed(exec);
               }
 
-               if (i == 0) {
-                  instr->opcode = aco_opcode::p_exit_early_if;
-                  instr->operands[0] = bld.scc(andn2->definitions[1].getTemp());
-               }
+               ctx.info[block->index].exec[i].first = andn2->definitions[0].getTemp();
+               exit_cond = andn2->definitions[1].getTemp();
            } else {
               assert(i != 0);
            }
         }
+         instr->opcode = aco_opcode::p_exit_early_if;
+         instr->operands[0] = bld.scc(exit_cond);
         state = Exact;
 
      } else if (instr->opcode == aco_opcode::p_fs_buffer_store_smem) {
@@ -858,6 +929,11 @@ void add_branch_code(exec_ctx& ctx, Block* block)
                  has_discard);
    }
 
+   /* For normal breaks, this is the exec mask. For discard+break, it's the
+    * old exec mask before it was zero'd.
+    */
+   Operand break_cond = bld.exec(ctx.info[idx].exec.back().first);
+
    if (block->kind & block_kind_discard) {
 
       assert(block->instructions.back()->format == Format::PSEUDO_BRANCH);
@@ -874,22 +950,23 @@ void add_branch_code(exec_ctx& ctx, Block* block)
      }
 
      Temp old_exec = ctx.info[idx].exec.back().first;
-      Temp new_exec = bld.tmp(s2);
-      Temp cond = bld.sop1(aco_opcode::s_and_saveexec_b64, bld.def(s2), bld.def(s1, scc),
+      Temp new_exec = bld.tmp(bld.lm);
+      Temp cond = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc),
                            bld.exec(Definition(new_exec)), Operand(0u), bld.exec(old_exec));
      ctx.info[idx].exec.back().first = new_exec;
 
      for (int i = num - 1; i >= 0; i--) {
-         Instruction *andn2 = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc),
+         Instruction *andn2 = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc),
                                        ctx.info[block->index].exec[i].first, cond);
+         if (i == (int)ctx.info[idx].exec.size() - 1)
+            andn2->definitions[0].setFixed(exec);
        if (i == 0)
           bld.pseudo(aco_opcode::p_exit_early_if, bld.scc(andn2->definitions[1].getTemp()));
        ctx.info[block->index].exec[i].first = andn2->definitions[0].getTemp();
      }
 
      assert(!ctx.handle_wqm || (ctx.info[block->index].exec[0].second & mask_type_wqm) == 0);
-      if ((block->kind & (block_kind_break | block_kind_uniform)) == block_kind_break)
-         ctx.info[idx].exec.back().first = cond;
+      break_cond = Operand(cond);
      bld.insert(std::move(branch));
      /* no return here as it can be followed by a divergent break */
    }
@@ -900,20 +977,15 @@ void add_branch_code(exec_ctx& ctx, Block* block)
      assert(block->instructions.back()->opcode == aco_opcode::p_branch);
      block->instructions.pop_back();
 
-      if (ctx.info[idx].exec.back().second & mask_type_loop) {
-         bld.branch(aco_opcode::p_cbranch_nz, bld.exec(ctx.info[idx].exec.back().first), block->linear_succs[1], block->linear_succs[0]);
-      } else {
-         Temp cond = Temp();
-         for (int exec_idx = ctx.info[idx].exec.size() - 1; exec_idx >= 0; exec_idx--) {
-            if (ctx.info[idx].exec[exec_idx].second & mask_type_loop) {
-               cond = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), ctx.info[idx].exec[exec_idx].first, Operand(0u));
-               break;
-            }
-         }
-         assert(cond != Temp());
-
-         bld.branch(aco_opcode::p_cbranch_nz, bld.scc(cond), block->linear_succs[1], block->linear_succs[0]);
+      bool need_parallelcopy = false;
+      while (!(ctx.info[idx].exec.back().second & mask_type_loop)) {
+         ctx.info[idx].exec.pop_back();
+         need_parallelcopy = true;
      }
+
+      if (need_parallelcopy)
+         ctx.info[idx].exec.back().first = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm, exec), ctx.info[idx].exec.back().first);
+      bld.branch(aco_opcode::p_cbranch_nz, bld.hint_vcc(bld.def(s2)), bld.exec(ctx.info[idx].exec.back().first), block->linear_succs[1], block->linear_succs[0]);
      return;
    }
 
@@ -951,8 +1023,8 @@ void add_branch_code(exec_ctx& ctx, Block* block)
 
      Temp current_exec = ctx.info[idx].exec.back().first;
      uint8_t mask_type = ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact);
-      Temp then_mask = bld.tmp(s2);
-      Temp old_exec = bld.sop1(aco_opcode::s_and_saveexec_b64, bld.def(s2), bld.def(s1, scc),
+      Temp then_mask = bld.tmp(bld.lm);
+      Temp old_exec = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc),
                                bld.exec(Definition(then_mask)), cond, bld.exec(current_exec));
 
      ctx.info[idx].exec.back().first = old_exec;
@@ -960,7 +1032,7 @@ void add_branch_code(exec_ctx& ctx, Block* block)
      /* add next current exec to the stack */
      ctx.info[idx].exec.emplace_back(then_mask, mask_type);
 
-      bld.branch(aco_opcode::p_cbranch_z, bld.exec(then_mask), block->linear_succs[1], block->linear_succs[0]);
+      bld.branch(aco_opcode::p_cbranch_z, bld.hint_vcc(bld.def(s2)), bld.exec(then_mask), block->linear_succs[1], block->linear_succs[0]);
      return;
    }
 
@@ -972,13 +1044,13 @@ void add_branch_code(exec_ctx& ctx, Block* block)
      uint8_t mask_type = ctx.info[idx].exec.back().second;
      ctx.info[idx].exec.pop_back();
      Temp orig_exec = ctx.info[idx].exec.back().first;
-      Temp else_mask = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2, exec),
+      Temp else_mask = bld.sop2(Builder::s_andn2, bld.def(bld.lm, exec),
                                 bld.def(s1, scc), orig_exec, bld.exec(then_mask));
 
      /* add next current exec to the stack */
      ctx.info[idx].exec.emplace_back(else_mask, mask_type);
 
-      bld.branch(aco_opcode::p_cbranch_z, bld.exec(else_mask), block->linear_succs[1], block->linear_succs[0]);
+      bld.branch(aco_opcode::p_cbranch_z, bld.hint_vcc(bld.def(s2)), bld.exec(else_mask), block->linear_succs[1], block->linear_succs[0]);
      return;
    }
 
@@ -987,13 +1059,12 @@ void add_branch_code(exec_ctx& ctx, Block* block)
      assert(block->instructions.back()->opcode == aco_opcode::p_branch);
      block->instructions.pop_back();
 
-      Temp current_exec = ctx.info[idx].exec.back().first;
      Temp cond = Temp();
      for (int exec_idx = ctx.info[idx].exec.size() - 2; exec_idx >= 0; exec_idx--) {
        cond = bld.tmp(s1);
        Temp exec_mask = ctx.info[idx].exec[exec_idx].first;
-         exec_mask = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.scc(Definition(cond)),
-                              exec_mask, current_exec);
+         exec_mask = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.scc(Definition(cond)),
+                              exec_mask, break_cond);
        ctx.info[idx].exec[exec_idx].first = exec_mask;
        if (ctx.info[idx].exec[exec_idx].second & mask_type_loop)
           break;
@@ -1004,10 +1075,10 @@ void add_branch_code(exec_ctx& ctx, Block* block)
      unsigned succ_idx = ctx.program->blocks[block->linear_succs[1]].linear_succs[0];
      Block& succ = ctx.program->blocks[succ_idx];
      if (!(succ.kind & block_kind_invert || succ.kind & block_kind_merge)) {
-         ctx.info[idx].exec.back().first = bld.sop1(aco_opcode::s_mov_b64, bld.def(s2, exec), Operand(0u));
+         ctx.info[idx].exec.back().first = bld.sop1(Builder::s_mov, bld.def(bld.lm, exec), Operand(0u));
      }
 
-      bld.branch(aco_opcode::p_cbranch_nz, bld.scc(cond), block->linear_succs[1], block->linear_succs[0]);
+      bld.branch(aco_opcode::p_cbranch_nz, bld.hint_vcc(bld.def(s2)), bld.scc(cond), block->linear_succs[1], block->linear_succs[0]);
      return;
    }
 
@@ -1022,7 +1093,7 @@ void add_branch_code(exec_ctx& ctx, Block* block)
           break;
        cond = bld.tmp(s1);
        Temp exec_mask = ctx.info[idx].exec[exec_idx].first;
-         exec_mask = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.scc(Definition(cond)),
+         exec_mask = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.scc(Definition(cond)),
                               exec_mask, bld.exec(current_exec));
        ctx.info[idx].exec[exec_idx].first = exec_mask;
      }
@@ -1033,10 +1104,10 @@ void add_branch_code(exec_ctx& ctx, Block* block)
      unsigned succ_idx = ctx.program->blocks[block->linear_succs[1]].linear_succs[0];
      Block& succ = ctx.program->blocks[succ_idx];
      if (!(succ.kind & block_kind_invert || succ.kind & block_kind_merge)) {
-         ctx.info[idx].exec.back().first = bld.sop1(aco_opcode::s_mov_b64, bld.def(s2, exec), Operand(0u));
+         ctx.info[idx].exec.back().first = bld.sop1(Builder::s_mov, bld.def(bld.lm, exec), Operand(0u));
      }
 
-      bld.branch(aco_opcode::p_cbranch_nz, bld.scc(cond), block->linear_succs[1], block->linear_succs[0]);
+      bld.branch(aco_opcode::p_cbranch_nz, bld.hint_vcc(bld.def(s2)), bld.scc(cond), block->linear_succs[1], block->linear_succs[0]);
      return;
    }
 }