From 2ea9e59e8d976ec77800d2a20645087b96d1e241 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Tue, 8 Oct 2019 13:40:17 +0100 Subject: [PATCH] aco: move s_andn2_b64 instructions out of the p_discard_if MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit And use a new p_exit_early_if instruction. This fixes some cases where a definition having the same register as an operand causes issues. v2: rename instruction to p_exit_early_if v2: modify the existing instruction instead of creating a new one v3: merge the "i == num - 1" IFs Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann --- src/amd/compiler/aco_insert_exec_mask.cpp | 72 +++++++++---------- .../compiler/aco_instruction_selection.cpp | 1 + src/amd/compiler/aco_lower_to_hw_instr.cpp | 35 +++------ src/amd/compiler/aco_opcodes.py | 1 + src/amd/compiler/aco_scheduler.cpp | 6 ++ 5 files changed, 54 insertions(+), 61 deletions(-) diff --git a/src/amd/compiler/aco_insert_exec_mask.cpp b/src/amd/compiler/aco_insert_exec_mask.cpp index 155c21a5aa4..3f4b48e661f 100644 --- a/src/amd/compiler/aco_insert_exec_mask.cpp +++ b/src/amd/compiler/aco_insert_exec_mask.cpp @@ -657,22 +657,23 @@ void process_instructions(exec_ctx& ctx, Block* block, transition_to_WQM(ctx, bld, block->index); ctx.info[block->index].exec.back().second &= ~mask_type_global; } - unsigned num = ctx.info[block->index].exec.size(); + int num = ctx.info[block->index].exec.size(); assert(num); Operand cond = instr->operands[0]; - instr.reset(create_instruction(aco_opcode::p_discard_if, Format::PSEUDO, num + 1, num + 1)); - for (unsigned i = 0; i < num; i++) { - instr->operands[i] = Operand(ctx.info[block->index].exec[i].first); - if (i == num - 1) - instr->operands[i].setFixed(exec); - Temp new_mask = bld.tmp(s2); - instr->definitions[i] = Definition(new_mask); - ctx.info[block->index].exec[i].first = new_mask; + for (int i = num - 1; i >= 0; i--) { + Instruction *andn2 = bld.sop2(aco_opcode::s_andn2_b64, 
bld.def(s2), bld.def(s1, scc), + ctx.info[block->index].exec[i].first, cond); + if (i == num - 1) { + andn2->operands[0].setFixed(exec); + andn2->definitions[0].setFixed(exec); + } + if (i == 0) { + instr->opcode = aco_opcode::p_exit_early_if; + instr->operands[0] = bld.scc(andn2->definitions[1].getTemp()); + } + ctx.info[block->index].exec[i].first = andn2->definitions[0].getTemp(); } - assert((ctx.info[block->index].exec[0].second & mask_type_wqm) == 0); - instr->definitions[num - 1].setFixed(exec); - instr->operands[num] = cond; - instr->definitions[num] = bld.def(s1, scc); + assert(!ctx.handle_wqm || (ctx.info[block->index].exec[0].second & mask_type_wqm) == 0); } else if (needs == WQM && state != WQM) { transition_to_WQM(ctx, bld, block->index); @@ -738,24 +739,24 @@ void process_instructions(exec_ctx& ctx, Block* block, num = 1; } - for (unsigned i = 0; i < ctx.info[block->index].exec.size() - 1; i++) - num += ctx.info[block->index].exec[i].second & mask_type_exact ? 1 : 0; - instr.reset(create_instruction(aco_opcode::p_discard_if, Format::PSEUDO, num + 1, num + 1)); - int k = 0; - for (unsigned i = 0; k < num; i++) { + num += ctx.info[block->index].exec.size() - 1; + for (int i = num - 1; i >= 0; i--) { if (ctx.info[block->index].exec[i].second & mask_type_exact) { - instr->operands[k] = Operand(ctx.info[block->index].exec[i].first); - Temp new_mask = bld.tmp(s2); - instr->definitions[k] = Definition(new_mask); - if (i == ctx.info[block->index].exec.size() - 1) - instr->definitions[k].setFixed(exec); - k++; - ctx.info[block->index].exec[i].first = new_mask; + Instruction *andn2 = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), + ctx.info[block->index].exec[i].first, cond); + if (i == num - 1) { + andn2->operands[0].setFixed(exec); + andn2->definitions[0].setFixed(exec); + } + if (i == 0) { + instr->opcode = aco_opcode::p_exit_early_if; + instr->operands[0] = bld.scc(andn2->definitions[1].getTemp()); + } + ctx.info[block->index].exec[i].first 
= andn2->definitions[0].getTemp(); + } else { + assert(i != 0); } } - assert(k == num); - instr->definitions[num] = bld.def(s1, scc); - instr->operands[num] = Operand(cond); state = Exact; } else if (instr->opcode == aco_opcode::p_fs_buffer_store_smem) { @@ -878,18 +879,15 @@ void add_branch_code(exec_ctx& ctx, Block* block) bld.exec(Definition(new_exec)), Operand(0u), bld.exec(old_exec)); ctx.info[idx].exec.back().first = new_exec; - aco_ptr discard{create_instruction(aco_opcode::p_discard_if, Format::PSEUDO, num + 1, num + 1)}; - for (unsigned i = 0; i < num; i++) { - discard->operands[i] = Operand(ctx.info[block->index].exec[i].first); - Temp new_mask = bld.tmp(s2); - discard->definitions[i] = Definition(new_mask); - ctx.info[block->index].exec[i].first = new_mask; + for (int i = num - 1; i >= 0; i--) { + Instruction *andn2 = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), + ctx.info[block->index].exec[i].first, cond); + if (i == 0) + bld.pseudo(aco_opcode::p_exit_early_if, bld.scc(andn2->definitions[1].getTemp())); + ctx.info[block->index].exec[i].first = andn2->definitions[0].getTemp(); } assert(!ctx.handle_wqm || (ctx.info[block->index].exec[0].second & mask_type_wqm) == 0); - discard->operands[num] = Operand(cond); - discard->definitions[num] = bld.def(s1, scc); - bld.insert(std::move(discard)); if ((block->kind & (block_kind_break | block_kind_uniform)) == block_kind_break) ctx.info[idx].exec.back().first = cond; bld.insert(std::move(branch)); diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index bba091fd74b..d1849d7b92b 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -3266,6 +3266,7 @@ void visit_discard_if(isel_context *ctx, nir_intrinsic_instr *instr) ctx->program->needs_exact = true; + // TODO: optimize uniform conditions Builder bld(ctx->program, ctx->block); Temp src = as_divergent_bool(ctx, 
get_ssa_temp(ctx, instr->src[0].ssa), false); src = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2)); diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp index 8fd33e47d92..39585111954 100644 --- a/src/amd/compiler/aco_lower_to_hw_instr.cpp +++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -606,15 +606,15 @@ void lower_to_hw_instr(Program* program) handle_operands(copy_operations, &ctx, program->chip_class, pi); break; } - case aco_opcode::p_discard_if: + case aco_opcode::p_exit_early_if: { - bool early_exit = false; - if (block->instructions[j + 1]->opcode != aco_opcode::p_logical_end || - block->instructions[j + 2]->opcode != aco_opcode::s_endpgm) { - early_exit = true; + /* don't bother with an early exit at the end of the program */ + if (block->instructions[j + 1]->opcode == aco_opcode::p_logical_end && + block->instructions[j + 2]->opcode == aco_opcode::s_endpgm) { + break; } - if (early_exit && !discard_block) { + if (!discard_block) { discard_block = program->create_and_insert_block(); block = &program->blocks[i]; @@ -628,26 +628,13 @@ void lower_to_hw_instr(Program* program) bld.reset(&ctx.instructions); } - // TODO: optimize uniform conditions - Definition branch_cond = instr->definitions.back(); - Operand discard_cond = instr->operands.back(); - aco_ptr sop2; - /* backwards, to finally branch on the global exec mask */ - for (int i = instr->operands.size() - 2; i >= 0; i--) { - bld.sop2(aco_opcode::s_andn2_b64, - instr->definitions[i], /* new mask */ - branch_cond, /* scc */ - instr->operands[i], /* old mask */ - discard_cond); - } - - if (early_exit) { - bld.sopp(aco_opcode::s_cbranch_scc0, bld.scc(branch_cond.getTemp()), discard_block->index); + //TODO: exec can be zero here with block_kind_discard - discard_block->linear_preds.push_back(block->index); - block->linear_succs.push_back(discard_block->index); - } + assert(instr->operands[0].physReg() == scc); + 
bld.sopp(aco_opcode::s_cbranch_scc0, instr->operands[0], discard_block->index); + discard_block->linear_preds.push_back(block->index); + block->linear_succs.push_back(discard_block->index); break; } case aco_opcode::p_spill: diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py index a5b4eb9a54e..a358527e60b 100644 --- a/src/amd/compiler/aco_opcodes.py +++ b/src/amd/compiler/aco_opcodes.py @@ -236,6 +236,7 @@ opcode("p_discard_if") opcode("p_load_helper") opcode("p_demote_to_helper") opcode("p_is_helper") +opcode("p_exit_early_if") opcode("p_fs_buffer_store_smem", format=Format.SMEM) diff --git a/src/amd/compiler/aco_scheduler.cpp b/src/amd/compiler/aco_scheduler.cpp index 0cd67a979e0..09076a9a71f 100644 --- a/src/amd/compiler/aco_scheduler.cpp +++ b/src/amd/compiler/aco_scheduler.cpp @@ -220,6 +220,8 @@ void schedule_SMEM(sched_ctx& ctx, Block* block, break; if (candidate->opcode == aco_opcode::p_logical_start) break; + if (candidate->opcode == aco_opcode::p_exit_early_if) + break; if (!can_move_instr(candidate, current, moving_interaction)) break; register_pressure.update(register_demand[candidate_idx]); @@ -445,6 +447,8 @@ void schedule_VMEM(sched_ctx& ctx, Block* block, break; if (candidate->opcode == aco_opcode::p_logical_start) break; + if (candidate->opcode == aco_opcode::p_exit_early_if) + break; if (!can_move_instr(candidate, current, moving_interaction)) break; @@ -665,6 +669,8 @@ void schedule_position_export(sched_ctx& ctx, Block* block, /* break when encountering logical_start or barriers */ if (candidate->opcode == aco_opcode::p_logical_start) break; + if (candidate->opcode == aco_opcode::p_exit_early_if) + break; if (candidate->isVMEM() || candidate->format == Format::SMEM) break; if (!can_move_instr(candidate, current, moving_interaction)) -- 2.30.2