From 8a32f57fff56b3b94f1b5589feba38016f39427c Mon Sep 17 00:00:00 2001
From: =?utf8?q?Timur=20Krist=C3=B3f?=
Date: Thu, 16 Jan 2020 19:32:31 +0100
Subject: [PATCH] aco: Transform uniform bitwise instructions to 32-bit if possible.
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

This allows removing superfluous s_cselect instructions that come from
turning booleans into 64-bit vector conditions.

v2 by Daniel Schürmann:
- Make the code massively simpler

v3 by Timur Kristóf:
- Fix regressions, make it work in wave32 mode
- Eliminate extra moves by not always using the SCC definition
- Use s_absdiff_i32 for uniform XOR
- Skip the transformation for uncommon or invalid instructions

Signed-off-by: Timur Kristóf
Reviewed-by: Daniel Schürmann
Tested-by: Marge Bot
Part-of:
---
Note: this copy differs from the commit named above. to_uniform_bool_instr()
now validates its operands before rewriting the instruction and has a header
comment; the hunk offsets and the diffstat were updated to match. The index
line still carries the blob ids of the original commit. Line breaks and
indentation were reconstructed from a whitespace-collapsed copy, so applying
this patch may require `git apply --ignore-whitespace`.

 src/amd/compiler/aco_optimizer.cpp | 108 ++++++++++++++++++++++++++++++
 1 file changed, 108 insertions(+)

diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp
index 1088d8aec9e..73b9c653a64 100644
--- a/src/amd/compiler/aco_optimizer.cpp
+++ b/src/amd/compiler/aco_optimizer.cpp
@@ -86,6 +86,7 @@ enum Label {
    label_uniform_bitwise = 1 << 23,
    label_scc_invert = 1 << 24,
    label_vcc_hint = 1 << 25,
+   label_scc_needed = 1 << 26,
 };
 
 static constexpr uint32_t instr_labels = label_vec | label_mul | label_mad | label_omod_success | label_clamp_success |
@@ -384,6 +385,16 @@ struct ssa_info {
       return label & label_fcmp;
    }
 
+   void set_scc_needed()
+   {
+      add_label(label_scc_needed);
+   }
+
+   bool is_scc_needed()
+   {
+      return label & label_scc_needed;
+   }
+
    void set_scc_invert(Temp scc_inv)
    {
       add_label(label_scc_invert);
@@ -2458,6 +2469,74 @@ void combine_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr
    }
 }
 
+/* Rewrites a uniform bitwise boolean instruction (s_and/s_or/s_xor, b32 or
+ * b64) into a 32-bit scalar instruction whose operands and first definition
+ * are s1 temporaries.
+ *
+ * Returns true if the instruction was rewritten. Returns false, leaving the
+ * instruction and the use counts untouched, if the opcode is not handled or
+ * if any operand is not a temporary labelled uniform_bool/uniform_bitwise.
+ */
+bool to_uniform_bool_instr(opt_ctx &ctx, aco_ptr<Instruction> &instr)
+{
+   /* Check every operand before modifying anything, so that a rejected
+    * instruction is never left with a rewritten opcode.
+    */
+   for (Operand &op : instr->operands) {
+      if (!op.isTemp())
+         return false;
+
+      if (!ctx.info[op.tempId()].is_uniform_bool() &&
+          !ctx.info[op.tempId()].is_uniform_bitwise())
+         return false;
+   }
+
+   switch (instr->opcode) {
+   case aco_opcode::s_and_b32:
+   case aco_opcode::s_and_b64:
+      instr->opcode = aco_opcode::s_and_b32;
+      break;
+   case aco_opcode::s_or_b32:
+   case aco_opcode::s_or_b64:
+      instr->opcode = aco_opcode::s_or_b32;
+      break;
+   case aco_opcode::s_xor_b32:
+   case aco_opcode::s_xor_b64:
+      /* Assuming both operands hold 0 or 1, |a - b| equals a ^ b. */
+      instr->opcode = aco_opcode::s_absdiff_i32;
+      break;
+   default:
+      /* Don't transform other instructions. They are very unlikely to appear here. */
+      return false;
+   }
+
+   for (Operand &op : instr->operands) {
+      ctx.uses[op.tempId()]--;
+
+      if (ctx.info[op.tempId()].is_uniform_bool()) {
+         /* Just use the uniform boolean temp. */
+         op.setTemp(ctx.info[op.tempId()].temp);
+      } else if (ctx.info[op.tempId()].is_uniform_bitwise()) {
+         /* Use the SCC definition of the predecessor instruction.
+          * This allows the predecessor to get picked up by the same optimization (if it has no divergent users),
+          * and it also makes sure that the current instruction will keep working even if the predecessor won't be transformed.
+          */
+         Instruction *pred_instr = ctx.info[op.tempId()].instr;
+         assert(pred_instr->definitions.size() >= 2);
+         assert(pred_instr->definitions[1].isFixed() && pred_instr->definitions[1].physReg() == scc);
+         op.setTemp(pred_instr->definitions[1].getTemp());
+      } else {
+         unreachable("Invalid operand on uniform bitwise instruction.");
+      }
+
+      ctx.uses[op.tempId()]++;
+   }
+
+   instr->definitions[0].setTemp(Temp(instr->definitions[0].tempId(), s1));
+   assert(instr->operands[0].regClass() == s1);
+   assert(instr->operands[1].regClass() == s1);
+   return true;
+}
 
 void select_instruction(opt_ctx &ctx, aco_ptr<Instruction>& instr)
 {
@@ -2571,10 +2650,39 @@ void select_instruction(opt_ctx &ctx, aco_ptr<Instruction>& instr)
       }
    }
 
+   /* Mark SCC needed, so the uniform boolean transformation won't swap the definitions when it isn't beneficial */
+   if (instr->format == Format::PSEUDO_BRANCH &&
+       instr->operands.size() &&
+       instr->operands[0].isTemp()) {
+      ctx.info[instr->operands[0].tempId()].set_scc_needed();
+      return;
+   } else if ((instr->opcode == aco_opcode::s_cselect_b64 ||
+               instr->opcode == aco_opcode::s_cselect_b32) &&
+              instr->operands[2].isTemp()) {
+      ctx.info[instr->operands[2].tempId()].set_scc_needed();
+   }
+
    /* check for literals */
    if (!instr->isSALU() && !instr->isVALU())
       return;
 
+   /* Transform uniform bitwise boolean operations to 32-bit when there are no divergent uses. */
+   if (instr->definitions.size() &&
+       ctx.uses[instr->definitions[0].tempId()] == 0 &&
+       ctx.info[instr->definitions[0].tempId()].is_uniform_bitwise()) {
+      bool transform_done = to_uniform_bool_instr(ctx, instr);
+
+      if (transform_done && !ctx.info[instr->definitions[1].tempId()].is_scc_needed()) {
+         /* Swap the two definition IDs in order to avoid overusing the SCC. This reduces extra moves generated by RA. */
+         uint32_t def0_id = instr->definitions[0].getTemp().id();
+         uint32_t def1_id = instr->definitions[1].getTemp().id();
+         instr->definitions[0].setTemp(Temp(def1_id, s1));
+         instr->definitions[1].setTemp(Temp(def0_id, s1));
+      }
+
+      return;
+   }
+
    if (instr->isSDWA() || instr->isDPP() || (instr->isVOP3() && ctx.program->chip_class < GFX10))
       return; /* some encodings can't ever take literals */
 
-- 
2.30.2