From 47b0653d5dc7e7e6bed9263254e7436ca8b830cd Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Mon, 6 Jan 2020 15:46:28 +0000 Subject: [PATCH] aco: rework boolean phi pass MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The pass should now create much less linear phis. Removes piles of phis and lots of sgpr spilling from Detroit: Become Human and parallel-rdp. fossil-db (Navi): Totals from 7654 (5.63% of 135946) affected shaders: SGPRs: 796224 -> 787616 (-1.08%); split: -1.08%, +0.00% VGPRs: 576164 -> 572116 (-0.70%); split: -0.70%, +0.00% SpillSGPRs: 147695 -> 52258 (-64.62%) SpillVGPRs: 2167 -> 2102 (-3.00%) CodeSize: 80671680 -> 76240420 (-5.49%); split: -5.50%, +0.01% Scratch: 137216 -> 135168 (-1.49%) MaxWaves: 54235 -> 54707 (+0.87%) Instrs: 15569429 -> 14820569 (-4.81%); split: -4.82%, +0.01% Signed-off-by: Rhys Perry Co-authored-by: Daniel Schürmann Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_lower_phis.cpp | 168 +++++++++++++--------------- 1 file changed, 79 insertions(+), 89 deletions(-) diff --git a/src/amd/compiler/aco_lower_phis.cpp b/src/amd/compiler/aco_lower_phis.cpp index c47f59e7cc6..b90d99ee424 100644 --- a/src/amd/compiler/aco_lower_phis.cpp +++ b/src/amd/compiler/aco_lower_phis.cpp @@ -34,91 +34,56 @@ namespace aco { -struct phi_use { - Block *block; - unsigned phi_def; - - bool operator<(const phi_use& other) const { - return std::make_tuple(block, phi_def) < - std::make_tuple(other.block, other.phi_def); - } -}; - struct ssa_state { - std::map latest; - std::map> phis; + bool needs_init; + uint64_t cur_undef_operands; + + unsigned phi_block_idx; + unsigned loop_nest_depth; + std::map writes; + std::vector latest; }; -Operand get_ssa(Program *program, unsigned block_idx, ssa_state *state) +Operand get_ssa(Program *program, unsigned block_idx, ssa_state *state, bool before_write) { - while (true) { - auto pos = state->latest.find(block_idx); - if (pos != state->latest.end()) - return Operand(Temp(pos->second, program->lane_mask)); - - Block& block = program->blocks[block_idx]; - size_t pred = block.linear_preds.size(); - if (pred == 0) { - return Operand(program->lane_mask); - } else if (pred == 1) { - block_idx = block.linear_preds[0]; - continue; - } else { - unsigned res = program->allocateId(); - state->latest[block_idx] = res; - - aco_ptr phi{create_instruction(aco_opcode::p_linear_phi, Format::PSEUDO, pred, 1)}; - for (unsigned i = 0; i < pred; i++) { - phi->operands[i] = get_ssa(program, block.linear_preds[i], state); - if (phi->operands[i].isTemp()) { - assert(i < 64); - state->phis[phi->operands[i].tempId()][(phi_use){&block, res}] |= (uint64_t)1 << i; - } - } - phi->definitions[0] = Definition(Temp{res, program->lane_mask}); - block.instructions.emplace(block.instructions.begin(), std::move(phi)); - - return Operand(Temp(res, program->lane_mask)); - } + if (!before_write) { + auto it = state->writes.find(block_idx); + if (it != state->writes.end()) + return Operand(Temp(it->second, program->lane_mask)); + if (state->latest[block_idx]) + return Operand(Temp(state->latest[block_idx], program->lane_mask)); } -} -void update_phi(Program *program, ssa_state *state, Block *block, unsigned phi_def, uint64_t operand_mask) { - for (auto& phi : block->instructions) { - if (phi->opcode != aco_opcode::p_phi && phi->opcode != aco_opcode::p_linear_phi) - break; - if (phi->opcode != aco_opcode::p_linear_phi) - continue; - if (phi->definitions[0].tempId() != phi_def) - continue; - assert(ffsll(operand_mask) <= phi->operands.size()); - - uint64_t operands = operand_mask; - while (operands) { - unsigned operand = u_bit_scan64(&operands); - Operand new_operand = get_ssa(program, block->linear_preds[operand], state); - phi->operands[operand] = new_operand; - if (!new_operand.isUndefined()) - state->phis[new_operand.tempId()][(phi_use){block, phi_def}] |= (uint64_t)1 << operand; - } - return; - } - assert(false); -} + Block& block = program->blocks[block_idx]; + size_t pred = block.linear_preds.size(); + if (pred == 0 || block.loop_nest_depth < state->loop_nest_depth) { + return Operand(program->lane_mask); + } else if (block.loop_nest_depth > state->loop_nest_depth) { + Operand op = get_ssa(program, block_idx - 1, state, false); + assert(!state->latest[block_idx]); + state->latest[block_idx] = op.tempId(); + return op; + } else if (pred == 1 || block.kind & block_kind_loop_exit) { + Operand op = get_ssa(program, block.linear_preds[0], state, false); + assert(!state->latest[block_idx]); + state->latest[block_idx] = op.tempId(); + return op; + } else if (block.kind & block_kind_loop_header && + !(program->blocks[state->phi_block_idx].kind & block_kind_loop_exit)) { + return Operand(program->lane_mask); + } else { + unsigned res = program->allocateId(); + assert(!state->latest[block_idx]); + state->latest[block_idx] = res; -Temp write_ssa(Program *program, Block *block, ssa_state *state, unsigned previous) { - unsigned id = program->allocateId(); - state->latest[block->index] = id; + aco_ptr phi{create_instruction(aco_opcode::p_linear_phi, Format::PSEUDO, pred, 1)}; + for (unsigned i = 0; i < pred; i++) + phi->operands[i] = get_ssa(program, block.linear_preds[i], state, false); + phi->definitions[0] = Definition(Temp{res, program->lane_mask}); + block.instructions.emplace(block.instructions.begin(), std::move(phi)); - /* update phis */ - if (previous) { - std::map phis; - phis.swap(state->phis[previous]); - for (auto& phi : phis) - update_phi(program, state, phi.first.block, phi.first.phi_def, phi.second); + return Operand(Temp(res, program->lane_mask)); } - - return {id, program->lane_mask}; } void insert_before_logical_end(Block *block, aco_ptr instr) @@ -136,29 +101,51 @@ void insert_before_logical_end(Block *block, aco_ptr instr) block->instructions.insert(std::prev(it.base()), std::move(instr)); } -void lower_divergent_bool_phi(Program *program, Block *block, aco_ptr& phi) +void lower_divergent_bool_phi(Program *program, ssa_state *state, Block *block, aco_ptr& phi) { Builder bld(program); - ssa_state state; - state.latest[block->index] = phi->definitions[0].tempId(); + state->latest.resize(program->blocks.size()); + + uint64_t undef_operands = 0; + for (unsigned i = 0; i < phi->operands.size(); i++) + undef_operands |= phi->operands[i].isUndefined() << i; + + if (state->needs_init || undef_operands != state->cur_undef_operands || + block->logical_preds.size() > 64) { + /* this only has to be done once per block unless the set of predecessors + * which are undefined changes */ + state->cur_undef_operands = undef_operands; + state->phi_block_idx = block->index; + state->loop_nest_depth = block->loop_nest_depth; + if (block->kind & block_kind_loop_exit) { + state->loop_nest_depth += 1; + } + state->writes.clear(); + state->needs_init = false; + } + std::fill(state->latest.begin(), state->latest.end(), 0); + + for (unsigned i = 0; i < phi->operands.size(); i++) { + if (phi->operands[i].isUndefined()) + continue; + + state->writes[block->logical_preds[i]] = program->allocateId(); + } + for (unsigned i = 0; i < phi->operands.size(); i++) { Block *pred = &program->blocks[block->logical_preds[i]]; if (phi->operands[i].isUndefined()) continue; - assert(phi->operands[i].isTemp()); - Temp phi_src = phi->operands[i].getTemp(); - assert(phi_src.regClass() == bld.lm); - - Operand cur = get_ssa(program, pred->index, &state); + Operand cur = get_ssa(program, pred->index, state, true); assert(cur.regClass() == bld.lm); - Temp new_cur = write_ssa(program, pred, &state, cur.isTemp() ? cur.tempId() : 0); + Temp new_cur = {state->writes.at(pred->index), program->lane_mask}; assert(new_cur.regClass() == bld.lm); if (cur.isUndefined()) { - insert_before_logical_end(pred, bld.sop1(aco_opcode::s_mov_b64, Definition(new_cur), phi_src).get_ptr()); + insert_before_logical_end(pred, bld.sop1(aco_opcode::s_mov_b64, Definition(new_cur), phi->operands[i]).get_ptr()); } else { Temp tmp1 = bld.tmp(bld.lm), tmp2 = bld.tmp(bld.lm); insert_before_logical_end(pred, @@ -166,7 +153,7 @@ void lower_divergent_bool_phi(Program *program, Block *block, aco_ptroperands[i].getTemp(), Operand(exec, bld.lm)).get_ptr()); insert_before_logical_end(pred, bld.sop2(Builder::s_or, Definition(new_cur), bld.def(s1, scc), tmp1, tmp2).get_ptr()); @@ -184,7 +171,7 @@ void lower_divergent_bool_phi(Program *program, Block *block, aco_ptroperands.size() == num_preds); for (unsigned i = 0; i < num_preds; i++) - phi->operands[i] = get_ssa(program, block->linear_preds[i], &state); + phi->operands[i] = get_ssa(program, block->linear_preds[i], state, false); return; } @@ -215,12 +202,15 @@ void lower_subdword_phis(Program *program, Block *block, aco_ptr& p void lower_phis(Program* program) { + ssa_state state; + for (Block& block : program->blocks) { + state.needs_init = true; for (aco_ptr& phi : block.instructions) { if (phi->opcode == aco_opcode::p_phi) { assert(program->wave_size == 64 ? phi->definitions[0].regClass() != s1 : phi->definitions[0].regClass() != s2); if (phi->definitions[0].regClass() == program->lane_mask) - lower_divergent_bool_phi(program, &block, phi); + lower_divergent_bool_phi(program, &state, &block, phi); else if (phi->definitions[0].regClass().is_subdword()) lower_subdword_phis(program, &block, phi); } else if (!is_phi(phi)) { -- 2.30.2