From 0b8216b2cdbcaccfd2bd1a65be6b8ac5654e3067 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Daniel=20Sch=C3=BCrmann?=
Date: Tue, 15 Oct 2019 18:23:52 +0200
Subject: [PATCH] aco: Lower to CSSA

Converting to 'Conventional SSA Form' ensures correctness w.r.t. spilling
of phi nodes. Previously, it was possible for phi operands to have
intersecting live ranges, which meant they could not be spilled to the
same spill slot. For this reason, ACO tried to avoid spilling phis, even
when it would have been beneficial.

This patch implements a conversion pass which is currently only called
if spilling is necessary.

Reviewed-by: Rhys Perry
---
 src/amd/compiler/aco_builder_h.py      |  19 +-
 src/amd/compiler/aco_lower_to_cssa.cpp | 240 +++++++++++++++++++++++++
 src/amd/compiler/aco_spill.cpp         |  49 ++---
 src/amd/compiler/meson.build           |   1 +
 4 files changed, 268 insertions(+), 41 deletions(-)
 create mode 100644 src/amd/compiler/aco_lower_to_cssa.cpp
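
Note (illustration only, not part of the patch): when several phi operands carry
the same value, the pass tries to place a single copy at a common dominator of the
affected predecessors, found by get_lca() in the new file. Below is a minimal
standalone sketch of that dominator walk, with made-up names, assuming that an
immediate dominator always has a smaller block index than the blocks it dominates
(the same property get_lca() relies on when it compares indices):

    #include <cassert>
    #include <vector>

    /* Walk the block with the larger index up the immediate-dominator tree
     * until both indices meet; idom[i] < i for every block except the entry. */
    unsigned common_dominator(const std::vector<unsigned>& idom, unsigned x, unsigned y)
    {
       while (x != y) {
          if (x > y)
             x = idom[x];
          else
             y = idom[y];
       }
       return x;
    }

    int main()
    {
       /* block 0 dominates 1; block 1 dominates 2, 3 and 4 */
       std::vector<unsigned> idom = {0, 0, 1, 1, 1};
       assert(common_dominator(idom, 2, 3) == 1);
       assert(common_dominator(idom, 4, 1) == 1);
       return 0;
    }
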
diff --git a/src/amd/compiler/aco_builder_h.py b/src/amd/compiler/aco_builder_h.py
index 8ee86716a29..7359a6f1ab4 100644
--- a/src/amd/compiler/aco_builder_h.py
+++ b/src/amd/compiler/aco_builder_h.py
@@ -115,10 +115,8 @@ public:
    Program *program;
    bool use_iterator;
-   union {
-      bool forwards; //when use_iterator == true
-      bool start; //when use_iterator == false
-   };
+   bool start; // only when use_iterator == false
+
    std::vector<aco_ptr<Instruction>> *instructions;
    std::vector<aco_ptr<Instruction>>::iterator it;
@@ -148,13 +146,19 @@ public:
       instructions = instrs;
    }
 
+   void reset(std::vector<aco_ptr<Instruction>> *instrs, std::vector<aco_ptr<Instruction>>::iterator instr_it) {
+      use_iterator = true;
+      start = false;
+      instructions = instrs;
+      it = instr_it;
+   }
+
    Result insert(aco_ptr<Instruction> instr) {
       Instruction *instr_ptr = instr.get();
       if (instructions) {
          if (use_iterator) {
             it = instructions->emplace(it, std::move(instr));
-            if (forwards)
-               it = std::next(it);
+            it = std::next(it);
          } else if (!start) {
            instructions->emplace_back(std::move(instr));
         } else {
@@ -168,8 +172,7 @@
      if (instructions) {
         if (use_iterator) {
            it = instructions->emplace(it, aco_ptr<Instruction>(instr));
-           if (forwards)
-              it = std::next(it);
+           it = std::next(it);
         } else if (!start) {
            instructions->emplace_back(aco_ptr<Instruction>(instr));
         } else {
diff --git a/src/amd/compiler/aco_lower_to_cssa.cpp b/src/amd/compiler/aco_lower_to_cssa.cpp
new file mode 100644
index 00000000000..7d26363ec80
--- /dev/null
+++ b/src/amd/compiler/aco_lower_to_cssa.cpp
@@ -0,0 +1,240 @@
+/*
+ * Copyright © 2019 Valve Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+#include <map>
+#include "aco_ir.h"
+#include "aco_builder.h"
+
+/*
+ * Implements an algorithm to lower to Conventional SSA Form (CSSA).
+ * After "Revisiting Out-of-SSA Translation for Correctness, Code Quality, and Efficiency"
+ * by B. Boissinot, A. Darte, F. Rastello, B. Dupont de Dinechin, C. Guillon.
+ *
+ * By lowering the IR to CSSA, the insertion of parallelcopies is separated from
+ * the register coalescing problem. Additionally, correctness is ensured w.r.t. spilling.
+ * The algorithm tries to find beneficial insertion points by checking if a basic block
+ * is empty and if the variable already has a new definition in a dominating block.
+ */
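+
+/* For illustration (hypothetical example, names made up):
+ *
+ *      BB1: %a = ...            BB2: %b = ...
+ *            \                   /
+ *             BB3: %c = p_phi %a, %b
+ *                  ... %a used again ...
+ *
+ * Because %a is still live after the phi, the live ranges of %a and %c
+ * intersect and the two cannot share a spill slot. Lowering to CSSA inserts
+ *      %a' = p_parallelcopy %a
+ * at the end of BB1 (before p_logical_end) and rewrites the phi to read %a',
+ * so the phi and all of its operands can be coalesced / spilled together.
+ */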
+
+
+namespace aco {
+namespace {
+
+typedef std::map<uint32_t, std::vector<std::pair<Definition, Operand>>> phi_info;
+
+struct cssa_ctx {
+   Program* program;
+   live& live_vars;
+   phi_info logical_phi_info;
+   phi_info linear_phi_info;
+
+   cssa_ctx(Program* program, live& live_vars) : program(program), live_vars(live_vars) {}
+};
+
+unsigned get_lca(cssa_ctx& ctx, unsigned x, unsigned y, bool is_logical)
+{
+   while (x != y) {
+      if (x > y) {
+         x = is_logical ? ctx.program->blocks[x].logical_idom : ctx.program->blocks[x].linear_idom;
+      } else {
+         y = is_logical ? ctx.program->blocks[y].logical_idom : ctx.program->blocks[y].linear_idom;
+      }
+   }
+   return x;
+}
+
+bool collect_phi_info(cssa_ctx& ctx)
+{
+   bool progress = false;
+   for (Block& block : ctx.program->blocks) {
+      for (aco_ptr<Instruction>& phi : block.instructions) {
+         bool is_logical;
+         if (phi->opcode == aco_opcode::p_phi)
+            is_logical = true;
+         else if (phi->opcode == aco_opcode::p_linear_phi)
+            is_logical = false;
+         else
+            break;
+
+         /* no CSSA for the exec mask as we don't spill it anyway */
+         if (phi->definitions[0].isFixed() && phi->definitions[0].physReg() == exec)
+            continue;
+         std::vector<unsigned>& preds = is_logical ? block.logical_preds : block.linear_preds;
+
+         /* collect definition's block per Operand */
+         std::vector<unsigned> def_points(phi->operands.size());
+         for (unsigned i = 0; i < phi->operands.size(); i++) {
+            Operand& op = phi->operands[i];
+            if (op.isUndefined()) {
+               def_points[i] = preds[i];
+            } else if (op.isConstant()) {
+               /* in theory, we could insert the definition there... */
+               def_points[i] = 0;
+            } else {
+               assert(op.isTemp());
+               unsigned pred = preds[i];
+               do {
+                  def_points[i] = pred;
+                  pred = is_logical ? ctx.program->blocks[pred].logical_idom :
+                                      ctx.program->blocks[pred].linear_idom;
+               } while (def_points[i] != pred &&
+                        ctx.live_vars.live_out[pred].find(op.getTemp()) != ctx.live_vars.live_out[pred].end());
+            }
+         }
+
+         /* check live-range intersections */
+         for (unsigned i = 0; i < phi->operands.size(); i++) {
+            Operand op = phi->operands[i];
+            if (op.isUndefined())
+               continue;
+            /* check if the operand comes from the exec mask of a predecessor */
+            if (op.isTemp() && op.getTemp() == ctx.program->blocks[preds[i]].live_out_exec)
+               op.setFixed(exec);
+
+            /* calculate the earliest block where we can insert a copy if needed */
+            bool intersects = false;
+            unsigned upper_bound = preds[i];
+            while (def_points[i] < upper_bound) {
+               unsigned new_ub = is_logical ? ctx.program->blocks[upper_bound].logical_idom :
+                                              ctx.program->blocks[upper_bound].linear_idom;
+
+               for (unsigned j = 0; j < phi->operands.size(); j++) {
+                  /* same operands cannot intersect */
+                  if (phi->operands[j].isTemp() && op.getTemp() == phi->operands[j].getTemp())
+                     continue;
+                  /* live-ranges intersect if any other definition point is dominated by the current definition */
+                  intersects |= def_points[j] == new_ub;
+               }
+               if (intersects)
+                  break;
+               else
+                  upper_bound = new_ub;
+            }
+
+            if (!intersects) {
+               /* constants and live-through definitions can get a copy
+                * at their definition point if there is no other intersection */
+               if (op.isConstant() || !op.isKill() || op.regClass().type() != phi->definitions[0].regClass().type()) {
+                  upper_bound = def_points[i];
+               } else {
+                  continue;
+               }
+            }
+
+            progress = true;
+            unsigned insert_block = preds[i];
+
+            /* if the predecessor block is empty, try to insert at the dominator */
+            bool is_empty = (is_logical && ctx.program->blocks[insert_block].instructions.size() == 3) ||
+                            ctx.program->blocks[insert_block].instructions.size() == 1;
+            if (is_empty) {
+               unsigned idom = is_logical ? ctx.program->blocks[insert_block].logical_idom :
+                                            ctx.program->blocks[insert_block].linear_idom;
+               if (idom > upper_bound)
+                  insert_block = idom;
+            }
+
+            /* if other operands share the same value, try to insert at LCA */
+            std::vector<unsigned> indices = {i};
+            for (unsigned j = i + 1; j < phi->operands.size(); j++) {
+               if ((phi->operands[j].isTemp() && op.isTemp() && phi->operands[j].getTemp() == op.getTemp()) ||
+                   (phi->operands[j].isConstant() && op.isConstant() && phi->operands[j].constantValue() == op.constantValue())) {
+                  unsigned lca = get_lca(ctx, insert_block, preds[j], is_logical);
+                  if (lca > upper_bound) {
+                     insert_block = lca;
+                     indices.push_back(j);
+                  }
+               }
+            }
+
+            /* create new temporary and rename operands */
+            Temp new_tmp = Temp{ctx.program->allocateId(), phi->definitions[0].regClass()};
+            if (is_logical)
+               ctx.logical_phi_info[insert_block].emplace_back(Definition(new_tmp), op);
+            else
+               ctx.linear_phi_info[insert_block].emplace_back(Definition(new_tmp), op);
+            for (unsigned index : indices) {
+               phi->operands[index] = Operand(new_tmp);
+               phi->operands[index].setKill(true);
+               def_points[index] = insert_block;
+            }
+         }
+      }
+   }
+   return progress;
+}
+
+void insert_parallelcopies(cssa_ctx& ctx)
+{
+   /* insert the parallelcopies from logical phis before p_logical_end */
+   for (auto&& entry : ctx.logical_phi_info) {
+      Block& block = ctx.program->blocks[entry.first];
+      unsigned idx = block.instructions.size() - 1;
+      while (block.instructions[idx]->opcode != aco_opcode::p_logical_end) {
+         assert(idx > 0);
+         idx--;
+      }
+
+      Builder bld(ctx.program);
+      bld.reset(&block.instructions, std::next(block.instructions.begin(), idx));
+      for (std::pair<Definition, Operand>& pair : entry.second)
+         bld.pseudo(aco_opcode::p_parallelcopy, pair.first, pair.second);
+   }
+
+   /* insert parallelcopies for the linear phis at the end of blocks just before the branch */
+   for (auto&& entry : ctx.linear_phi_info) {
+      Block& block = ctx.program->blocks[entry.first];
+      std::vector<aco_ptr<Instruction>>::iterator it = block.instructions.end();
+      --it;
+      assert((*it)->format == Format::PSEUDO_BRANCH);
+
+      Builder bld(ctx.program);
+      bld.reset(&block.instructions, it);
+      for (std::pair<Definition, Operand>& pair : entry.second)
+         bld.pseudo(aco_opcode::p_parallelcopy, pair.first, pair.second);
+   }
+}
+
+} /* end namespace */
+
+
+void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options)
+{
+   cssa_ctx ctx =
+   {program, live_vars};
+   /* collect information about all interfering phi operands */
+   bool progress = collect_phi_info(ctx);
+
+   if (!progress)
+      return;
+
+   insert_parallelcopies(ctx);
+
+   /* update live variable information */
+   live_vars = live_var_analysis(program, options);
+}
+}
+
diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp
index fefa8a8221b..d17e02d6165 100644
--- a/src/amd/compiler/aco_spill.cpp
+++ b/src/amd/compiler/aco_spill.cpp
@@ -541,9 +541,10 @@ RegisterDemand init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_id
          bool spill = true;
          for (unsigned i = 0; i < phi->operands.size(); i++) {
-            if (!phi->operands[i].isTemp())
-               spill = false;
-            else if (ctx.spills_exit[preds[i]].find(phi->operands[i].getTemp()) == ctx.spills_exit[preds[i]].end())
+            if (phi->operands[i].isUndefined())
+               continue;
+            assert(phi->operands[i].isTemp());
+            if (ctx.spills_exit[preds[i]].find(phi->operands[i].getTemp()) == ctx.spills_exit[preds[i]].end())
                spill = false;
             else
                partial_spills.insert(phi->definitions[0].getTemp());
@@ -720,43 +721,23 @@ void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx)
          uint32_t def_spill_id = ctx.spills_entry[block_idx][phi->definitions[0].getTemp()];
 
          for (unsigned i = 0; i < phi->operands.size(); i++) {
-            unsigned pred_idx = preds[i];
-
-            /* we have to spill constants to the same memory address */
-            if (phi->operands[i].isConstant()) {
-               uint32_t spill_id = ctx.allocate_spill_id(phi->definitions[0].regClass());
-               for (std::pair<Temp, uint32_t> pair : ctx.spills_exit[pred_idx]) {
-                  ctx.interferences[def_spill_id].second.emplace(pair.second);
-                  ctx.interferences[pair.second].second.emplace(def_spill_id);
-               }
-               ctx.affinities.emplace_back(std::pair<uint32_t, uint32_t>{def_spill_id, spill_id});
-
-               aco_ptr<Pseudo_instruction> spill{create_instruction<Pseudo_instruction>(aco_opcode::p_spill, Format::PSEUDO, 2, 0)};
-               spill->operands[0] = phi->operands[i];
-               spill->operands[1] = Operand(spill_id);
-               Block& pred = ctx.program->blocks[pred_idx];
-               unsigned idx = pred.instructions.size();
-               do {
-                  assert(idx != 0);
-                  idx--;
-               } while (phi->opcode == aco_opcode::p_phi && pred.instructions[idx]->opcode != aco_opcode::p_logical_end);
-               std::vector<aco_ptr<Instruction>>::iterator it = std::next(pred.instructions.begin(), idx);
-               pred.instructions.insert(it, std::move(spill));
-               continue;
-            }
-
-            if (!phi->operands[i].isTemp())
+            if (phi->operands[i].isUndefined())
                continue;
 
+            unsigned pred_idx = preds[i];
+            assert(phi->operands[i].isTemp() && phi->operands[i].isKill());
+            Temp var = phi->operands[i].getTemp();
+
+            /* build interferences between the phi def and all spilled variables at the predecessor blocks */
             for (std::pair<Temp, uint32_t> pair : ctx.spills_exit[pred_idx]) {
-               if (phi->operands[i].getTemp() == pair.first)
+               if (var == pair.first)
                   continue;
                ctx.interferences[def_spill_id].second.emplace(pair.second);
                ctx.interferences[pair.second].second.emplace(def_spill_id);
             }
 
-            /* variable is already spilled at predecessor */
-            std::map<Temp, uint32_t>::iterator spilled = ctx.spills_exit[pred_idx].find(phi->operands[i].getTemp());
+            /* check if variable is already spilled at predecessor */
+            std::map<Temp, uint32_t>::iterator spilled = ctx.spills_exit[pred_idx].find(var);
             if (spilled != ctx.spills_exit[pred_idx].end()) {
                if (spilled->second != def_spill_id)
                   ctx.affinities.emplace_back(std::pair<uint32_t, uint32_t>{def_spill_id, spilled->second});
@@ -764,7 +745,6 @@ void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx)
            }
 
            /* rename if necessary */
-           Temp var = phi->operands[i].getTemp();
           std::map<Temp, Temp>::iterator rename_it = ctx.renames[pred_idx].find(var);
           if (rename_it != ctx.renames[pred_idx].end()) {
              var = rename_it->second;
@@ -813,7 +793,7 @@ void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx)
            continue;
         }
 
-        /* variable is dead at predecessor, it must be from a phi: this works because of CSSA form */ // FIXME: lower_to_cssa()
+        /* variable is dead at predecessor, it must be from a phi: this works because of CSSA form */
        if (ctx.next_use_distances_end[pred_idx].find(pair.first) == ctx.next_use_distances_end[pred_idx].end())
           continue;
@@ -1567,6 +1547,9 @@ void spill(Program* program, live& live_vars, const struct radv_nir_compiler_opt
    if (program->num_waves >= 6)
       return;
 
+   /* lower to CSSA before spilling to ensure correctness w.r.t. phis */
+   lower_to_cssa(program, live_vars, options);
+
    /* else, we check if we can improve things a bit */
 
    /* calculate target register demand */
diff --git a/src/amd/compiler/meson.build b/src/amd/compiler/meson.build
index 4b22b82eedd..bce5093651b 100644
--- a/src/amd/compiler/meson.build
+++ b/src/amd/compiler/meson.build
@@ -69,6 +69,7 @@ libaco_files = files(
   'aco_register_allocation.cpp',
   'aco_live_var_analysis.cpp',
   'aco_lower_bool_phis.cpp',
+  'aco_lower_to_cssa.cpp',
   'aco_lower_to_hw_instr.cpp',
   'aco_optimizer.cpp',
   'aco_opt_value_numbering.cpp',
-- 
2.30.2