--- /dev/null
+/*
+ * Copyright © 2019 Valve Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+#include <map>
+#include "aco_ir.h"
+#include "aco_builder.h"
+
+/*
+ * Implements an algorithm to lower to Concentional SSA Form (CSSA).
+ * After "Revisiting Out-of-SSA Translation for Correctness, CodeQuality, and Efficiency"
+ * by B. Boissinot, A. Darte, F. Rastello, B. Dupont de Dinechin, C. Guillon,
+ *
+ * By lowering the IR to CSSA, the insertion of parallelcopies is separated from
+ * the register coalescing problem. Additionally, correctness is ensured w.r.t. spilling.
+ * The algorithm tries to find beneficial insertion points by checking if a basic block
+ * is empty and if the variable already has a new definition in a dominating block.
+ */
+
+
+namespace aco {
+namespace {
+
+typedef std::map<uint32_t, std::vector<std::pair<Definition, Operand>>> phi_info;
+
+struct cssa_ctx {
+ Program* program;
+ live& live_vars;
+ phi_info logical_phi_info;
+ phi_info linear_phi_info;
+
+ cssa_ctx(Program* program, live& live_vars) : program(program), live_vars(live_vars) {}
+};
+
+unsigned get_lca(cssa_ctx& ctx, unsigned x, unsigned y, bool is_logical)
+{
+ while (x != y) {
+ if (x > y) {
+ x = is_logical ? ctx.program->blocks[x].logical_idom : ctx.program->blocks[x].linear_idom;
+ } else {
+ y = is_logical ? ctx.program->blocks[y].logical_idom : ctx.program->blocks[y].linear_idom;
+ }
+ }
+ return x;
+}
+
+bool collect_phi_info(cssa_ctx& ctx)
+{
+ bool progress = false;
+ for (Block& block : ctx.program->blocks) {
+ for (aco_ptr<Instruction>& phi : block.instructions) {
+ bool is_logical;
+ if (phi->opcode == aco_opcode::p_phi)
+ is_logical = true;
+ else if (phi->opcode == aco_opcode::p_linear_phi)
+ is_logical = false;
+ else
+ break;
+
+ /* no CSSA for the exec mask as we don't spill it anyway */
+ if (phi->definitions[0].isFixed() && phi->definitions[0].physReg() == exec)
+ continue;
+ std::vector<unsigned>& preds = is_logical ? block.logical_preds : block.linear_preds;
+
+ /* collect definition's block per Operand */
+ std::vector<unsigned> def_points(phi->operands.size());
+ for (unsigned i = 0; i < phi->operands.size(); i++) {
+ Operand& op = phi->operands[i];
+ if (op.isUndefined()) {
+ def_points[i] = preds[i];
+ } else if (op.isConstant()) {
+ /* in theory, we could insert the definition there... */
+ def_points[i] = 0;
+ } else {
+ assert(op.isTemp());
+ unsigned pred = preds[i];
+ do {
+ def_points[i] = pred;
+ pred = is_logical ?
+ ctx.program->blocks[pred].logical_idom :
+ ctx.program->blocks[pred].linear_idom;
+ } while (def_points[i] != pred &&
+ ctx.live_vars.live_out[pred].find(op.getTemp()) != ctx.live_vars.live_out[pred].end());
+ }
+ }
+
+ /* check live-range intersections */
+ for (unsigned i = 0; i < phi->operands.size(); i++) {
+ Operand op = phi->operands[i];
+ if (op.isUndefined())
+ continue;
+ /* check if the operand comes from the exec mask of a predecessor */
+ if (op.isTemp() && op.getTemp() == ctx.program->blocks[preds[i]].live_out_exec)
+ op.setFixed(exec);
+
+ /* calculate the earliest block where we can insert a copy if needed */
+ bool intersects = false;
+ unsigned upper_bound = preds[i];
+ while (def_points[i] < upper_bound) {
+ unsigned new_ub = is_logical ?
+ ctx.program->blocks[upper_bound].logical_idom :
+ ctx.program->blocks[upper_bound].linear_idom;
+
+ for (unsigned j = 0; j < phi->operands.size(); j++) {
+ /* same operands cannot intersect */
+ if (phi->operands[j].isTemp() && op.getTemp() == phi->operands[j].getTemp())
+ continue;
+ /* live-ranges intersect if any other definition point is dominated by the current definition */
+ intersects |= def_points[j] == new_ub;
+ }
+ if (intersects)
+ break;
+ else
+ upper_bound = new_ub;
+ }
+
+ if (!intersects) {
+ /* constants and live-through definitions can get a copy
+ * at their definition point if there is no other intersection */
+ if (op.isConstant() || !op.isKill() || op.regClass().type() != phi->definitions[0].regClass().type()) {
+ upper_bound = def_points[i];
+ } else {
+ continue;
+ }
+ }
+
+ progress = true;
+ unsigned insert_block = preds[i];
+
+ /* if the predecessor block is empty, try to insert at the dominator */
+ bool is_empty = (is_logical && ctx.program->blocks[insert_block].instructions.size() == 3) ||
+ ctx.program->blocks[insert_block].instructions.size() == 1;
+ if (is_empty) {
+ unsigned idom = is_logical ?
+ ctx.program->blocks[insert_block].logical_idom :
+ ctx.program->blocks[insert_block].linear_idom;
+ if (idom > upper_bound)
+ insert_block = idom;
+ }
+
+ /* if other operands share the same value, try to insert at LCA */
+ std::vector<unsigned> indices = {i};
+ for (unsigned j = i + 1; j < phi->operands.size(); j++) {
+ if ((phi->operands[j].isTemp() && op.isTemp() && phi->operands[j].getTemp() == op.getTemp()) ||
+ (phi->operands[j].isConstant() && op.isConstant() && phi->operands[j].constantValue() == op.constantValue())) {
+ unsigned lca = get_lca(ctx, insert_block, preds[j], is_logical);
+ if (lca > upper_bound) {
+ insert_block = lca;
+ indices.push_back(j);
+ }
+ }
+ }
+
+ /* create new temporary and rename operands */
+ Temp new_tmp = Temp{ctx.program->allocateId(), phi->definitions[0].regClass()};
+ if (is_logical)
+ ctx.logical_phi_info[insert_block].emplace_back(Definition(new_tmp), op);
+ else
+ ctx.linear_phi_info[insert_block].emplace_back(Definition(new_tmp), op);
+ for (unsigned index : indices) {
+ phi->operands[index] = Operand(new_tmp);
+ phi->operands[index].setKill(true);
+ def_points[index] = insert_block;
+ }
+ }
+ }
+ }
+ return progress;
+}
+
+void insert_parallelcopies(cssa_ctx& ctx)
+{
+ /* insert the parallelcopies from logical phis before p_logical_end */
+ for (auto&& entry : ctx.logical_phi_info) {
+ Block& block = ctx.program->blocks[entry.first];
+ unsigned idx = block.instructions.size() - 1;
+ while (block.instructions[idx]->opcode != aco_opcode::p_logical_end) {
+ assert(idx > 0);
+ idx--;
+ }
+
+ Builder bld(ctx.program);
+ bld.reset(&block.instructions, std::next(block.instructions.begin(), idx));
+ for (std::pair<Definition, Operand>& pair : entry.second)
+ bld.pseudo(aco_opcode::p_parallelcopy, pair.first, pair.second);
+ }
+
+ /* insert parallelcopies for the linear phis at the end of blocks just before the branch */
+ for (auto&& entry : ctx.linear_phi_info) {
+ Block& block = ctx.program->blocks[entry.first];
+ std::vector<aco_ptr<Instruction>>::iterator it = block.instructions.end();
+ --it;
+ assert((*it)->format == Format::PSEUDO_BRANCH);
+
+ Builder bld(ctx.program);
+ bld.reset(&block.instructions, it);
+ for (std::pair<Definition, Operand>& pair : entry.second)
+ bld.pseudo(aco_opcode::p_parallelcopy, pair.first, pair.second);
+ }
+}
+
+} /* end namespace */
+
+
+void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options)
+{
+ cssa_ctx ctx = {program, live_vars};
+ /* collect information about all interfering phi operands */
+ bool progress = collect_phi_info(ctx);
+
+ if (!progress)
+ return;
+
+ insert_parallelcopies(ctx);
+
+ /* update live variable information */
+ live_vars = live_var_analysis(program, options);
+}
+}
+
bool spill = true;
for (unsigned i = 0; i < phi->operands.size(); i++) {
- if (!phi->operands[i].isTemp())
- spill = false;
- else if (ctx.spills_exit[preds[i]].find(phi->operands[i].getTemp()) == ctx.spills_exit[preds[i]].end())
+ if (phi->operands[i].isUndefined())
+ continue;
+ assert(phi->operands[i].isTemp());
+ if (ctx.spills_exit[preds[i]].find(phi->operands[i].getTemp()) == ctx.spills_exit[preds[i]].end())
spill = false;
else
partial_spills.insert(phi->definitions[0].getTemp());
uint32_t def_spill_id = ctx.spills_entry[block_idx][phi->definitions[0].getTemp()];
for (unsigned i = 0; i < phi->operands.size(); i++) {
- unsigned pred_idx = preds[i];
-
- /* we have to spill constants to the same memory address */
- if (phi->operands[i].isConstant()) {
- uint32_t spill_id = ctx.allocate_spill_id(phi->definitions[0].regClass());
- for (std::pair<Temp, uint32_t> pair : ctx.spills_exit[pred_idx]) {
- ctx.interferences[def_spill_id].second.emplace(pair.second);
- ctx.interferences[pair.second].second.emplace(def_spill_id);
- }
- ctx.affinities.emplace_back(std::pair<uint32_t, uint32_t>{def_spill_id, spill_id});
-
- aco_ptr<Pseudo_instruction> spill{create_instruction<Pseudo_instruction>(aco_opcode::p_spill, Format::PSEUDO, 2, 0)};
- spill->operands[0] = phi->operands[i];
- spill->operands[1] = Operand(spill_id);
- Block& pred = ctx.program->blocks[pred_idx];
- unsigned idx = pred.instructions.size();
- do {
- assert(idx != 0);
- idx--;
- } while (phi->opcode == aco_opcode::p_phi && pred.instructions[idx]->opcode != aco_opcode::p_logical_end);
- std::vector<aco_ptr<Instruction>>::iterator it = std::next(pred.instructions.begin(), idx);
- pred.instructions.insert(it, std::move(spill));
- continue;
- }
- if (!phi->operands[i].isTemp())
+ if (phi->operands[i].isUndefined())
continue;
+ unsigned pred_idx = preds[i];
+ assert(phi->operands[i].isTemp() && phi->operands[i].isKill());
+ Temp var = phi->operands[i].getTemp();
+
/* build interferences between the phi def and all spilled variables at the predecessor blocks */
for (std::pair<Temp, uint32_t> pair : ctx.spills_exit[pred_idx]) {
- if (phi->operands[i].getTemp() == pair.first)
+ if (var == pair.first)
continue;
ctx.interferences[def_spill_id].second.emplace(pair.second);
ctx.interferences[pair.second].second.emplace(def_spill_id);
}
- /* variable is already spilled at predecessor */
- std::map<Temp, uint32_t>::iterator spilled = ctx.spills_exit[pred_idx].find(phi->operands[i].getTemp());
+ /* check if variable is already spilled at predecessor */
+ std::map<Temp, uint32_t>::iterator spilled = ctx.spills_exit[pred_idx].find(var);
if (spilled != ctx.spills_exit[pred_idx].end()) {
if (spilled->second != def_spill_id)
ctx.affinities.emplace_back(std::pair<uint32_t, uint32_t>{def_spill_id, spilled->second});
}
/* rename if necessary */
- Temp var = phi->operands[i].getTemp();
std::map<Temp, Temp>::iterator rename_it = ctx.renames[pred_idx].find(var);
if (rename_it != ctx.renames[pred_idx].end()) {
var = rename_it->second;
continue;
}
- /* variable is dead at predecessor, it must be from a phi: this works because of CSSA form */ // FIXME: lower_to_cssa()
+ /* variable is dead at predecessor, it must be from a phi: this works because of CSSA form */
if (ctx.next_use_distances_end[pred_idx].find(pair.first) == ctx.next_use_distances_end[pred_idx].end())
continue;
if (program->num_waves >= 6)
return;
+ /* lower to CSSA before spilling to ensure correctness w.r.t. phis */
+ lower_to_cssa(program, live_vars, options);
+
/* else, we check if we can improve things a bit */
/* calculate target register demand */