From: Daniel Schürmann <daniel@schuermann.dev>
Date: Tue, 15 Oct 2019 16:23:52 +0000 (+0200)
Subject: aco: Lower to CSSA
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=0b8216b2cdbcaccfd2bd1a65be6b8ac5654e3067;p=mesa.git

aco: Lower to CSSA

Converting to 'Conventional SSA Form' ensures correctness w.r.t. spilling of phi nodes.
Previously, it was possible that phi operands have intersecting live-ranges, and thus,
couldn't get spilled to the same spilling slot. For this reason, ACO tried to avoid to
spill phis, even if it was beneficial.
This patch implements a conversion pass which is currently only called if spilling is necessary.

Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
---

diff --git a/src/amd/compiler/aco_builder_h.py b/src/amd/compiler/aco_builder_h.py
index 8ee86716a29..7359a6f1ab4 100644
--- a/src/amd/compiler/aco_builder_h.py
+++ b/src/amd/compiler/aco_builder_h.py
@@ -115,10 +115,8 @@ public:
 
    Program *program;
    bool use_iterator;
-   union {
-   bool forwards; //when use_iterator == true
-   bool start; //when use_iterator == false
-   };
+   bool start; // only when use_iterator == false
+
    std::vector<aco_ptr<Instruction>> *instructions;
    std::vector<aco_ptr<Instruction>>::iterator it;
 
@@ -148,13 +146,19 @@ public:
       instructions = instrs;
    }
 
+   void reset(std::vector<aco_ptr<Instruction>> *instrs, std::vector<aco_ptr<Instruction>>::iterator instr_it) {
+      use_iterator = true;
+      start = false;
+      instructions = instrs;
+      it = instr_it;
+   }
+
    Result insert(aco_ptr<Instruction> instr) {
       Instruction *instr_ptr = instr.get();
       if (instructions) {
          if (use_iterator) {
             it = instructions->emplace(it, std::move(instr));
-            if (forwards)
-               it = std::next(it);
+            it = std::next(it);
          } else if (!start) {
             instructions->emplace_back(std::move(instr));
          } else {
@@ -168,8 +172,7 @@ public:
       if (instructions) {
          if (use_iterator) {
             it = instructions->emplace(it, aco_ptr<Instruction>(instr));
-            if (forwards)
-               it = std::next(it);
+            it = std::next(it);
          } else if (!start) {
             instructions->emplace_back(aco_ptr<Instruction>(instr));
          } else {
diff --git a/src/amd/compiler/aco_lower_to_cssa.cpp b/src/amd/compiler/aco_lower_to_cssa.cpp
new file mode 100644
index 00000000000..7d26363ec80
--- /dev/null
+++ b/src/amd/compiler/aco_lower_to_cssa.cpp
@@ -0,0 +1,240 @@
+/*
+ * Copyright Â© 2019 Valve Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+#include <map>
+#include "aco_ir.h"
+#include "aco_builder.h"
+
+/*
+ * Implements an algorithm to lower to Concentional SSA Form (CSSA).
+ * After "Revisiting Out-of-SSA Translation for Correctness, CodeQuality, and Efficiency"
+ * by B. Boissinot, A. Darte, F. Rastello, B. Dupont de Dinechin, C. Guillon,
+ *
+ * By lowering the IR to CSSA, the insertion of parallelcopies is separated from
+ * the register coalescing problem. Additionally, correctness is ensured w.r.t. spilling.
+ * The algorithm tries to find beneficial insertion points by checking if a basic block
+ * is empty and if the variable already has a new definition in a dominating block.
+ */
+
+
+namespace aco {
+namespace {
+
+typedef std::map<uint32_t, std::vector<std::pair<Definition, Operand>>> phi_info;
+
+struct cssa_ctx {
+   Program* program;
+   live& live_vars;
+   phi_info logical_phi_info;
+   phi_info linear_phi_info;
+
+   cssa_ctx(Program* program, live& live_vars) : program(program), live_vars(live_vars) {}
+};
+
+unsigned get_lca(cssa_ctx& ctx, unsigned x, unsigned y, bool is_logical)
+{
+   while (x != y) {
+      if (x > y) {
+         x = is_logical ? ctx.program->blocks[x].logical_idom : ctx.program->blocks[x].linear_idom;
+      } else {
+         y = is_logical ? ctx.program->blocks[y].logical_idom : ctx.program->blocks[y].linear_idom;
+      }
+   }
+   return x;
+}
+
+bool collect_phi_info(cssa_ctx& ctx)
+{
+   bool progress = false;
+   for (Block& block : ctx.program->blocks) {
+      for (aco_ptr<Instruction>& phi : block.instructions) {
+         bool is_logical;
+         if (phi->opcode == aco_opcode::p_phi)
+            is_logical = true;
+         else if (phi->opcode == aco_opcode::p_linear_phi)
+            is_logical = false;
+         else
+            break;
+
+         /* no CSSA for the exec mask as we don't spill it anyway */
+         if (phi->definitions[0].isFixed() && phi->definitions[0].physReg() == exec)
+            continue;
+         std::vector<unsigned>& preds = is_logical ? block.logical_preds : block.linear_preds;
+
+         /* collect definition's block per Operand */
+         std::vector<unsigned> def_points(phi->operands.size());
+         for (unsigned i = 0; i < phi->operands.size(); i++) {
+            Operand& op = phi->operands[i];
+            if (op.isUndefined()) {
+               def_points[i] = preds[i];
+            } else if (op.isConstant()) {
+               /* in theory, we could insert the definition there... */
+               def_points[i] = 0;
+            } else {
+               assert(op.isTemp());
+               unsigned pred = preds[i];
+               do {
+                  def_points[i] = pred;
+                  pred = is_logical ?
+                         ctx.program->blocks[pred].logical_idom :
+                         ctx.program->blocks[pred].linear_idom;
+               } while (def_points[i] != pred &&
+                        ctx.live_vars.live_out[pred].find(op.getTemp()) != ctx.live_vars.live_out[pred].end());
+            }
+         }
+
+         /* check live-range intersections */
+         for (unsigned i = 0; i < phi->operands.size(); i++) {
+            Operand op = phi->operands[i];
+            if (op.isUndefined())
+               continue;
+            /* check if the operand comes from the exec mask of a predecessor */
+            if (op.isTemp() && op.getTemp() == ctx.program->blocks[preds[i]].live_out_exec)
+               op.setFixed(exec);
+
+            /* calculate the earliest block where we can insert a copy if needed */
+            bool intersects = false;
+            unsigned upper_bound = preds[i];
+            while (def_points[i] < upper_bound) {
+               unsigned new_ub = is_logical ?
+                                 ctx.program->blocks[upper_bound].logical_idom :
+                                 ctx.program->blocks[upper_bound].linear_idom;
+
+               for (unsigned j = 0; j < phi->operands.size(); j++) {
+                  /* same operands cannot intersect */
+                  if (phi->operands[j].isTemp() && op.getTemp() == phi->operands[j].getTemp())
+                     continue;
+                  /* live-ranges intersect if any other definition point is dominated by the current definition */
+                  intersects |= def_points[j] == new_ub;
+               }
+               if (intersects)
+                  break;
+               else
+                  upper_bound = new_ub;
+            }
+
+            if (!intersects) {
+               /* constants and live-through definitions can get a copy
+                * at their definition point if there is no other intersection */
+               if (op.isConstant() || !op.isKill() || op.regClass().type() != phi->definitions[0].regClass().type()) {
+                  upper_bound = def_points[i];
+               } else {
+                  continue;
+               }
+            }
+
+            progress = true;
+            unsigned insert_block = preds[i];
+
+            /* if the predecessor block is empty, try to insert at the dominator */
+            bool is_empty = (is_logical && ctx.program->blocks[insert_block].instructions.size() == 3) ||
+                            ctx.program->blocks[insert_block].instructions.size() == 1;
+            if (is_empty) {
+               unsigned idom = is_logical ?
+                               ctx.program->blocks[insert_block].logical_idom :
+                               ctx.program->blocks[insert_block].linear_idom;
+               if (idom > upper_bound)
+                  insert_block = idom;
+            }
+
+            /* if other operands share the same value, try to insert at LCA */
+            std::vector<unsigned> indices = {i};
+            for (unsigned j = i + 1; j < phi->operands.size(); j++) {
+               if ((phi->operands[j].isTemp() && op.isTemp() && phi->operands[j].getTemp() == op.getTemp()) ||
+                   (phi->operands[j].isConstant() && op.isConstant() && phi->operands[j].constantValue() == op.constantValue())) {
+                  unsigned lca = get_lca(ctx, insert_block, preds[j], is_logical);
+                  if (lca > upper_bound) {
+                     insert_block = lca;
+                     indices.push_back(j);
+                  }
+               }
+            }
+
+            /* create new temporary and rename operands */
+            Temp new_tmp = Temp{ctx.program->allocateId(), phi->definitions[0].regClass()};
+            if (is_logical)
+               ctx.logical_phi_info[insert_block].emplace_back(Definition(new_tmp), op);
+            else
+               ctx.linear_phi_info[insert_block].emplace_back(Definition(new_tmp), op);
+            for (unsigned index : indices) {
+               phi->operands[index] = Operand(new_tmp);
+               phi->operands[index].setKill(true);
+               def_points[index] = insert_block;
+            }
+         }
+      }
+   }
+   return progress;
+}
+
+void insert_parallelcopies(cssa_ctx& ctx)
+{
+   /* insert the parallelcopies from logical phis before p_logical_end */
+   for (auto&& entry : ctx.logical_phi_info) {
+      Block& block = ctx.program->blocks[entry.first];
+      unsigned idx = block.instructions.size() - 1;
+      while (block.instructions[idx]->opcode != aco_opcode::p_logical_end) {
+         assert(idx > 0);
+         idx--;
+      }
+
+      Builder bld(ctx.program);
+      bld.reset(&block.instructions, std::next(block.instructions.begin(), idx));
+      for (std::pair<Definition, Operand>& pair : entry.second)
+         bld.pseudo(aco_opcode::p_parallelcopy, pair.first, pair.second);
+   }
+
+   /* insert parallelcopies for the linear phis at the end of blocks just before the branch */
+   for (auto&& entry : ctx.linear_phi_info) {
+      Block& block = ctx.program->blocks[entry.first];
+      std::vector<aco_ptr<Instruction>>::iterator it = block.instructions.end();
+      --it;
+      assert((*it)->format == Format::PSEUDO_BRANCH);
+
+      Builder bld(ctx.program);
+      bld.reset(&block.instructions, it);
+      for (std::pair<Definition, Operand>& pair : entry.second)
+         bld.pseudo(aco_opcode::p_parallelcopy, pair.first, pair.second);
+   }
+}
+
+} /* end namespace */
+
+
+void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options)
+{
+   cssa_ctx ctx = {program, live_vars};
+   /* collect information about all interfering phi operands */
+   bool progress = collect_phi_info(ctx);
+
+   if (!progress)
+      return;
+
+   insert_parallelcopies(ctx);
+
+   /* update live variable information */
+   live_vars = live_var_analysis(program, options);
+}
+}
+
diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp
index fefa8a8221b..d17e02d6165 100644
--- a/src/amd/compiler/aco_spill.cpp
+++ b/src/amd/compiler/aco_spill.cpp
@@ -541,9 +541,10 @@ RegisterDemand init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_id
       bool spill = true;
 
       for (unsigned i = 0; i < phi->operands.size(); i++) {
-         if (!phi->operands[i].isTemp())
-            spill = false;
-         else if (ctx.spills_exit[preds[i]].find(phi->operands[i].getTemp()) == ctx.spills_exit[preds[i]].end())
+         if (phi->operands[i].isUndefined())
+            continue;
+         assert(phi->operands[i].isTemp());
+         if (ctx.spills_exit[preds[i]].find(phi->operands[i].getTemp()) == ctx.spills_exit[preds[i]].end())
             spill = false;
          else
             partial_spills.insert(phi->definitions[0].getTemp());
@@ -720,43 +721,23 @@ void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx)
       uint32_t def_spill_id = ctx.spills_entry[block_idx][phi->definitions[0].getTemp()];
 
       for (unsigned i = 0; i < phi->operands.size(); i++) {
-         unsigned pred_idx = preds[i];
-
-         /* we have to spill constants to the same memory address */
-         if (phi->operands[i].isConstant()) {
-            uint32_t spill_id = ctx.allocate_spill_id(phi->definitions[0].regClass());
-            for (std::pair<Temp, uint32_t> pair : ctx.spills_exit[pred_idx]) {
-               ctx.interferences[def_spill_id].second.emplace(pair.second);
-               ctx.interferences[pair.second].second.emplace(def_spill_id);
-            }
-            ctx.affinities.emplace_back(std::pair<uint32_t, uint32_t>{def_spill_id, spill_id});
-
-            aco_ptr<Pseudo_instruction> spill{create_instruction<Pseudo_instruction>(aco_opcode::p_spill, Format::PSEUDO, 2, 0)};
-            spill->operands[0] = phi->operands[i];
-            spill->operands[1] = Operand(spill_id);
-            Block& pred = ctx.program->blocks[pred_idx];
-            unsigned idx = pred.instructions.size();
-            do {
-               assert(idx != 0);
-               idx--;
-            } while (phi->opcode == aco_opcode::p_phi && pred.instructions[idx]->opcode != aco_opcode::p_logical_end);
-            std::vector<aco_ptr<Instruction>>::iterator it = std::next(pred.instructions.begin(), idx);
-            pred.instructions.insert(it, std::move(spill));
-            continue;
-         }
-         if (!phi->operands[i].isTemp())
+         if (phi->operands[i].isUndefined())
             continue;
 
+         unsigned pred_idx = preds[i];
+         assert(phi->operands[i].isTemp() && phi->operands[i].isKill());
+         Temp var = phi->operands[i].getTemp();
+
          /* build interferences between the phi def and all spilled variables at the predecessor blocks */
          for (std::pair<Temp, uint32_t> pair : ctx.spills_exit[pred_idx]) {
-            if (phi->operands[i].getTemp() == pair.first)
+            if (var == pair.first)
                continue;
             ctx.interferences[def_spill_id].second.emplace(pair.second);
             ctx.interferences[pair.second].second.emplace(def_spill_id);
          }
 
-         /* variable is already spilled at predecessor */
-         std::map<Temp, uint32_t>::iterator spilled = ctx.spills_exit[pred_idx].find(phi->operands[i].getTemp());
+         /* check if variable is already spilled at predecessor */
+         std::map<Temp, uint32_t>::iterator spilled = ctx.spills_exit[pred_idx].find(var);
          if (spilled != ctx.spills_exit[pred_idx].end()) {
             if (spilled->second != def_spill_id)
                ctx.affinities.emplace_back(std::pair<uint32_t, uint32_t>{def_spill_id, spilled->second});
@@ -764,7 +745,6 @@ void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx)
          }
 
          /* rename if necessary */
-         Temp var = phi->operands[i].getTemp();
          std::map<Temp, Temp>::iterator rename_it = ctx.renames[pred_idx].find(var);
          if (rename_it != ctx.renames[pred_idx].end()) {
             var = rename_it->second;
@@ -813,7 +793,7 @@ void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx)
             continue;
          }
 
-         /* variable is dead at predecessor, it must be from a phi: this works because of CSSA form */ // FIXME: lower_to_cssa()
+         /* variable is dead at predecessor, it must be from a phi: this works because of CSSA form */
          if (ctx.next_use_distances_end[pred_idx].find(pair.first) == ctx.next_use_distances_end[pred_idx].end())
             continue;
 
@@ -1567,6 +1547,9 @@ void spill(Program* program, live& live_vars, const struct radv_nir_compiler_opt
    if (program->num_waves >= 6)
       return;
 
+   /* lower to CSSA before spilling to ensure correctness w.r.t. phis */
+   lower_to_cssa(program, live_vars, options);
+
    /* else, we check if we can improve things a bit */
 
    /* calculate target register demand */
diff --git a/src/amd/compiler/meson.build b/src/amd/compiler/meson.build
index 4b22b82eedd..bce5093651b 100644
--- a/src/amd/compiler/meson.build
+++ b/src/amd/compiler/meson.build
@@ -69,6 +69,7 @@ libaco_files = files(
   'aco_register_allocation.cpp',
   'aco_live_var_analysis.cpp',
   'aco_lower_bool_phis.cpp',
+  'aco_lower_to_cssa.cpp',
   'aco_lower_to_hw_instr.cpp',
   'aco_optimizer.cpp',
   'aco_opt_value_numbering.cpp',