aco: print ACO IR before scheduling instead of after
[mesa.git] / src / amd / compiler / aco_lower_to_cssa.cpp
1 /*
2 * Copyright © 2019 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #include <map>
26 #include "aco_ir.h"
27 #include "aco_builder.h"
28
/*
 * Implements an algorithm to lower to Conventional SSA Form (CSSA).
 * After "Revisiting Out-of-SSA Translation for Correctness, Code Quality, and Efficiency"
 * by B. Boissinot, A. Darte, F. Rastello, B. Dupont de Dinechin, C. Guillon.
 *
 * By lowering the IR to CSSA, the insertion of parallelcopies is separated from
 * the register coalescing problem. Additionally, correctness is ensured w.r.t. spilling.
 * The algorithm tries to find beneficial insertion points by checking if a basic block
 * is empty and if the variable already has a new definition in a dominating block.
 */
39
40
41 namespace aco {
42 namespace {
43
/* Maps a block index to the list of parallelcopies (new Definition, source
 * Operand) that must be inserted at the end of that block. */
typedef std::map<uint32_t, std::vector<std::pair<Definition, Operand>>> phi_info;

/* Pass-wide state shared between collect_phi_info() and insert_parallelcopies(). */
struct cssa_ctx {
   Program* program;
   live& live_vars;           /* live-out sets per block, queried during interference checks */
   phi_info logical_phi_info; /* copies for p_phi operands, inserted before p_logical_end */
   phi_info linear_phi_info;  /* copies for p_linear_phi operands, inserted before the branch */

   cssa_ctx(Program* program, live& live_vars) : program(program), live_vars(live_vars) {}
};
54
/* Walks all phis in the program and decides, per operand, whether the operand's
 * live range might intersect the live range of the phi definition (or of another
 * operand of the same phi). Interfering operands get a fresh temporary recorded in
 * ctx.logical_phi_info / ctx.linear_phi_info (as a parallelcopy to be inserted in
 * the predecessor) and the phi is rewritten to read the new temporary.
 * Returns true iff at least one copy was recorded. */
bool collect_phi_info(cssa_ctx& ctx)
{
   bool progress = false;
   for (Block& block : ctx.program->blocks) {
      for (aco_ptr<Instruction>& phi : block.instructions) {
         bool is_logical;
         if (phi->opcode == aco_opcode::p_phi)
            is_logical = true;
         else if (phi->opcode == aco_opcode::p_linear_phi)
            is_logical = false;
         else
            break; /* phis are grouped at the block start: first non-phi ends the scan */

         /* no CSSA for the exec mask as we don't spill it anyway */
         if (phi->definitions[0].isFixed() && phi->definitions[0].physReg() == exec)
            continue;
         std::vector<unsigned>& preds = is_logical ? block.logical_preds : block.linear_preds;

         /* collect definition's block per Operand: for temps, hoist the point up the
          * dominator tree as long as the operand is still live-out of the dominator,
          * so a copy can be placed as early (and as shared) as possible */
         std::vector<unsigned> def_points(phi->operands.size());
         for (unsigned i = 0; i < phi->operands.size(); i++) {
            Operand& op = phi->operands[i];
            if (op.isUndefined()) {
               def_points[i] = preds[i];
            } else if (op.isConstant()) {
               /* in theory, we could insert the definition there... */
               def_points[i] = 0;
            } else {
               assert(op.isTemp());
               unsigned pred = preds[i];
               do {
                  def_points[i] = pred;
                  pred = is_logical ?
                         ctx.program->blocks[pred].logical_idom :
                         ctx.program->blocks[pred].linear_idom;
               } while (def_points[i] != pred && /* stop at the dominator-tree root */
                        ctx.live_vars.live_out[pred].find(op.getTemp()) != ctx.live_vars.live_out[pred].end());
            }
         }

         /* check live-range intersections */
         for (unsigned i = 0; i < phi->operands.size(); i++) {
            Operand op = phi->operands[i]; /* local copy: op may get fixed to exec below */
            if (op.isUndefined())
               continue;
            /* check if the operand comes from the exec mask of a predecessor */
            if (op.isTemp() && op.getTemp() == ctx.program->blocks[preds[i]].live_out_exec)
               op.setFixed(exec);

            bool interferes = false;
            unsigned idom = is_logical ?
                            ctx.program->blocks[def_points[i]].logical_idom :
                            ctx.program->blocks[def_points[i]].linear_idom;
            /* live-through operands definitely interfere */
            if (op.isTemp() && !op.isKill()) {
               interferes = true;
            /* create copies for constants to ease spilling */
            } else if (op.isConstant()) {
               interferes = true;
            /* create copies for SGPR -> VGPR moves */
            } else if (op.regClass() != phi->definitions[0].regClass()) {
               interferes = true;
            /* operand might interfere with any phi-def */
            } else if (def_points[i] == block.index) {
               interferes = true;
            /* operand might interfere with phi-def */
            } else if (ctx.live_vars.live_out[idom].count(phi->definitions[0].getTemp())) {
               interferes = true;
            /* else check for interferences with other operands */
            } else {
               for (unsigned j = 0; !interferes && j < phi->operands.size(); j++) {
                  /* don't care about other register classes */
                  if (!phi->operands[j].isTemp() || phi->operands[j].regClass() != phi->definitions[0].regClass())
                     continue;
                  /* same operands cannot interfere */
                  if (op.getTemp() == phi->operands[j].getTemp())
                     continue;
                  /* if def_points[i] dominates any other def_point, assume they interfere.
                   * As live-through operands are checked above, only test up the current block. */
                  unsigned other_def_point = def_points[j];
                  while (def_points[i] < other_def_point && other_def_point != block.index)
                     other_def_point = is_logical ?
                                       ctx.program->blocks[other_def_point].logical_idom :
                                       ctx.program->blocks[other_def_point].linear_idom;
                  interferes = def_points[i] == other_def_point;
               }
            }

            if (!interferes)
               continue;

            progress = true;

            /* create new temporary and rename operands: the copy (new_tmp <- op) is
             * recorded for the predecessor block; the phi now reads new_tmp, which
             * dies at the phi, so it never interferes with the phi definition */
            Temp new_tmp = Temp{ctx.program->allocateId(), phi->definitions[0].regClass()};
            if (is_logical)
               ctx.logical_phi_info[preds[i]].emplace_back(Definition(new_tmp), op);
            else
               ctx.linear_phi_info[preds[i]].emplace_back(Definition(new_tmp), op);
            phi->operands[i] = Operand(new_tmp);
            phi->operands[i].setKill(true);
            def_points[i] = preds[i];
         }
      }
   }
   return progress;
}
162
163 void insert_parallelcopies(cssa_ctx& ctx)
164 {
165 /* insert the parallelcopies from logical phis before p_logical_end */
166 for (auto&& entry : ctx.logical_phi_info) {
167 Block& block = ctx.program->blocks[entry.first];
168 unsigned idx = block.instructions.size() - 1;
169 while (block.instructions[idx]->opcode != aco_opcode::p_logical_end) {
170 assert(idx > 0);
171 idx--;
172 }
173
174 Builder bld(ctx.program);
175 bld.reset(&block.instructions, std::next(block.instructions.begin(), idx));
176 for (std::pair<Definition, Operand>& pair : entry.second)
177 bld.pseudo(aco_opcode::p_parallelcopy, pair.first, pair.second);
178 }
179
180 /* insert parallelcopies for the linear phis at the end of blocks just before the branch */
181 for (auto&& entry : ctx.linear_phi_info) {
182 Block& block = ctx.program->blocks[entry.first];
183 std::vector<aco_ptr<Instruction>>::iterator it = block.instructions.end();
184 --it;
185 assert((*it)->format == Format::PSEUDO_BRANCH);
186
187 Builder bld(ctx.program);
188 bld.reset(&block.instructions, it);
189 for (std::pair<Definition, Operand>& pair : entry.second)
190 bld.pseudo(aco_opcode::p_parallelcopy, pair.first, pair.second);
191 }
192 }
193
194 } /* end namespace */
195
196
197 void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options)
198 {
199 cssa_ctx ctx = {program, live_vars};
200 /* collect information about all interfering phi operands */
201 bool progress = collect_phi_info(ctx);
202
203 if (!progress)
204 return;
205
206 insert_parallelcopies(ctx);
207
208 /* update live variable information */
209 live_vars = live_var_analysis(program, options);
210 }
211 }
212