src/amd/compiler/aco_live_var_analysis.cpp

   1 /*
   2  * Copyright © 2018 Valve Corporation
   3  * Copyright © 2018 Google
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a
   6  * copy of this software and associated documentation files (the "Software"),
   7  * to deal in the Software without restriction, including without limitation
   8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   9  * and/or sell copies of the Software, and to permit persons to whom the
  10  * Software is furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice (including the next
  13  * paragraph) shall be included in all copies or substantial portions of the
  14  * Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  21  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  22  * IN THE SOFTWARE.
  23  *
  24  * Authors:
  25  *    Daniel Schürmann (daniel.schuermann@campus.tu-berlin.de)
  26  *    Bas Nieuwenhuizen (bas@basnieuwenhuizen.nl)
  27  *
  28  */
  29
  30 #include "aco_ir.h"
  31
  32 #include <set>
  33 #include <vector>
  34
  35 #include "vulkan/radv_shader.h"
  36
  37 namespace aco {
  38 namespace {
  39
  40 void process_live_temps_per_block(Program *program, live& lives, Block* block,
  41                                   std::set<unsigned>& worklist, std::vector<uint16_t>& phi_sgpr_ops)
  42 {
  43    std::vector<RegisterDemand>& register_demand = lives.register_demand[block->index];
  44    RegisterDemand new_demand;
  45
  46    register_demand.resize(block->instructions.size());
  47    block->register_demand = RegisterDemand();
  48
  49    std::set<Temp> live_sgprs;
  50    std::set<Temp> live_vgprs;
  51
  52    /* add the live_out_exec to live */
  53    bool exec_live = false;
  54    if (block->live_out_exec != Temp()) {
  55       live_sgprs.insert(block->live_out_exec);
  56       new_demand.sgpr += 2;
  57       exec_live = true;
  58    }
  59
  60    /* split the live-outs from this block into the temporary sets */
  61    std::vector<std::set<Temp>>& live_temps = lives.live_out;
  62    for (const Temp temp : live_temps[block->index]) {
  63       const bool inserted = temp.is_linear()
  64                           ? live_sgprs.insert(temp).second
  65                           : live_vgprs.insert(temp).second;
  66       if (inserted) {
  67          new_demand += temp;
  68       }
  69    }
  70    new_demand.sgpr -= phi_sgpr_ops[block->index];
  71
  72    /* traverse the instructions backwards */
  73    for (int idx = block->instructions.size() -1; idx >= 0; idx--)
  74    {
  75       /* substract the 2 sgprs from exec */
  76       if (exec_live)
  77          assert(new_demand.sgpr >= 2);
  78       register_demand[idx] = RegisterDemand(new_demand.vgpr, new_demand.sgpr - (exec_live ? 2 : 0));
  79
  80       Instruction *insn = block->instructions[idx].get();
  81       /* KILL */
  82       for (Definition& definition : insn->definitions) {
  83          if (!definition.isTemp()) {
  84             continue;
  85          }
  86
  87          const Temp temp = definition.getTemp();
  88          size_t n = 0;
  89          if (temp.is_linear())
  90             n = live_sgprs.erase(temp);
  91          else
  92             n = live_vgprs.erase(temp);
  93
  94          if (n) {
  95             new_demand -= temp;
  96             definition.setKill(false);
  97          } else {
  98             register_demand[idx] += temp;
  99             definition.setKill(true);
 100          }
 101
 102          if (definition.isFixed() && definition.physReg() == exec)
 103             exec_live = false;
 104       }
 105
 106       /* GEN */
 107       if (insn->opcode == aco_opcode::p_phi ||
 108           insn->opcode == aco_opcode::p_linear_phi) {
 109          /* directly insert into the predecessors live-out set */
 110          std::vector<unsigned>& preds = insn->opcode == aco_opcode::p_phi
 111                                       ? block->logical_preds
 112                                       : block->linear_preds;
 113          for (unsigned i = 0; i < preds.size(); ++i)
 114          {
 115             Operand &operand = insn->operands[i];
 116             if (!operand.isTemp()) {
 117                continue;
 118             }
 119             /* check if we changed an already processed block */
 120             const bool inserted = live_temps[preds[i]].insert(operand.getTemp()).second;
 121             if (inserted) {
 122                operand.setFirstKill(true);
 123                worklist.insert(preds[i]);
 124                if (insn->opcode == aco_opcode::p_phi && operand.getTemp().type() == RegType::sgpr)
 125                   phi_sgpr_ops[preds[i]] += operand.size();
 126             }
 127          }
 128       } else if (insn->opcode == aco_opcode::p_logical_end) {
 129          new_demand.sgpr += phi_sgpr_ops[block->index];
 130       } else {
 131          for (unsigned i = 0; i < insn->operands.size(); ++i)
 132          {
 133             Operand& operand = insn->operands[i];
 134             if (!operand.isTemp()) {
 135                continue;
 136             }
 137             const Temp temp = operand.getTemp();
 138             const bool inserted = temp.is_linear()
 139                                 ? live_sgprs.insert(temp).second
 140                                 : live_vgprs.insert(temp).second;
 141             if (inserted) {
 142                operand.setFirstKill(true);
 143                for (unsigned j = i + 1; j < insn->operands.size(); ++j) {
 144                   if (insn->operands[j].isTemp() && insn->operands[j].tempId() == operand.tempId()) {
 145                      insn->operands[j].setFirstKill(false);
 146                      insn->operands[j].setKill(true);
 147                   }
 148                }
 149                new_demand += temp;
 150             } else {
 151                operand.setKill(false);
 152             }
 153
 154             if (operand.isFixed() && operand.physReg() == exec)
 155                exec_live = true;
 156          }
 157       }
 158
 159       block->register_demand.update(register_demand[idx]);
 160    }
 161
 162    /* now, we have the live-in sets and need to merge them into the live-out sets */
 163    for (unsigned pred_idx : block->logical_preds) {
 164       for (Temp vgpr : live_vgprs) {
 165          auto it = live_temps[pred_idx].insert(vgpr);
 166          if (it.second)
 167             worklist.insert(pred_idx);
 168       }
 169    }
 170
 171    for (unsigned pred_idx : block->linear_preds) {
 172       for (Temp sgpr : live_sgprs) {
 173          auto it = live_temps[pred_idx].insert(sgpr);
 174          if (it.second)
 175             worklist.insert(pred_idx);
 176       }
 177    }
 178
 179    if (!(block->index != 0 || (live_vgprs.empty() && live_sgprs.empty()))) {
 180       aco_print_program(program, stderr);
 181       fprintf(stderr, "These temporaries are never defined or are defined after use:\n");
 182       for (Temp vgpr : live_vgprs)
 183          fprintf(stderr, "%%%d\n", vgpr.id());
 184       for (Temp sgpr : live_sgprs)
 185          fprintf(stderr, "%%%d\n", sgpr.id());
 186       abort();
 187    }
 188
 189    assert(block->index != 0 || new_demand == RegisterDemand());
 190 }
 191 } /* end namespace */
 192
 193 void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand)
 194 {
 195    // TODO: also take shared mem into account
 196    const int16_t total_sgpr_regs = program->chip_class >= GFX8 ? 800 : 512;
 197    const int16_t max_addressible_sgpr = program->sgpr_limit;
 198    /* VGPRs are allocated in chunks of 4 */
 199    const int16_t rounded_vgpr_demand = std::max<int16_t>(4, (new_demand.vgpr + 3) & ~3);
 200    /* SGPRs are allocated in chunks of 16 between 8 and 104. VCC occupies the last 2 registers */
 201    const int16_t rounded_sgpr_demand = std::min(std::max<int16_t>(8, (new_demand.sgpr + 2 + 7) & ~7), max_addressible_sgpr);
 202    /* this won't compile, register pressure reduction necessary */
 203    if (new_demand.vgpr > 256 || new_demand.sgpr > max_addressible_sgpr) {
 204       program->num_waves = 0;
 205       program->max_reg_demand = new_demand;
 206    } else {
 207       program->num_waves = std::min<uint16_t>(10,
 208                                               std::min<uint16_t>(256 / rounded_vgpr_demand,
 209                                                                  total_sgpr_regs / rounded_sgpr_demand));
 210
 211       program->max_reg_demand = {  int16_t((256 / program->num_waves) & ~3), std::min<int16_t>(((total_sgpr_regs / program->num_waves) & ~7) - 2, max_addressible_sgpr)};
 212    }
 213 }
 214
 215 live live_var_analysis(Program* program,
 216                        const struct radv_nir_compiler_options *options)
 217 {
 218    live result;
 219    result.live_out.resize(program->blocks.size());
 220    result.register_demand.resize(program->blocks.size());
 221    std::set<unsigned> worklist;
 222    std::vector<uint16_t> phi_sgpr_ops(program->blocks.size());
 223    RegisterDemand new_demand;
 224
 225    /* this implementation assumes that the block idx corresponds to the block's position in program->blocks vector */
 226    for (Block& block : program->blocks)
 227       worklist.insert(block.index);
 228    while (!worklist.empty()) {
 229       std::set<unsigned>::reverse_iterator b_it = worklist.rbegin();
 230       unsigned block_idx = *b_it;
 231       worklist.erase(block_idx);
 232       process_live_temps_per_block(program, result, &program->blocks[block_idx], worklist, phi_sgpr_ops);
 233       new_demand.update(program->blocks[block_idx].register_demand);
 234    }
 235
 236    /* calculate the program's register demand and number of waves */
 237    update_vgpr_sgpr_demand(program, new_demand);
 238
 239    return result;
 240 }
 241
 242 }
 243