aco: add a helper for building a trap handler shader
[mesa.git] / src / amd / compiler / aco_live_var_analysis.cpp
1 /*
2 * Copyright © 2018 Valve Corporation
3 * Copyright © 2018 Google
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 * IN THE SOFTWARE.
23 *
24 * Authors:
25 * Daniel Schürmann (daniel.schuermann@campus.tu-berlin.de)
26 * Bas Nieuwenhuizen (bas@basnieuwenhuizen.nl)
27 *
28 */
29
30 #include "aco_ir.h"
31 #include "util/u_math.h"
32
33 #include <set>
34 #include <vector>
35
36 #include "vulkan/radv_shader.h"
37
38 namespace aco {
39 RegisterDemand get_live_changes(aco_ptr<Instruction>& instr)
40 {
41 RegisterDemand changes;
42 for (const Definition& def : instr->definitions) {
43 if (!def.isTemp() || def.isKill())
44 continue;
45 changes += def.getTemp();
46 }
47
48 for (const Operand& op : instr->operands) {
49 if (!op.isTemp() || !op.isFirstKill())
50 continue;
51 changes -= op.getTemp();
52 }
53
54 return changes;
55 }
56
57 RegisterDemand get_temp_registers(aco_ptr<Instruction>& instr)
58 {
59 RegisterDemand temp_registers;
60
61 for (Definition def : instr->definitions) {
62 if (!def.isTemp())
63 continue;
64 if (def.isKill())
65 temp_registers += def.getTemp();
66 }
67
68 for (Operand op : instr->operands) {
69 if (op.isTemp() && op.isLateKill() && op.isFirstKill())
70 temp_registers += op.getTemp();
71 }
72
73 return temp_registers;
74 }
75
76 RegisterDemand get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr, aco_ptr<Instruction>& instr_before)
77 {
78 demand -= get_live_changes(instr);
79 demand -= get_temp_registers(instr);
80 if (instr_before)
81 demand += get_temp_registers(instr_before);
82 return demand;
83 }
84
85 namespace {
/* Recomputes liveness and per-instruction register demand for one block by
 * walking its instructions from back to front, starting from the block's
 * current live-out set. Predecessors whose live-out set grows are queued on
 * \p worklist for (re-)processing — the caller iterates to a fixed point.
 *
 * \param lives        live-out sets and per-block register demand, updated in place
 * \param phi_sgpr_ops per-block count of SGPRs consumed by logical-phi
 *                     operands; these are accounted at the predecessor's
 *                     p_logical_end rather than across the whole block
 */
void process_live_temps_per_block(Program *program, live& lives, Block* block,
                                  std::set<unsigned>& worklist, std::vector<uint16_t>& phi_sgpr_ops)
{
   std::vector<RegisterDemand>& register_demand = lives.register_demand[block->index];
   RegisterDemand new_demand;

   register_demand.resize(block->instructions.size());
   block->register_demand = RegisterDemand();
   /* working set: temporaries live at the current (backwards) position */
   TempSet live = lives.live_out[block->index];

   /* add the live_out_exec to live */
   bool exec_live = false;
   if (block->live_out_exec != Temp()) {
      live.insert(block->live_out_exec);
      exec_live = true;
   }

   /* initialize register demand */
   for (Temp t : live)
      new_demand += t;
   /* SGPRs feeding successor phis are counted at p_logical_end instead */
   new_demand.sgpr -= phi_sgpr_ops[block->index];

   /* traverse the instructions backwards */
   int idx;
   for (idx = block->instructions.size() -1; idx >= 0; idx--) {
      Instruction *insn = block->instructions[idx].get();
      if (is_phi(insn))
         break; /* phis at the block start are handled separately below */

      /* subtract the 1 or 2 sgprs from exec */
      if (exec_live)
         assert(new_demand.sgpr >= (int16_t) program->lane_mask.size());
      register_demand[idx] = RegisterDemand(new_demand.vgpr, new_demand.sgpr - (exec_live ? program->lane_mask.size() : 0));

      /* KILL: definitions end (when walking backwards) their temp's live range */
      for (Definition& definition : insn->definitions) {
         if (!definition.isTemp()) {
            continue;
         }
         if ((definition.isFixed() || definition.hasHint()) && definition.physReg() == vcc)
            program->needs_vcc = true;

         const Temp temp = definition.getTemp();
         const size_t n = live.erase(temp);

         if (n) {
            new_demand -= temp;
            definition.setKill(false);
         } else {
            /* result is never read: it only occupies a register during this
             * instruction, so add it to the local demand only */
            register_demand[idx] += temp;
            definition.setKill(true);
         }

         if (definition.isFixed() && definition.physReg() == exec)
            exec_live = false;
      }

      /* GEN: operands begin (when walking backwards) their temp's live range */
      if (insn->opcode == aco_opcode::p_logical_end) {
         new_demand.sgpr += phi_sgpr_ops[block->index];
      } else {
         /* we need to do this in a separate loop because the next one can
          * setKill() for several operands at once and we don't want to
          * overwrite that in a later iteration */
         for (Operand& op : insn->operands)
            op.setKill(false);

         for (unsigned i = 0; i < insn->operands.size(); ++i)
         {
            Operand& operand = insn->operands[i];
            if (!operand.isTemp())
               continue;
            if (operand.isFixed() && operand.physReg() == vcc)
               program->needs_vcc = true;
            const Temp temp = operand.getTemp();
            const bool inserted = live.insert(temp).second;
            if (inserted) {
               /* first use seen walking backwards == last use in program
                * order: this is where the temp dies */
               operand.setFirstKill(true);
               /* further uses of the same temp within this instruction are
                * plain kills, not first-kills */
               for (unsigned j = i + 1; j < insn->operands.size(); ++j) {
                  if (insn->operands[j].isTemp() && insn->operands[j].tempId() == operand.tempId()) {
                     insn->operands[j].setFirstKill(false);
                     insn->operands[j].setKill(true);
                  }
               }
               if (operand.isLateKill())
                  register_demand[idx] += temp;
               new_demand += temp;
            }

            if (operand.isFixed() && operand.physReg() == exec)
               exec_live = true;
         }
      }

      block->register_demand.update(register_demand[idx]);
   }

   /* update block's register demand for a last time */
   if (exec_live)
      assert(new_demand.sgpr >= (int16_t) program->lane_mask.size());
   new_demand.sgpr -= exec_live ? program->lane_mask.size() : 0;
   block->register_demand.update(new_demand);

   /* handle phi definitions */
   int phi_idx = idx;
   while (phi_idx >= 0) {
      /* all phis at the block start share the live-in demand */
      register_demand[phi_idx] = new_demand;
      Instruction *insn = block->instructions[phi_idx].get();

      assert(is_phi(insn));
      assert(insn->definitions.size() == 1 && insn->definitions[0].isTemp());
      Definition& definition = insn->definitions[0];
      if ((definition.isFixed() || definition.hasHint()) && definition.physReg() == vcc)
         program->needs_vcc = true;
      const Temp temp = definition.getTemp();
      const size_t n = live.erase(temp);

      if (n)
         definition.setKill(false);
      else
         definition.setKill(true);

      phi_idx--;
   }

   /* now, we need to merge the live-ins into the live-out sets */
   for (Temp t : live) {
      /* linear temps propagate along linear edges, logical temps along
       * logical edges */
      std::vector<unsigned>& preds = t.is_linear() ? block->linear_preds : block->logical_preds;

#ifndef NDEBUG
      if (preds.empty())
         aco_err(program, "Temporary never defined or are defined after use: %%%d in BB%d", t.id(), block->index);
#endif

      for (unsigned pred_idx : preds) {
         auto it = lives.live_out[pred_idx].insert(t);
         if (it.second)
            worklist.insert(pred_idx); /* predecessor's live-out grew: re-process it */
      }
   }

   /* handle phi operands */
   phi_idx = idx;
   while (phi_idx >= 0) {
      Instruction *insn = block->instructions[phi_idx].get();
      assert(is_phi(insn));
      /* directly insert into the predecessors live-out set */
      std::vector<unsigned>& preds = insn->opcode == aco_opcode::p_phi
                                     ? block->logical_preds
                                     : block->linear_preds;
      for (unsigned i = 0; i < preds.size(); ++i) {
         Operand &operand = insn->operands[i];
         if (!operand.isTemp())
            continue;
         if (operand.isFixed() && operand.physReg() == vcc)
            program->needs_vcc = true;
         /* check if we changed an already processed block */
         const bool inserted = lives.live_out[preds[i]].insert(operand.getTemp()).second;
         if (inserted) {
            operand.setKill(true);
            worklist.insert(preds[i]);
            /* track SGPR demand of logical-phi operands for the predecessor */
            if (insn->opcode == aco_opcode::p_phi && operand.getTemp().type() == RegType::sgpr)
               phi_sgpr_ops[preds[i]] += operand.size();
         }
      }
      phi_idx--;
   }

   /* the entry block must have no live-in temporaries */
   assert(block->index != 0 || (new_demand == RegisterDemand() && live.empty()));
}
256
257 unsigned calc_waves_per_workgroup(Program *program)
258 {
259 /* When workgroup size is not known, just go with wave_size */
260 unsigned workgroup_size = program->workgroup_size == UINT_MAX
261 ? program->wave_size
262 : program->workgroup_size;
263
264 return align(workgroup_size, program->wave_size) / program->wave_size;
265 }
266 } /* end namespace */
267
268 uint16_t get_extra_sgprs(Program *program)
269 {
270 if (program->chip_class >= GFX10) {
271 assert(!program->needs_flat_scr);
272 assert(!program->xnack_enabled);
273 return 2;
274 } else if (program->chip_class >= GFX8) {
275 if (program->needs_flat_scr)
276 return 6;
277 else if (program->xnack_enabled)
278 return 4;
279 else if (program->needs_vcc)
280 return 2;
281 else
282 return 0;
283 } else {
284 assert(!program->xnack_enabled);
285 if (program->needs_flat_scr)
286 return 4;
287 else if (program->needs_vcc)
288 return 2;
289 else
290 return 0;
291 }
292 }
293
294 uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs)
295 {
296 assert(addressable_sgprs <= program->sgpr_limit);
297 uint16_t sgprs = addressable_sgprs + get_extra_sgprs(program);
298 uint16_t granule = program->sgpr_alloc_granule + 1;
299 return align(std::max(sgprs, granule), granule);
300 }
301
302 uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs)
303 {
304 assert(addressable_vgprs <= program->vgpr_limit);
305 uint16_t granule = program->vgpr_alloc_granule + 1;
306 return align(std::max(addressable_vgprs, granule), granule);
307 }
308
309 uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves)
310 {
311 uint16_t sgprs = program->physical_sgprs / max_waves & ~program->sgpr_alloc_granule;
312 sgprs -= get_extra_sgprs(program);
313 return std::min(sgprs, program->sgpr_limit);
314 }
315
316 uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves)
317 {
318 uint16_t vgprs = 256 / max_waves & ~program->vgpr_alloc_granule;
319 return std::min(vgprs, program->vgpr_limit);
320 }
321
322 void calc_min_waves(Program* program)
323 {
324 unsigned waves_per_workgroup = calc_waves_per_workgroup(program);
325 /* currently min_waves is in wave64 waves */
326 if (program->wave_size == 32)
327 waves_per_workgroup = DIV_ROUND_UP(waves_per_workgroup, 2);
328
329 unsigned simd_per_cu = 4; /* TODO: different on Navi */
330 bool wgp = program->chip_class >= GFX10; /* assume WGP is used on Navi */
331 unsigned simd_per_cu_wgp = wgp ? simd_per_cu * 2 : simd_per_cu;
332
333 program->min_waves = DIV_ROUND_UP(waves_per_workgroup, simd_per_cu_wgp);
334 }
335
/* Derives program->num_waves, program->max_waves and program->max_reg_demand
 * from the given peak register demand, taking the per-generation wave limits,
 * workgroup size and LDS usage into account. A demand exceeding the hardware
 * limits results in num_waves == 0 (the program cannot be compiled as-is). */
void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand)
{
   /* TODO: max_waves_per_simd, simd_per_cu and the number of physical vgprs for Navi */
   unsigned max_waves_per_simd = 10;
   if ((program->family >= CHIP_POLARIS10 && program->family <= CHIP_VEGAM) || program->chip_class >= GFX10_3)
      max_waves_per_simd = 8;
   unsigned simd_per_cu = 4;

   bool wgp = program->chip_class >= GFX10; /* assume WGP is used on Navi */
   unsigned simd_per_cu_wgp = wgp ? simd_per_cu * 2 : simd_per_cu;
   /* in WGP mode, twice the per-CU LDS budget is available */
   unsigned lds_limit = wgp ? program->lds_limit * 2 : program->lds_limit;

   /* this won't compile, register pressure reduction necessary */
   if (new_demand.vgpr > program->vgpr_limit || new_demand.sgpr > program->sgpr_limit) {
      program->num_waves = 0;
      program->max_reg_demand = new_demand;
   } else {
      /* waves per SIMD: limited first by SGPR allocation, then by VGPRs */
      program->num_waves = program->physical_sgprs / get_sgpr_alloc(program, new_demand.sgpr);
      program->num_waves = std::min<uint16_t>(program->num_waves, 256 / get_vgpr_alloc(program, new_demand.vgpr));
      program->max_waves = max_waves_per_simd;

      /* adjust max_waves for workgroup and LDS limits */
      unsigned waves_per_workgroup = calc_waves_per_workgroup(program);
      unsigned workgroups_per_cu_wgp = max_waves_per_simd * simd_per_cu_wgp / waves_per_workgroup;
      if (program->config->lds_size) {
         /* each concurrent workgroup needs its own LDS allocation */
         unsigned lds = program->config->lds_size * program->lds_alloc_granule;
         workgroups_per_cu_wgp = std::min(workgroups_per_cu_wgp, lds_limit / lds);
      }
      if (waves_per_workgroup > 1 && program->chip_class < GFX10)
         workgroups_per_cu_wgp = std::min(workgroups_per_cu_wgp, 16u); /* TODO: is this a SI-only limit? what about Navi? */

      /* in cases like waves_per_workgroup=3 or lds=65536 and
       * waves_per_workgroup=1, we want the maximum possible number of waves per
       * SIMD and not the minimum. so DIV_ROUND_UP is used */
      program->max_waves = std::min<uint16_t>(program->max_waves, DIV_ROUND_UP(workgroups_per_cu_wgp * waves_per_workgroup, simd_per_cu_wgp));

      /* incorporate max_waves and calculate max_reg_demand */
      program->num_waves = std::min<uint16_t>(program->num_waves, program->max_waves);
      program->max_reg_demand.vgpr = get_addr_vgpr_from_waves(program, program->num_waves);
      program->max_reg_demand.sgpr = get_addr_sgpr_from_waves(program, program->num_waves);
   }
}
378
379 live live_var_analysis(Program* program,
380 const struct radv_nir_compiler_options *options)
381 {
382 live result;
383 result.live_out.resize(program->blocks.size());
384 result.register_demand.resize(program->blocks.size());
385 std::set<unsigned> worklist;
386 std::vector<uint16_t> phi_sgpr_ops(program->blocks.size());
387 RegisterDemand new_demand;
388
389 program->needs_vcc = false;
390
391 /* this implementation assumes that the block idx corresponds to the block's position in program->blocks vector */
392 for (Block& block : program->blocks)
393 worklist.insert(block.index);
394 while (!worklist.empty()) {
395 std::set<unsigned>::reverse_iterator b_it = worklist.rbegin();
396 unsigned block_idx = *b_it;
397 worklist.erase(block_idx);
398 process_live_temps_per_block(program, result, &program->blocks[block_idx], worklist, phi_sgpr_ops);
399 new_demand.update(program->blocks[block_idx].register_demand);
400 }
401
402 /* calculate the program's register demand and number of waves */
403 update_vgpr_sgpr_demand(program, new_demand);
404
405 return result;
406 }
407
408 }
409