aco: increase accuracy of SGPR limits
[mesa.git] / src / amd / compiler / aco_scheduler.cpp
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #include "aco_ir.h"
26 #include <unordered_set>
27 #include <algorithm>
28
29 #include "vulkan/radv_shader.h" // for radv_nir_compiler_options
30 #include "amdgfxregs.h"
31
/* Scheduling windows and move budgets shrink as the wave count grows:
 * with many waves in flight the hardware already hides latency, so the
 * scheduler does less work (and risks less register pressure). */
#define SMEM_WINDOW_SIZE (350 - ctx.num_waves * 35)
#define VMEM_WINDOW_SIZE (1024 - ctx.num_waves * 64)
#define POS_EXP_WINDOW_SIZE 512
#define SMEM_MAX_MOVES (80 - ctx.num_waves * 8)
#define VMEM_MAX_MOVES (128 - ctx.num_waves * 4)
#define POS_EXP_MAX_MOVES 512
38
39 namespace aco {
40
struct sched_ctx {
   /* bitset indexed by temp id: values the instruction being scheduled
    * (transitively) depends on; moving across a writer of these is illegal */
   std::vector<bool> depends_on;
   /* bitset indexed by temp id: operands read by instructions that depend on
    * the one being scheduled — moving a killing read of these would make the
    * register-pressure bookkeeping incorrect */
   std::vector<bool> RAR_dependencies;
   /* register demand the scheduler must never exceed */
   RegisterDemand max_registers;
   /* wave count the schedule is optimized for (may be lower than the
    * program's current wave count) */
   int16_t num_waves;
   /* heuristic stall estimate for the last SMEM load; negative means the
    * latency is considered hidden */
   int16_t last_SMEM_stall;
   /* index of the first user of the last SMEM load (0 if none found) */
   int last_SMEM_dep_idx;
};
49
/* This scheduler is a simple bottom-up pass based on ideas from
 * "A Novel Lightweight Instruction Scheduling Algorithm for Just-In-Time Compiler"
 * from Xiaohua Shi and Peng Guo.
 * The basic approach is to iterate over all instructions. When a memory instruction
 * is encountered it tries to move independent instructions from above and below
 * between the memory instruction and its first user.
 * The novelty is that this scheduler cares for the current register pressure:
 * Instructions will only be moved if the register pressure won't exceed a certain bound.
 */
59
/* Moves the element at position idx so that it ends up directly at position
 * `before` (moving up) or at `before - 1` (moving down), shifting the
 * elements in between by one slot. A no-op when idx == before. */
template <typename T>
void move_element(T& list, size_t idx, size_t before) {
   if (idx == before)
      return;

   if (idx < before) {
      /* shift [idx+1, before) down by one, element lands at before-1 */
      auto first = std::next(list.begin(), idx);
      auto last = std::next(list.begin(), before);
      std::rotate(first, std::next(first), last);
   } else {
      /* shift [before, idx) up by one, element lands at before */
      auto first = std::next(list.begin(), before);
      auto last = std::next(list.begin(), idx + 1);
      std::rotate(first, std::prev(last), last);
   }
}
72
73 static RegisterDemand getLiveChanges(aco_ptr<Instruction>& instr)
74 {
75 RegisterDemand changes;
76 for (const Definition& def : instr->definitions) {
77 if (!def.isTemp() || def.isKill())
78 continue;
79 changes += def.getTemp();
80 }
81
82 for (const Operand& op : instr->operands) {
83 if (!op.isTemp() || !op.isFirstKill())
84 continue;
85 changes -= op.getTemp();
86 }
87
88 return changes;
89 }
90
91 static RegisterDemand getTempRegisters(aco_ptr<Instruction>& instr)
92 {
93 RegisterDemand temp_registers;
94 for (const Definition& def : instr->definitions) {
95 if (!def.isTemp() || !def.isKill())
96 continue;
97 temp_registers += def.getTemp();
98 }
99 return temp_registers;
100 }
101
102 static bool is_spill_reload(aco_ptr<Instruction>& instr)
103 {
104 return instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload;
105 }
106
/* Checks whether `instr` may be moved past `current` (the memory instruction
 * being scheduled), given the accumulated barrier interactions
 * (moving_interaction) of everything moved so far. */
bool can_move_instr(aco_ptr<Instruction>& instr, Instruction* current, int moving_interaction)
{
   /* don't move exports so that they stay closer together */
   if (instr->format == Format::EXP)
      return false;

   /* don't move s_memtime/s_memrealtime */
   if (instr->opcode == aco_opcode::s_memtime || instr->opcode == aco_opcode::s_memrealtime)
      return false;

   /* handle barriers */

   /* TODO: instead of stopping, maybe try to move the barriers and any
    * instructions interacting with them instead? */
   if (instr->format != Format::PSEUDO_BARRIER) {
      if (instr->opcode == aco_opcode::s_barrier) {
         /* s_barrier may only be crossed when `current` is a reorderable
          * memory access and nothing moved so far interacts with barriers */
         bool can_reorder = false;
         switch (current->format) {
         case Format::SMEM:
            can_reorder = static_cast<SMEM_instruction*>(current)->can_reorder;
            break;
         case Format::MUBUF:
            can_reorder = static_cast<MUBUF_instruction*>(current)->can_reorder;
            break;
         case Format::MIMG:
            can_reorder = static_cast<MIMG_instruction*>(current)->can_reorder;
            break;
         default:
            break;
         }
         return can_reorder && moving_interaction == barrier_none;
      } else {
         /* non-barrier instructions pose no barrier constraint here */
         return true;
      }
   }

   /* memory-barrier pseudo instructions: movable only if neither `current`
    * nor the already-moved instructions interact with the barred class */
   int interaction = get_barrier_interaction(current);
   interaction |= moving_interaction;

   switch (instr->opcode) {
   case aco_opcode::p_memory_barrier_atomic:
      return !(interaction & barrier_atomic);
   /* For now, buffer and image barriers are treated the same. this is because of
    * dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.device.payload_nonlocal.buffer.guard_nonlocal.image.comp
    * which seems to use an image load to determine if the result of a buffer load is valid. So the ordering of the two loads is important.
    * I /think/ we should probably eventually expand the meaning of a buffer barrier so that all buffer operations before it, must stay before it
    * and that both image and buffer operations after it, must stay after it. We should also do the same for image barriers.
    * Or perhaps the problem is that we don't have a combined barrier instruction for both buffers and images, but the CTS test expects us to?
    * Either way, this solution should work. */
   case aco_opcode::p_memory_barrier_buffer:
   case aco_opcode::p_memory_barrier_image:
      return !(interaction & (barrier_image | barrier_buffer));
   case aco_opcode::p_memory_barrier_shared:
      return !(interaction & barrier_shared);
   case aco_opcode::p_memory_barrier_all:
      return interaction == barrier_none;
   default:
      return false;
   }
}
167
168 bool can_reorder(Instruction* candidate, bool allow_smem)
169 {
170 switch (candidate->format) {
171 case Format::SMEM:
172 return allow_smem || static_cast<SMEM_instruction*>(candidate)->can_reorder;
173 case Format::MUBUF:
174 return static_cast<MUBUF_instruction*>(candidate)->can_reorder;
175 case Format::MIMG:
176 return static_cast<MIMG_instruction*>(candidate)->can_reorder;
177 case Format::MTBUF:
178 return static_cast<MTBUF_instruction*>(candidate)->can_reorder;
179 case Format::FLAT:
180 case Format::GLOBAL:
181 case Format::SCRATCH:
182 return false;
183 default:
184 return true;
185 }
186 }
187
/* Tries to hide the latency of the SMEM instruction `current` (at index idx
 * in block): first moves independent instructions from above `current` to
 * below it, then moves independent instructions from below up past it, until
 * the first user of its result. Every move is rejected if it would push the
 * register demand above ctx.max_registers. register_demand is kept in sync
 * with the reordered instruction list. */
void schedule_SMEM(sched_ctx& ctx, Block* block,
                   std::vector<RegisterDemand>& register_demand,
                   Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = SMEM_WINDOW_SIZE;
   int max_moves = SMEM_MAX_MOVES;
   int16_t k = 0; /* number of moves performed so far */
   bool can_reorder_cur = can_reorder(current, false);

   /* don't move s_memtime/s_memrealtime */
   if (current->opcode == aco_opcode::s_memtime || current->opcode == aco_opcode::s_memrealtime)
      return;

   /* create the initial set of values which current depends on */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   for (const Operand& op : current->operands) {
      if (op.isTemp())
         ctx.depends_on[op.tempId()] = true;
   }

   /* maintain how many registers remain free when moving instructions */
   RegisterDemand register_pressure = register_demand[idx];

   /* first, check if we have instructions before current to move down */
   int insert_idx = idx + 1;
   int moving_interaction = barrier_none;
   bool moving_spill = false;

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      /* break if we'd make the previous SMEM instruction stall */
      bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
      if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0)
         break;

      /* break when encountering another MEM instruction, logical_start or barriers */
      if (!can_reorder(candidate.get(), false) && !can_reorder_cur)
         break;
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
      if (candidate->opcode == aco_opcode::p_exit_early_if)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;
      register_pressure.update(register_demand[candidate_idx]);

      /* if current depends on candidate, add additional dependencies and continue */
      bool can_move_down = true;
      bool writes_exec = false;
      for (const Definition& def : candidate->definitions) {
         if (def.isTemp() && ctx.depends_on[def.tempId()])
            can_move_down = false;
         if (def.isFixed() && def.physReg() == exec)
            writes_exec = true;
      }
      /* never move anything across an exec write */
      if (writes_exec)
         break;

      if (moving_spill && is_spill_reload(candidate))
         can_move_down = false;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         can_move_down = false;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (!can_move_down) {
         /* candidate stays put: everything it reads becomes a dependency too */
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.depends_on[op.tempId()] = true;
         }
         continue;
      }

      bool register_pressure_unknown = false;
      /* check if one of candidate's operands is killed by depending instruction */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.depends_on[op.tempId()]) {
            // FIXME: account for difference in register pressure
            register_pressure_unknown = true;
         }
      }
      if (register_pressure_unknown) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.depends_on[op.tempId()] = true;
         }
         continue;
      }

      /* check if register pressure is low enough: the diff is negative if register pressure is decreased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand tempDemand = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure - candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand tempDemand2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - tempDemand2 + tempDemand;
      if (new_demand.exceeds(ctx.max_registers))
         break;
      // TODO: we might want to look further to find a sequence of instructions to move down which doesn't exceed reg pressure

      /* move the candidate below the memory load */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = candidate_idx; i < insert_idx - 1; i++) {
         register_demand[i] -= candidate_diff;
      }
      register_demand[insert_idx - 1] = new_demand;
      register_pressure -= candidate_diff;

      if (candidate_idx < ctx.last_SMEM_dep_idx)
         ctx.last_SMEM_stall++;
      insert_idx--;
      k++;
   }

   /* create the initial set of values which depend on current */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   std::fill(ctx.RAR_dependencies.begin(), ctx.RAR_dependencies.end(), false);
   for (const Definition& def : current->definitions) {
      if (def.isTemp())
         ctx.depends_on[def.tempId()] = true;
   }

   /* find the first instruction depending on current or find another MEM */
   insert_idx = idx + 1;
   moving_interaction = barrier_none;
   moving_spill = false;

   bool found_dependency = false;
   /* second, check if we have instructions after current to move up */
   for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) {
      assert(candidate_idx < (int) block->instructions.size());
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      if (candidate->opcode == aco_opcode::p_logical_end)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;

      const bool writes_exec = std::any_of(candidate->definitions.begin(), candidate->definitions.end(),
                                           [](const Definition& def) { return def.isFixed() && def.physReg() == exec;});
      if (writes_exec)
         break;

      /* check if candidate depends on current */
      bool is_dependency = std::any_of(candidate->operands.begin(), candidate->operands.end(),
                                       [&ctx](const Operand& op) { return op.isTemp() && ctx.depends_on[op.tempId()];});
      if (moving_spill && is_spill_reload(candidate))
         is_dependency = true;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         is_dependency = true;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (is_dependency) {
         /* results of dependencies are dependencies themselves; their reads
          * become RAR dependencies for the pressure bookkeeping */
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.depends_on[def.tempId()] = true;
         }
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.RAR_dependencies[op.tempId()] = true;
         }
         if (!found_dependency) {
            /* the first dependency is the insertion point for moved-up code */
            insert_idx = candidate_idx;
            found_dependency = true;
            /* init register pressure */
            register_pressure = register_demand[insert_idx - 1];
         }
      }

      if (!can_reorder(candidate.get(), false) && !can_reorder_cur)
         break;

      if (!found_dependency) {
         k++;
         continue;
      }

      /* update register pressure */
      register_pressure.update(register_demand[candidate_idx - 1]);

      if (is_dependency)
         continue;
      assert(insert_idx != idx);

      // TODO: correctly calculate register pressure for this case
      bool register_pressure_unknown = false;
      /* check if candidate uses/kills an operand which is used by a dependency */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.RAR_dependencies[op.tempId()])
            register_pressure_unknown = true;
      }
      if (register_pressure_unknown) {
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.RAR_dependencies[def.tempId()] = true;
         }
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.RAR_dependencies[op.tempId()] = true;
         }
         continue;
      }

      /* check if register pressure is low enough: the diff is negative if register pressure is decreased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand temp = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure + candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + candidate_diff + temp;
      if (new_demand.exceeds(ctx.max_registers))
         break;

      /* move the candidate above the insert_idx */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = insert_idx + 1; i <= candidate_idx; i++) {
         register_demand[i] += candidate_diff;
      }
      register_demand[insert_idx] = new_demand;
      register_pressure += candidate_diff;
      insert_idx++;
      k++;
   }

   /* remember where the first user is and a heuristic stall estimate so the
    * next scheduled load does not undo the latency hiding achieved here */
   ctx.last_SMEM_dep_idx = found_dependency ? insert_idx : 0;
   ctx.last_SMEM_stall = 10 - ctx.num_waves - k;
}
423
/* Tries to hide the latency of the VMEM instruction `current` (at index idx
 * in block), analogously to schedule_SMEM: move independent instructions
 * from above to below it, then move trailing independent instructions up
 * past it, keeping register_demand consistent and below ctx.max_registers. */
void schedule_VMEM(sched_ctx& ctx, Block* block,
                   std::vector<RegisterDemand>& register_demand,
                   Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = VMEM_WINDOW_SIZE;
   int max_moves = VMEM_MAX_MOVES;
   int16_t k = 0; /* number of moves performed so far */
   bool can_reorder_cur = can_reorder(current, false);

   /* create the initial set of values which current depends on */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   for (const Operand& op : current->operands) {
      if (op.isTemp())
         ctx.depends_on[op.tempId()] = true;
   }

   /* maintain how many registers remain free when moving instructions */
   RegisterDemand register_pressure = register_demand[idx];

   /* first, check if we have instructions before current to move down */
   int insert_idx = idx + 1;
   int moving_interaction = barrier_none;
   bool moving_spill = false;

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      /* break when encountering another VMEM instruction, logical_start or barriers
       * (allow_smem is true: SMEM may be reordered relative to VMEM) */
      if (!can_reorder(candidate.get(), true) && !can_reorder_cur)
         break;
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
      if (candidate->opcode == aco_opcode::p_exit_early_if)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;

      /* break if we'd make the previous SMEM instruction stall */
      bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
      if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0)
         break;
      register_pressure.update(register_demand[candidate_idx]);

      /* if current depends on candidate, add additional dependencies and continue */
      bool can_move_down = true;
      bool writes_exec = false;
      for (const Definition& def : candidate->definitions) {
         if (def.isTemp() && ctx.depends_on[def.tempId()])
            can_move_down = false;
         if (def.isFixed() && def.physReg() == exec)
            writes_exec = true;
      }
      /* never move anything across an exec write */
      if (writes_exec)
         break;

      if (moving_spill && is_spill_reload(candidate))
         can_move_down = false;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         can_move_down = false;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (!can_move_down) {
         /* candidate stays put: everything it reads becomes a dependency too */
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.depends_on[op.tempId()] = true;
         }
         continue;
      }

      bool register_pressure_unknown = false;
      /* check if one of candidate's operands is killed by depending instruction */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.depends_on[op.tempId()]) {
            // FIXME: account for difference in register pressure
            register_pressure_unknown = true;
         }
      }
      if (register_pressure_unknown) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.depends_on[op.tempId()] = true;
         }
         continue;
      }

      /* check if register pressure is low enough: the diff is negative if register pressure is decreased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand temp = getTempRegisters(candidate);;
      if (RegisterDemand(register_pressure - candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + temp;
      if (new_demand.exceeds(ctx.max_registers))
         break;
      // TODO: we might want to look further to find a sequence of instructions to move down which doesn't exceed reg pressure

      /* move the candidate below the memory load */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = candidate_idx; i < insert_idx - 1; i++) {
         register_demand[i] -= candidate_diff;
      }
      register_demand[insert_idx - 1] = new_demand;
      register_pressure -= candidate_diff;
      insert_idx--;
      k++;
      if (candidate_idx < ctx.last_SMEM_dep_idx)
         ctx.last_SMEM_stall++;
   }

   /* create the initial set of values which depend on current */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   std::fill(ctx.RAR_dependencies.begin(), ctx.RAR_dependencies.end(), false);
   for (const Definition& def : current->definitions) {
      if (def.isTemp())
         ctx.depends_on[def.tempId()] = true;
   }

   /* find the first instruction depending on current or find another VMEM */
   insert_idx = idx;
   moving_interaction = barrier_none;
   moving_spill = false;

   bool found_dependency = false;
   /* second, check if we have instructions after current to move up */
   for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) {
      assert(candidate_idx < (int) block->instructions.size());
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      if (candidate->opcode == aco_opcode::p_logical_end)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;

      const bool writes_exec = std::any_of(candidate->definitions.begin(), candidate->definitions.end(),
                                           [](const Definition& def) {return def.isFixed() && def.physReg() == exec; });
      if (writes_exec)
         break;

      /* check if candidate depends on current */
      bool is_dependency = !can_reorder(candidate.get(), true) && !can_reorder_cur;
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.depends_on[op.tempId()]) {
            is_dependency = true;
            break;
         }
      }
      if (moving_spill && is_spill_reload(candidate))
         is_dependency = true;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         is_dependency = true;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (is_dependency) {
         /* results of dependencies are dependencies themselves; their reads
          * become RAR dependencies for the pressure bookkeeping */
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.depends_on[def.tempId()] = true;
         }
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.RAR_dependencies[op.tempId()] = true;
         }
         if (!found_dependency) {
            /* the first dependency is the insertion point for moved-up code */
            insert_idx = candidate_idx;
            found_dependency = true;
            /* init register pressure */
            register_pressure = register_demand[insert_idx - 1];
            continue;
         }
      }

      /* update register pressure */
      register_pressure.update(register_demand[candidate_idx - 1]);

      if (is_dependency || !found_dependency)
         continue;
      assert(insert_idx != idx);

      bool register_pressure_unknown = false;
      /* check if candidate uses/kills an operand which is used by a dependency */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.RAR_dependencies[op.tempId()])
            register_pressure_unknown = true;
      }
      if (register_pressure_unknown) {
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.RAR_dependencies[def.tempId()] = true;
         }
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.RAR_dependencies[op.tempId()] = true;
         }
         continue;
      }

      /* check if register pressure is low enough: the diff is negative if register pressure is decreased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand temp = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure + candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + candidate_diff + temp;
      if (new_demand.exceeds(ctx.max_registers))
         break;

      /* move the candidate above the insert_idx */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = insert_idx + 1; i <= candidate_idx; i++) {
         register_demand[i] += candidate_diff;
      }
      register_demand[insert_idx] = new_demand;
      register_pressure += candidate_diff;
      insert_idx++;
      k++;
   }
}
648
/* Moves independent instructions from above the position export `current`
 * (at index idx in block) to below it, so the export executes as early as
 * possible. Only a downward pass is needed; register demand is kept in sync
 * and bounded by ctx.max_registers. */
void schedule_position_export(sched_ctx& ctx, Block* block,
                              std::vector<RegisterDemand>& register_demand,
                              Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = POS_EXP_WINDOW_SIZE;
   int max_moves = POS_EXP_MAX_MOVES;
   int16_t k = 0; /* number of moves performed so far */

   /* create the initial set of values which current depends on */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   for (unsigned i = 0; i < current->operands.size(); i++) {
      if (current->operands[i].isTemp())
         ctx.depends_on[current->operands[i].tempId()] = true;
   }

   /* maintain how many registers remain free when moving instructions */
   RegisterDemand register_pressure = register_demand[idx];

   /* first, check if we have instructions before current to move down */
   int insert_idx = idx + 1;
   int moving_interaction = barrier_none;
   bool moving_spill = false;

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      /* break when encountering logical_start or barriers */
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
      if (candidate->opcode == aco_opcode::p_exit_early_if)
         break;
      /* don't move memory accesses past the export */
      if (candidate->isVMEM() || candidate->format == Format::SMEM)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;

      register_pressure.update(register_demand[candidate_idx]);

      /* if current depends on candidate, add additional dependencies and continue */
      bool can_move_down = true;
      bool writes_exec = false;
      for (unsigned i = 0; i < candidate->definitions.size(); i++) {
         if (candidate->definitions[i].isTemp() && ctx.depends_on[candidate->definitions[i].tempId()])
            can_move_down = false;
         if (candidate->definitions[i].isFixed() && candidate->definitions[i].physReg() == exec)
            writes_exec = true;
      }
      /* never move anything across an exec write */
      if (writes_exec)
         break;

      if (moving_spill && is_spill_reload(candidate))
         can_move_down = false;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         can_move_down = false;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (!can_move_down) {
         /* candidate stays put: everything it reads becomes a dependency too */
         for (unsigned i = 0; i < candidate->operands.size(); i++) {
            if (candidate->operands[i].isTemp())
               ctx.depends_on[candidate->operands[i].tempId()] = true;
         }
         continue;
      }

      bool register_pressure_unknown = false;
      /* check if one of candidate's operands is killed by depending instruction */
      for (unsigned i = 0; i < candidate->operands.size(); i++) {
         if (candidate->operands[i].isTemp() && ctx.depends_on[candidate->operands[i].tempId()]) {
            // FIXME: account for difference in register pressure
            register_pressure_unknown = true;
         }
      }
      if (register_pressure_unknown) {
         for (unsigned i = 0; i < candidate->operands.size(); i++) {
            if (candidate->operands[i].isTemp())
               ctx.depends_on[candidate->operands[i].tempId()] = true;
         }
         continue;
      }

      /* check if register pressure is low enough: the diff is negative if register pressure is decreased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand temp = getTempRegisters(candidate);;
      if (RegisterDemand(register_pressure - candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + temp;
      if (new_demand.exceeds(ctx.max_registers))
         break;
      // TODO: we might want to look further to find a sequence of instructions to move down which doesn't exceed reg pressure

      /* move the candidate below the export */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = candidate_idx; i < insert_idx - 1; i++) {
         register_demand[i] -= candidate_diff;
      }
      register_demand[insert_idx - 1] = new_demand;
      register_pressure -= candidate_diff;
      insert_idx--;
      k++;
   }
}
756
/* Schedules a single block: latency-hides every VMEM/SMEM load, moves
 * position exports up in the final block of a HW vertex shader, then
 * recomputes the block's register demand from the per-instruction data. */
void schedule_block(sched_ctx& ctx, Program *program, Block* block, live& live_vars)
{
   /* reset the per-block SMEM stall tracking */
   ctx.last_SMEM_dep_idx = 0;
   ctx.last_SMEM_stall = INT16_MIN;

   /* go through all instructions and find memory loads */
   for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
      Instruction* current = block->instructions[idx].get();

      /* only instructions that produce a result are worth scheduling */
      if (current->definitions.empty())
         continue;

      if (current->isVMEM())
         schedule_VMEM(ctx, block, live_vars.register_demand[block->index], current, idx);
      if (current->format == Format::SMEM)
         schedule_SMEM(ctx, block, live_vars.register_demand[block->index], current, idx);
   }

   if ((program->stage & hw_vs) && block->index == program->blocks.size() - 1) {
      /* Try to move position exports as far up as possible, to reduce register
       * usage and because ISA reference guides say so. */
      for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
         Instruction* current = block->instructions[idx].get();

         if (current->format == Format::EXP) {
            unsigned target = static_cast<Export_instruction*>(current)->dest;
            if (target >= V_008DFC_SQ_EXP_POS && target < V_008DFC_SQ_EXP_PARAM)
               schedule_position_export(ctx, block, live_vars.register_demand[block->index], current, idx);
         }
      }
   }

   /* resummarize the block's register demand */
   block->register_demand = RegisterDemand();
   for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
      block->register_demand.update(live_vars.register_demand[block->index][idx]);
   }
}
795
796
/* Entry point: schedules every block of the program and afterwards updates
 * the program's register demand and wave count. */
void schedule_program(Program *program, live& live_vars)
{
   sched_ctx ctx;
   /* the dependency bitsets are indexed by temp id */
   ctx.depends_on.resize(program->peekAllocationId());
   ctx.RAR_dependencies.resize(program->peekAllocationId());
   /* Allowing the scheduler to reduce the number of waves to as low as 5
    * improves performance of Thrones of Britannia significantly and doesn't
    * seem to hurt anything else. */
   //TODO: maybe use some sort of heuristic instead
   //TODO: this also increases window-size/max-moves? did I realize that at the time?
   ctx.num_waves = std::min<uint16_t>(program->num_waves, 5);
   assert(ctx.num_waves);
   uint16_t total_sgpr_regs = program->physical_sgprs;
   uint16_t max_addressible_sgpr = program->sgpr_limit;
   /* Per-wave register budget: 256 VGPRs split between the waves, rounded
    * down to a multiple of 4, minus 2 spare; SGPRs likewise split and
    * rounded down via sgpr_alloc_granule, clamped to the addressible limit.
    * NOTE(review): the & ~sgpr_alloc_granule rounding assumes the field is a
    * granule-minus-one mask — verify against the Program definition. */
   ctx.max_registers = { int16_t(((256 / ctx.num_waves) & ~3) - 2), std::min<int16_t>(((total_sgpr_regs / ctx.num_waves) & ~program->sgpr_alloc_granule) - 2, max_addressible_sgpr)};

   for (Block& block : program->blocks)
      schedule_block(ctx, program, &block, live_vars);

   /* update max_reg_demand and num_waves */
   RegisterDemand new_demand;
   for (Block& block : program->blocks) {
      new_demand.update(block.register_demand);
   }
   update_vgpr_sgpr_demand(program, new_demand);

   /* if enabled, this code asserts that register_demand is updated correctly */
#if 0
   int prev_num_waves = program->num_waves;
   const RegisterDemand prev_max_demand = program->max_reg_demand;

   std::vector<RegisterDemand> demands(program->blocks.size());
   for (unsigned j = 0; j < program->blocks.size(); j++) {
      demands[j] = program->blocks[j].register_demand;
   }

   struct radv_nir_compiler_options options;
   options.chip_class = program->chip_class;
   live live_vars2 = aco::live_var_analysis(program, &options);

   for (unsigned j = 0; j < program->blocks.size(); j++) {
      Block &b = program->blocks[j];
      for (unsigned i = 0; i < b.instructions.size(); i++)
         assert(live_vars.register_demand[b.index][i] == live_vars2.register_demand[b.index][i]);
      assert(b.register_demand == demands[j]);
   }

   assert(program->max_reg_demand == prev_max_demand);
   assert(program->num_waves == prev_num_waves);
#endif
}
848
849 }