aco: only skip RAR dependencies if the variable is killed somewhere
[mesa.git] / src / amd / compiler / aco_scheduler.cpp
1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #include "aco_ir.h"
26 #include <unordered_set>
27 #include <algorithm>
28
29 #include "vulkan/radv_shader.h" // for radv_nir_compiler_options
30 #include "amdgfxregs.h"
31
/* Scheduling window sizes and move limits scale inversely with occupancy
 * (ctx.num_waves): the more waves are resident, the more latency hiding the
 * hardware already provides, so the scheduler does less work. */
#define SMEM_WINDOW_SIZE (350 - ctx.num_waves * 35)
#define VMEM_WINDOW_SIZE (1024 - ctx.num_waves * 64)
#define POS_EXP_WINDOW_SIZE 512
#define SMEM_MAX_MOVES (64 - ctx.num_waves * 4)
#define VMEM_MAX_MOVES (128 - ctx.num_waves * 8)
#define POS_EXP_MAX_MOVES 512
38
39 namespace aco {
40
struct sched_ctx {
   /* indexed by temp id: temps the currently scheduled instruction depends on */
   std::vector<bool> depends_on;
   /* indexed by temp id: temps read by instructions that depend on the
    * currently scheduled instruction (read-after-read dependencies) */
   std::vector<bool> RAR_dependencies;
   /* register demand the scheduler must not exceed while moving instructions */
   RegisterDemand max_registers;
   /* wave count the scheduler optimizes for (may be lower than the program's) */
   int16_t num_waves;
   /* heuristic stall counter for the last SMEM load; moves that would stall
    * it are rejected while this is >= 0 */
   int16_t last_SMEM_stall;
   /* index of the first user of the last scheduled SMEM load */
   int last_SMEM_dep_idx;
};
49
50 /* This scheduler is a simple bottom-up pass based on ideas from
51 * "A Novel Lightweight Instruction Scheduling Algorithm for Just-In-Time Compiler"
52 * from Xiaohua Shi and Peng Guo.
53 * The basic approach is to iterate over all instructions. When a memory instruction
54 * is encountered it tries to move independent instructions from above and below
55 * between the memory instruction and its first user.
56 * The novelty is that this scheduler cares for the current register pressure:
57 * Instructions will only be moved if the register pressure won't exceed a certain bound.
58 */
59
/* Moves the element at position idx so it ends up directly before the slot
 * that was previously at position `before`; everything in between shifts by
 * one to fill the gap. A no-op when idx == before. */
template <typename T>
void move_element(T& list, size_t idx, size_t before) {
   if (idx == before)
      return;

   if (idx < before) {
      /* shift the element towards the back */
      auto first = std::next(list.begin(), idx);
      auto last = std::next(list.begin(), before);
      std::rotate(first, std::next(first), last);
   } else {
      /* shift the element towards the front */
      auto first = std::next(list.begin(), before);
      auto last = std::next(list.begin(), idx + 1);
      std::rotate(first, std::prev(last), last);
   }
}
72
73 static RegisterDemand getLiveChanges(aco_ptr<Instruction>& instr)
74 {
75 RegisterDemand changes;
76 for (const Definition& def : instr->definitions) {
77 if (!def.isTemp() || def.isKill())
78 continue;
79 changes += def.getTemp();
80 }
81
82 for (const Operand& op : instr->operands) {
83 if (!op.isTemp() || !op.isFirstKill())
84 continue;
85 changes -= op.getTemp();
86 }
87
88 return changes;
89 }
90
91 static RegisterDemand getTempRegisters(aco_ptr<Instruction>& instr)
92 {
93 RegisterDemand temp_registers;
94 for (const Definition& def : instr->definitions) {
95 if (!def.isTemp() || !def.isKill())
96 continue;
97 temp_registers += def.getTemp();
98 }
99 return temp_registers;
100 }
101
102 static bool is_spill_reload(aco_ptr<Instruction>& instr)
103 {
104 return instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload;
105 }
106
/* Checks whether `instr` may be crossed while moving `current`.
 * `moving_interaction` accumulates the barrier interaction of everything
 * already being moved together with `current`. */
bool can_move_instr(aco_ptr<Instruction>& instr, Instruction* current, int moving_interaction)
{
   /* don't move exports so that they stay closer together */
   if (instr->format == Format::EXP)
      return false;

   /* don't move s_memtime/s_memrealtime */
   if (instr->opcode == aco_opcode::s_memtime || instr->opcode == aco_opcode::s_memrealtime)
      return false;

   /* handle barriers */

   /* TODO: instead of stopping, maybe try to move the barriers and any
    * instructions interacting with them instead? */
   if (instr->format != Format::PSEUDO_BARRIER) {
      if (instr->opcode == aco_opcode::s_barrier) {
         /* an s_barrier may only be crossed by memory instructions explicitly
          * marked reorderable, and only if nothing moved along so far
          * interacts with a barrier */
         bool can_reorder = false;
         switch (current->format) {
         case Format::SMEM:
            can_reorder = static_cast<SMEM_instruction*>(current)->can_reorder;
            break;
         case Format::MUBUF:
            can_reorder = static_cast<MUBUF_instruction*>(current)->can_reorder;
            break;
         case Format::MIMG:
            can_reorder = static_cast<MIMG_instruction*>(current)->can_reorder;
            break;
         default:
            break;
         }
         return can_reorder && moving_interaction == barrier_none;
      } else {
         /* anything that is neither a pseudo-barrier nor s_barrier is fine */
         return true;
      }
   }

   /* instr is a pseudo memory barrier: check it against the combined barrier
    * interaction of `current` plus everything moved with it */
   int interaction = get_barrier_interaction(current);
   interaction |= moving_interaction;

   switch (instr->opcode) {
   case aco_opcode::p_memory_barrier_atomic:
      return !(interaction & barrier_atomic);
   /* For now, buffer and image barriers are treated the same. this is because of
    * dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.device.payload_nonlocal.buffer.guard_nonlocal.image.comp
    * which seems to use an image load to determine if the result of a buffer load is valid. So the ordering of the two loads is important.
    * I /think/ we should probably eventually expand the meaning of a buffer barrier so that all buffer operations before it, must stay before it
    * and that both image and buffer operations after it, must stay after it. We should also do the same for image barriers.
    * Or perhaps the problem is that we don't have a combined barrier instruction for both buffers and images, but the CTS test expects us to?
    * Either way, this solution should work. */
   case aco_opcode::p_memory_barrier_buffer:
   case aco_opcode::p_memory_barrier_image:
      return !(interaction & (barrier_image | barrier_buffer));
   case aco_opcode::p_memory_barrier_shared:
      return !(interaction & barrier_shared);
   case aco_opcode::p_memory_barrier_all:
      return interaction == barrier_none;
   default:
      return false;
   }
}
167
168 bool can_reorder(Instruction* candidate, bool allow_smem)
169 {
170 switch (candidate->format) {
171 case Format::SMEM:
172 return allow_smem || static_cast<SMEM_instruction*>(candidate)->can_reorder;
173 case Format::MUBUF:
174 return static_cast<MUBUF_instruction*>(candidate)->can_reorder;
175 case Format::MIMG:
176 return static_cast<MIMG_instruction*>(candidate)->can_reorder;
177 case Format::MTBUF:
178 return static_cast<MTBUF_instruction*>(candidate)->can_reorder;
179 case Format::FLAT:
180 case Format::GLOBAL:
181 case Format::SCRATCH:
182 return false;
183 default:
184 return true;
185 }
186 }
187
/* Schedules around one SMEM load `current` at position idx:
 * 1) moves independent instructions from above `current` down below it,
 * 2) moves independent instructions from below `current` up above its first
 *    user, so the load's latency is hidden by the instructions in between.
 * register_demand is kept up to date for every move; moves that would exceed
 * ctx.max_registers are rejected. */
void schedule_SMEM(sched_ctx& ctx, Block* block,
                   std::vector<RegisterDemand>& register_demand,
                   Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = SMEM_WINDOW_SIZE;
   int max_moves = SMEM_MAX_MOVES;
   int16_t k = 0;
   bool can_reorder_cur = can_reorder(current, false);

   /* don't move s_memtime/s_memrealtime */
   if (current->opcode == aco_opcode::s_memtime || current->opcode == aco_opcode::s_memrealtime)
      return;

   /* create the initial set of values which current depends on */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   for (const Operand& op : current->operands) {
      if (op.isTemp())
         ctx.depends_on[op.tempId()] = true;
   }

   /* maintain how many registers remain free when moving instructions */
   RegisterDemand register_pressure = register_demand[idx];

   /* first, check if we have instructions before current to move down */
   int insert_idx = idx + 1;
   int moving_interaction = barrier_none;
   bool moving_spill = false;

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      /* break if we'd make the previous SMEM instruction stall */
      bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
      if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0)
         break;

      /* break when encountering another MEM instruction, logical_start or barriers */
      if (!can_reorder(candidate.get(), false) && !can_reorder_cur)
         break;
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
      if (candidate->opcode == aco_opcode::p_exit_early_if)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;
      register_pressure.update(register_demand[candidate_idx]);

      /* if current depends on candidate, add additional dependencies and continue */
      bool can_move_down = true;
      bool writes_exec = false;
      for (const Definition& def : candidate->definitions) {
         if (def.isTemp() && ctx.depends_on[def.tempId()])
            can_move_down = false;
         if (def.isFixed() && def.physReg() == exec)
            writes_exec = true;
      }
      /* moving across an exec write would change which lanes execute current */
      if (writes_exec)
         break;

      if (moving_spill && is_spill_reload(candidate))
         can_move_down = false;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         can_move_down = false;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (!can_move_down) {
         /* candidate stays put: its operands become additional dependencies */
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.depends_on[op.tempId()] = true;
         }
         continue;
      }

      bool register_pressure_unknown = false;
      /* check if one of candidate's operands is killed by depending instruction */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.depends_on[op.tempId()]) {
            // FIXME: account for difference in register pressure
            register_pressure_unknown = true;
         }
      }
      if (register_pressure_unknown) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.depends_on[op.tempId()] = true;
         }
         continue;
      }

      /* check if register pressure is low enough: the diff is negative if register pressure is increased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand tempDemand = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure - candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand tempDemand2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - tempDemand2 + tempDemand;
      if (new_demand.exceeds(ctx.max_registers))
         break;
      // TODO: we might want to look further to find a sequence of instructions to move down which doesn't exceed reg pressure

      /* move the candidate below the memory load */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = candidate_idx; i < insert_idx - 1; i++) {
         register_demand[i] -= candidate_diff;
      }
      register_demand[insert_idx - 1] = new_demand;
      register_pressure -= candidate_diff;

      if (candidate_idx < ctx.last_SMEM_dep_idx)
         ctx.last_SMEM_stall++;
      insert_idx--;
      k++;
   }

   /* create the initial set of values which depend on current */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   std::fill(ctx.RAR_dependencies.begin(), ctx.RAR_dependencies.end(), false);
   for (const Definition& def : current->definitions) {
      if (def.isTemp())
         ctx.depends_on[def.tempId()] = true;
   }

   /* find the first instruction depending on current or find another MEM */
   insert_idx = idx + 1;
   moving_interaction = barrier_none;
   moving_spill = false;

   bool found_dependency = false;
   /* second, check if we have instructions after current to move up */
   for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) {
      assert(candidate_idx < (int) block->instructions.size());
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      if (candidate->opcode == aco_opcode::p_logical_end)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;

      const bool writes_exec = std::any_of(candidate->definitions.begin(), candidate->definitions.end(),
                                           [](const Definition& def) { return def.isFixed() && def.physReg() == exec;});
      if (writes_exec)
         break;

      /* check if candidate depends on current */
      bool is_dependency = std::any_of(candidate->operands.begin(), candidate->operands.end(),
                                       [&ctx](const Operand& op) { return op.isTemp() && ctx.depends_on[op.tempId()];});
      if (moving_spill && is_spill_reload(candidate))
         is_dependency = true;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         is_dependency = true;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (is_dependency) {
         /* candidate can't move up: everything it defines (and reads, for
          * RAR tracking) constrains further candidates */
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.depends_on[def.tempId()] = true;
         }
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.RAR_dependencies[op.tempId()] = true;
         }
         if (!found_dependency) {
            /* the first dependency marks the insertion point for moves */
            insert_idx = candidate_idx;
            found_dependency = true;
            /* init register pressure */
            register_pressure = register_demand[insert_idx - 1];
         }
      }

      if (!can_reorder(candidate.get(), false) && !can_reorder_cur)
         break;

      if (!found_dependency) {
         k++;
         continue;
      }

      /* update register pressure */
      register_pressure.update(register_demand[candidate_idx - 1]);

      if (is_dependency)
         continue;
      assert(insert_idx != idx);

      // TODO: correctly calculate register pressure for this case
      bool register_pressure_unknown = false;
      /* check if candidate uses/kills an operand which is used by a dependency */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.RAR_dependencies[op.tempId()])
            register_pressure_unknown = true;
      }
      if (register_pressure_unknown) {
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.RAR_dependencies[def.tempId()] = true;
         }
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.RAR_dependencies[op.tempId()] = true;
         }
         continue;
      }

      /* check if register pressure is low enough: the diff is negative if register pressure is decreased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand temp = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure + candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + candidate_diff + temp;
      if (new_demand.exceeds(ctx.max_registers))
         break;

      /* move the candidate above the insert_idx */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = insert_idx + 1; i <= candidate_idx; i++) {
         register_demand[i] += candidate_diff;
      }
      register_demand[insert_idx] = new_demand;
      register_pressure += candidate_diff;
      insert_idx++;
      k++;
   }

   /* remember where the first user of this load ended up and a stall
    * heuristic so later scheduling doesn't undo the latency hiding */
   ctx.last_SMEM_dep_idx = found_dependency ? insert_idx : 0;
   ctx.last_SMEM_stall = 10 - ctx.num_waves - k;
}
423
/* Schedules around one VMEM load `current` at position idx, analogous to
 * schedule_SMEM: first moves independent instructions from above `current`
 * down, then moves independent instructions from below up above the first
 * user, while keeping register_demand accurate and within ctx.max_registers. */
void schedule_VMEM(sched_ctx& ctx, Block* block,
                   std::vector<RegisterDemand>& register_demand,
                   Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = VMEM_WINDOW_SIZE;
   int max_moves = VMEM_MAX_MOVES;
   int16_t k = 0;
   bool can_reorder_cur = can_reorder(current, false);

   /* create the initial set of values which current depends on */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   std::fill(ctx.RAR_dependencies.begin(), ctx.RAR_dependencies.end(), false);
   for (const Operand& op : current->operands) {
      if (op.isTemp()) {
         ctx.depends_on[op.tempId()] = true;
         /* only operands that die here matter for RAR pressure tracking */
         if (op.isFirstKill())
            ctx.RAR_dependencies[op.tempId()] = true;
      }
   }

   /* maintain how many registers remain free when moving instructions */
   RegisterDemand register_pressure = register_demand[idx];

   /* first, check if we have instructions before current to move down */
   int insert_idx = idx + 1;
   int moving_interaction = barrier_none;
   bool moving_spill = false;

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      /* break when encountering another VMEM instruction, logical_start or barriers */
      if (!can_reorder(candidate.get(), true) && !can_reorder_cur)
         break;
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
      if (candidate->opcode == aco_opcode::p_exit_early_if)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;

      /* break if we'd make the previous SMEM instruction stall */
      bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
      if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0)
         break;
      register_pressure.update(register_demand[candidate_idx]);

      /* if current depends on candidate, add additional dependencies and continue */
      bool can_move_down = !candidate->isVMEM();
      bool writes_exec = false;
      for (const Definition& def : candidate->definitions) {
         if (def.isTemp() && ctx.depends_on[def.tempId()])
            can_move_down = false;
         if (def.isFixed() && def.physReg() == exec)
            writes_exec = true;
      }
      /* moving across an exec write would change which lanes execute current */
      if (writes_exec)
         break;

      if (moving_spill && is_spill_reload(candidate))
         can_move_down = false;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         can_move_down = false;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (!can_move_down) {
         /* candidate stays put: its operands become additional dependencies */
         for (const Operand& op : candidate->operands) {
            if (op.isTemp()) {
               ctx.depends_on[op.tempId()] = true;
               if (op.isFirstKill())
                  ctx.RAR_dependencies[op.tempId()] = true;
            }
         }
         continue;
      }

      bool register_pressure_unknown = false;
      /* check if one of candidate's operands is killed by depending instruction */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.RAR_dependencies[op.tempId()]) {
            // FIXME: account for difference in register pressure
            register_pressure_unknown = true;
         }
      }
      if (register_pressure_unknown) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp()) {
               ctx.depends_on[op.tempId()] = true;
               if (op.isFirstKill())
                  ctx.RAR_dependencies[op.tempId()] = true;
            }
         }
         continue;
      }

      /* check if register pressure is low enough: the diff is negative if register pressure is increased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand temp = getTempRegisters(candidate);;
      if (RegisterDemand(register_pressure - candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + temp;
      if (new_demand.exceeds(ctx.max_registers))
         break;
      // TODO: we might want to look further to find a sequence of instructions to move down which doesn't exceed reg pressure

      /* move the candidate below the memory load */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = candidate_idx; i < insert_idx - 1; i++) {
         register_demand[i] -= candidate_diff;
      }
      register_demand[insert_idx - 1] = new_demand;
      register_pressure -= candidate_diff;
      insert_idx--;
      k++;
      if (candidate_idx < ctx.last_SMEM_dep_idx)
         ctx.last_SMEM_stall++;
   }

   /* create the initial set of values which depend on current */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   std::fill(ctx.RAR_dependencies.begin(), ctx.RAR_dependencies.end(), false);
   for (const Definition& def : current->definitions) {
      if (def.isTemp())
         ctx.depends_on[def.tempId()] = true;
   }

   /* find the first instruction depending on current or find another VMEM */
   insert_idx = idx;
   moving_interaction = barrier_none;
   moving_spill = false;

   bool found_dependency = false;
   /* second, check if we have instructions after current to move up */
   for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) {
      assert(candidate_idx < (int) block->instructions.size());
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      if (candidate->opcode == aco_opcode::p_logical_end)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;

      const bool writes_exec = std::any_of(candidate->definitions.begin(), candidate->definitions.end(),
                                           [](const Definition& def) {return def.isFixed() && def.physReg() == exec; });
      if (writes_exec)
         break;

      /* check if candidate depends on current */
      bool is_dependency = !can_reorder(candidate.get(), true) && !can_reorder_cur;
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.depends_on[op.tempId()]) {
            is_dependency = true;
            break;
         }
      }
      if (moving_spill && is_spill_reload(candidate))
         is_dependency = true;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         is_dependency = true;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (is_dependency) {
         /* candidate can't move up: record its definitions and reads as
          * constraints for later candidates */
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.depends_on[def.tempId()] = true;
         }
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.RAR_dependencies[op.tempId()] = true;
         }
         if (!found_dependency) {
            /* the first dependency marks the insertion point for moves */
            insert_idx = candidate_idx;
            found_dependency = true;
            /* init register pressure */
            register_pressure = register_demand[insert_idx - 1];
            continue;
         }
      } else if (candidate->isVMEM()) {
         /* don't move another VMEM load above current's uses; treat its
          * results as dependencies */
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.depends_on[def.tempId()] = true;
         }
      }

      /* update register pressure */
      register_pressure.update(register_demand[candidate_idx - 1]);

      if (is_dependency || !found_dependency)
         continue;
      assert(insert_idx != idx);

      bool register_pressure_unknown = false;
      /* check if candidate uses/kills an operand which is used by a dependency */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && op.isFirstKill() && ctx.RAR_dependencies[op.tempId()])
            register_pressure_unknown = true;
      }
      if (register_pressure_unknown) {
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.depends_on[def.tempId()] = true;
         }
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.RAR_dependencies[op.tempId()] = true;
         }
         continue;
      }

      /* check if register pressure is low enough: the diff is negative if register pressure is decreased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand temp = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure + candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + candidate_diff + temp;
      if (new_demand.exceeds(ctx.max_registers))
         break;

      /* move the candidate above the insert_idx */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = insert_idx + 1; i <= candidate_idx; i++) {
         register_demand[i] += candidate_diff;
      }
      register_demand[insert_idx] = new_demand;
      register_pressure += candidate_diff;
      insert_idx++;
      k++;
   }
}
663
/* Tries to move instructions from above a position export down below it, so
 * the export executes as early as possible. Only the downward direction is
 * scheduled here; register_demand is kept accurate and within
 * ctx.max_registers for every move. */
void schedule_position_export(sched_ctx& ctx, Block* block,
                              std::vector<RegisterDemand>& register_demand,
                              Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = POS_EXP_WINDOW_SIZE;
   int max_moves = POS_EXP_MAX_MOVES;
   int16_t k = 0;

   /* create the initial set of values which current depends on */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   std::fill(ctx.RAR_dependencies.begin(), ctx.RAR_dependencies.end(), false);
   for (const Operand& op : current->operands) {
      if (op.isTemp()) {
         ctx.depends_on[op.tempId()] = true;
         /* only operands that die here matter for RAR pressure tracking */
         if (op.isFirstKill())
            ctx.RAR_dependencies[op.tempId()] = true;
      }
   }

   /* maintain how many registers remain free when moving instructions */
   RegisterDemand register_pressure = register_demand[idx];

   /* first, check if we have instructions before current to move down */
   int insert_idx = idx + 1;
   int moving_interaction = barrier_none;
   bool moving_spill = false;

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      /* break when encountering logical_start or barriers */
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
      if (candidate->opcode == aco_opcode::p_exit_early_if)
         break;
      /* don't move the export across memory operations */
      if (candidate->isVMEM() || candidate->format == Format::SMEM)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;

      register_pressure.update(register_demand[candidate_idx]);

      /* if current depends on candidate, add additional dependencies and continue */
      bool can_move_down = true;
      bool writes_exec = false;
      for (unsigned i = 0; i < candidate->definitions.size(); i++) {
         if (candidate->definitions[i].isTemp() && ctx.depends_on[candidate->definitions[i].tempId()])
            can_move_down = false;
         if (candidate->definitions[i].isFixed() && candidate->definitions[i].physReg() == exec)
            writes_exec = true;
      }
      /* moving across an exec write would change which lanes execute current */
      if (writes_exec)
         break;

      if (moving_spill && is_spill_reload(candidate))
         can_move_down = false;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         can_move_down = false;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (!can_move_down) {
         /* candidate stays put: its operands become additional dependencies */
         for (const Operand& op : candidate->operands) {
            if (op.isTemp()) {
               ctx.depends_on[op.tempId()] = true;
               if (op.isFirstKill())
                  ctx.RAR_dependencies[op.tempId()] = true;
            }
         }
         continue;
      }

      bool register_pressure_unknown = false;
      /* check if one of candidate's operands is killed by depending instruction */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.RAR_dependencies[op.tempId()]) {
            // FIXME: account for difference in register pressure
            register_pressure_unknown = true;
         }
      }
      if (register_pressure_unknown) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp()) {
               ctx.depends_on[op.tempId()] = true;
               if (op.isFirstKill())
                  ctx.RAR_dependencies[op.tempId()] = true;
            }
         }
         continue;
      }

      /* check if register pressure is low enough: the diff is negative if register pressure is increased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand temp = getTempRegisters(candidate);;
      if (RegisterDemand(register_pressure - candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + temp;
      if (new_demand.exceeds(ctx.max_registers))
         break;
      // TODO: we might want to look further to find a sequence of instructions to move down which doesn't exceed reg pressure

      /* move the candidate below the export */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = candidate_idx; i < insert_idx - 1; i++) {
         register_demand[i] -= candidate_diff;
      }
      register_demand[insert_idx - 1] = new_demand;
      register_pressure -= candidate_diff;
      insert_idx--;
      k++;
   }
}
781
782 void schedule_block(sched_ctx& ctx, Program *program, Block* block, live& live_vars)
783 {
784 ctx.last_SMEM_dep_idx = 0;
785 ctx.last_SMEM_stall = INT16_MIN;
786
787 /* go through all instructions and find memory loads */
788 for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
789 Instruction* current = block->instructions[idx].get();
790
791 if (current->definitions.empty())
792 continue;
793
794 if (current->isVMEM())
795 schedule_VMEM(ctx, block, live_vars.register_demand[block->index], current, idx);
796 if (current->format == Format::SMEM)
797 schedule_SMEM(ctx, block, live_vars.register_demand[block->index], current, idx);
798 }
799
800 if ((program->stage & hw_vs) && block->index == program->blocks.size() - 1) {
801 /* Try to move position exports as far up as possible, to reduce register
802 * usage and because ISA reference guides say so. */
803 for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
804 Instruction* current = block->instructions[idx].get();
805
806 if (current->format == Format::EXP) {
807 unsigned target = static_cast<Export_instruction*>(current)->dest;
808 if (target >= V_008DFC_SQ_EXP_POS && target < V_008DFC_SQ_EXP_PARAM)
809 schedule_position_export(ctx, block, live_vars.register_demand[block->index], current, idx);
810 }
811 }
812 }
813
814 /* resummarize the block's register demand */
815 block->register_demand = RegisterDemand();
816 for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
817 block->register_demand.update(live_vars.register_demand[block->index][idx]);
818 }
819 }
820
821
/* Entry point: schedules every block of the program and updates the
 * program's register demand / wave count afterwards. */
void schedule_program(Program *program, live& live_vars)
{
   sched_ctx ctx;
   /* one flag per SSA temp id */
   ctx.depends_on.resize(program->peekAllocationId());
   ctx.RAR_dependencies.resize(program->peekAllocationId());
   /* Allowing the scheduler to reduce the number of waves to as low as 5
    * improves performance of Thrones of Britannia significantly and doesn't
    * seem to hurt anything else. */
   if (program->num_waves <= 5)
      ctx.num_waves = program->num_waves;
   else if (program->max_reg_demand.vgpr >= 32)
      ctx.num_waves = 5;
   else if (program->max_reg_demand.vgpr >= 28)
      ctx.num_waves = 6;
   else if (program->max_reg_demand.vgpr >= 24)
      ctx.num_waves = 7;
   else
      ctx.num_waves = 8;

   assert(ctx.num_waves > 0 && ctx.num_waves <= program->num_waves);
   /* register budget for the chosen wave count; the VGPR budget is rounded
    * down to the allocation granularity with a small safety margin */
   ctx.max_registers = { int16_t(((256 / ctx.num_waves) & ~3) - 2), int16_t(get_addr_sgpr_from_waves(program, ctx.num_waves))};

   for (Block& block : program->blocks)
      schedule_block(ctx, program, &block, live_vars);

   /* update max_reg_demand and num_waves */
   RegisterDemand new_demand;
   for (Block& block : program->blocks) {
      new_demand.update(block.register_demand);
   }
   update_vgpr_sgpr_demand(program, new_demand);

   /* if enabled, this code asserts that register_demand is updated correctly */
#if 0
   int prev_num_waves = program->num_waves;
   const RegisterDemand prev_max_demand = program->max_reg_demand;

   std::vector<RegisterDemand> demands(program->blocks.size());
   for (unsigned j = 0; j < program->blocks.size(); j++) {
      demands[j] = program->blocks[j].register_demand;
   }

   struct radv_nir_compiler_options options;
   options.chip_class = program->chip_class;
   live live_vars2 = aco::live_var_analysis(program, &options);

   for (unsigned j = 0; j < program->blocks.size(); j++) {
      Block &b = program->blocks[j];
      for (unsigned i = 0; i < b.instructions.size(); i++)
         assert(live_vars.register_demand[b.index][i] == live_vars2.register_demand[b.index][i]);
      assert(b.register_demand == demands[j]);
   }

   assert(program->max_reg_demand == prev_max_demand);
   assert(program->num_waves == prev_num_waves);
#endif
}
879
880 }