aco: fix accidental reordering of instructions when scheduling
[mesa.git] / src / amd / compiler / aco_scheduler.cpp
/*
 * Copyright © 2018 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include "aco_ir.h"
#include <unordered_set>
#include <algorithm>

#include "vulkan/radv_shader.h" // for radv_nir_compiler_options
#include "amdgfxregs.h"

#define SMEM_WINDOW_SIZE (350 - ctx.num_waves * 35)
#define VMEM_WINDOW_SIZE (1024 - ctx.num_waves * 64)
#define POS_EXP_WINDOW_SIZE 512
#define SMEM_MAX_MOVES (64 - ctx.num_waves * 4)
#define VMEM_MAX_MOVES (128 - ctx.num_waves * 8)
/* creating clauses decreases def-use distances, so make clause creation less aggressive at lower num_waves */
#define VMEM_CLAUSE_MAX_GRAB_DIST ((ctx.num_waves - 1) * 8)
#define POS_EXP_MAX_MOVES 512
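
/* Illustrative numbers (hypothetical, assuming num_waves = 8): the SMEM
 * window then spans 350 - 8 * 35 = 70 instructions with at most
 * 64 - 8 * 4 = 32 moves, and the VMEM window spans 1024 - 8 * 64 = 512
 * instructions with at most 128 - 8 * 8 = 64 moves. */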

namespace aco {

struct sched_ctx {
   std::vector<bool> depends_on;
   std::vector<bool> RAR_dependencies;
   /* For downwards VMEM scheduling, same as RAR_dependencies but excludes the
    * instructions in the clause, since new instructions in the clause are not
    * moved past any other instructions in the clause. */
   std::vector<bool> new_RAR_dependencies;

   RegisterDemand max_registers;
   int16_t num_waves;
   int16_t last_SMEM_stall;
   int last_SMEM_dep_idx;
};
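/* The dependency bit-vectors above are indexed by temporary id;
 * schedule_program() sizes them to program->peekAllocationId(). */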

/* This scheduler is a simple bottom-up pass based on ideas from
 * "A Novel Lightweight Instruction Scheduling Algorithm for Just-In-Time Compiler"
 * by Xiaohua Shi and Peng Guo.
 * The basic approach is to iterate over all instructions. When a memory instruction
 * is encountered, the scheduler tries to move independent instructions from above
 * and below between the memory instruction and its first user.
 * The novelty is that this scheduler cares about the current register pressure:
 * instructions will only be moved if the register pressure won't exceed a certain bound.
 */

template <typename T>
void move_element(T& list, size_t idx, size_t before) {
   if (idx < before) {
      auto begin = std::next(list.begin(), idx);
      auto end = std::next(list.begin(), before);
      std::rotate(begin, begin + 1, end);
   } else if (idx > before) {
      auto begin = std::next(list.begin(), before);
      auto end = std::next(list.begin(), idx + 1);
      std::rotate(begin, end - 1, end);
   }
}
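
/* A minimal illustration (hypothetical values): with a list {a, b, c, d},
 * move_element(list, 0, 3) rotates 'a' down to yield {b, c, a, d}, and
 * move_element(list, 3, 1) rotates 'd' up to yield {a, d, b, c}. */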

static RegisterDemand getLiveChanges(aco_ptr<Instruction>& instr)
{
   RegisterDemand changes;
   for (const Definition& def : instr->definitions) {
      if (!def.isTemp() || def.isKill())
         continue;
      changes += def.getTemp();
   }

   for (const Operand& op : instr->operands) {
      if (!op.isTemp() || !op.isFirstKill())
         continue;
      changes -= op.getTemp();
   }

   return changes;
}

static RegisterDemand getTempRegisters(aco_ptr<Instruction>& instr)
{
   RegisterDemand temp_registers;
   for (const Definition& def : instr->definitions) {
      if (!def.isTemp() || !def.isKill())
         continue;
      temp_registers += def.getTemp();
   }
   return temp_registers;
}
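
/* Worked example (hypothetical temps): for "v_add_f32 %t2, %t0, %t1" where
 * %t0 and %t1 are both first-killed 1-VGPR temps, getLiveChanges() reports
 * a net change of 1 - 2 = -1 VGPR, while getTempRegisters() only counts
 * definitions that are immediately dead again (e.g. an unused carry-out),
 * which occupy a register at just this one point. */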

static bool is_spill_reload(aco_ptr<Instruction>& instr)
{
   return instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload;
}

bool can_move_instr(aco_ptr<Instruction>& instr, Instruction* current, int moving_interaction)
{
   /* don't move exports so that they stay closer together */
   if (instr->format == Format::EXP)
      return false;

   /* don't move s_memtime/s_memrealtime */
   if (instr->opcode == aco_opcode::s_memtime || instr->opcode == aco_opcode::s_memrealtime)
      return false;

   /* handle barriers */

   /* TODO: instead of stopping, maybe try to move the barriers and any
    * instructions interacting with them? */
   if (instr->format != Format::PSEUDO_BARRIER) {
      if (instr->opcode == aco_opcode::s_barrier) {
         bool can_reorder = false;
         switch (current->format) {
         case Format::SMEM:
            can_reorder = static_cast<SMEM_instruction*>(current)->can_reorder;
            break;
         case Format::MUBUF:
            can_reorder = static_cast<MUBUF_instruction*>(current)->can_reorder;
            break;
         case Format::MIMG:
            can_reorder = static_cast<MIMG_instruction*>(current)->can_reorder;
            break;
         default:
            break;
         }
         return can_reorder && moving_interaction == barrier_none;
      } else {
         return true;
      }
   }

   int interaction = get_barrier_interaction(current);
   interaction |= moving_interaction;

   switch (instr->opcode) {
   case aco_opcode::p_memory_barrier_atomic:
      return !(interaction & barrier_atomic);
   /* For now, buffer and image barriers are treated the same. This is because of
    * dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.device.payload_nonlocal.buffer.guard_nonlocal.image.comp
    * which seems to use an image load to determine if the result of a buffer load is valid,
    * so the ordering of the two loads is important.
    * I /think/ we should probably eventually expand the meaning of a buffer barrier so
    * that all buffer operations before it must stay before it, and that both image and
    * buffer operations after it must stay after it. We should also do the same for image barriers.
    * Or perhaps the problem is that we don't have a combined barrier instruction for both
    * buffers and images, but the CTS test expects us to?
    * Either way, this solution should work. */
   case aco_opcode::p_memory_barrier_buffer:
   case aco_opcode::p_memory_barrier_image:
      return !(interaction & (barrier_image | barrier_buffer));
   case aco_opcode::p_memory_barrier_shared:
      return !(interaction & barrier_shared);
   case aco_opcode::p_memory_barrier_all:
      return interaction == barrier_none;
   default:
      return false;
   }
}
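
/* Illustrative example: a p_memory_barrier_shared candidate may only move
 * across `current` if neither `current` nor any instruction already being
 * moved with it touches shared memory, i.e. barrier_shared is unset in the
 * combined interaction mask. */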

bool can_reorder(Instruction* candidate)
{
   switch (candidate->format) {
   case Format::SMEM:
      return static_cast<SMEM_instruction*>(candidate)->can_reorder;
   case Format::MUBUF:
      return static_cast<MUBUF_instruction*>(candidate)->can_reorder;
   case Format::MIMG:
      return static_cast<MIMG_instruction*>(candidate)->can_reorder;
   case Format::MTBUF:
      return static_cast<MTBUF_instruction*>(candidate)->can_reorder;
   case Format::FLAT:
   case Format::GLOBAL:
   case Format::SCRATCH:
      return false;
   default:
      return true;
   }
}

void schedule_SMEM(sched_ctx& ctx, Block* block,
                   std::vector<RegisterDemand>& register_demand,
                   Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = SMEM_WINDOW_SIZE;
   int max_moves = SMEM_MAX_MOVES;
   int16_t k = 0;
   bool can_reorder_cur = can_reorder(current);

   /* don't move s_memtime/s_memrealtime */
   if (current->opcode == aco_opcode::s_memtime || current->opcode == aco_opcode::s_memrealtime)
      return;

   /* create the initial set of values which current depends on */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   for (const Operand& op : current->operands) {
      if (op.isTemp())
         ctx.depends_on[op.tempId()] = true;
   }

   /* maintain how many registers remain free when moving instructions */
   RegisterDemand register_pressure = register_demand[idx];

   /* first, check if we have instructions before current to move down */
   int insert_idx = idx + 1;
   int moving_interaction = barrier_none;
   bool moving_spill = false;

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
      bool can_reorder_candidate = can_reorder(candidate.get());

      /* break if we'd make the previous SMEM instruction stall */
      bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
      if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0)
         break;

      /* break when encountering another MEM instruction, logical_start or barriers */
      if (!can_reorder_candidate && !can_reorder_cur)
         break;
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
      if (candidate->opcode == aco_opcode::p_exit_early_if)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;
      if (candidate->isVMEM())
         break;
      register_pressure.update(register_demand[candidate_idx]);

      /* if current depends on candidate, add additional dependencies and continue */
      bool can_move_down = true;
      bool writes_exec = false;
      for (const Definition& def : candidate->definitions) {
         if (def.isTemp() && ctx.depends_on[def.tempId()])
            can_move_down = false;
         if (def.isFixed() && def.physReg() == exec)
            writes_exec = true;
      }
      if (writes_exec)
         break;

      if (moving_spill && is_spill_reload(candidate))
         can_move_down = false;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         can_move_down = false;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (!can_move_down) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.depends_on[op.tempId()] = true;
         }
         can_reorder_cur &= can_reorder_candidate;
         continue;
      }

      bool register_pressure_unknown = false;
      /* check if one of candidate's operands is killed by a depending instruction */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.depends_on[op.tempId()]) {
            // FIXME: account for difference in register pressure
            register_pressure_unknown = true;
         }
      }
      if (register_pressure_unknown) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.depends_on[op.tempId()] = true;
         }
         can_reorder_cur &= can_reorder_candidate;
         continue;
      }

      /* check if register pressure is low enough: the diff is negative if register pressure is increased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand tempDemand = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure - candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand tempDemand2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - tempDemand2 + tempDemand;
      if (new_demand.exceeds(ctx.max_registers))
         break;
      // TODO: we might want to look further to find a sequence of instructions to move down which doesn't exceed reg pressure

      /* move the candidate below the memory load */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = candidate_idx; i < insert_idx - 1; i++) {
         register_demand[i] -= candidate_diff;
      }
      register_demand[insert_idx - 1] = new_demand;
      register_pressure -= candidate_diff;

      if (candidate_idx < ctx.last_SMEM_dep_idx)
         ctx.last_SMEM_stall++;
      insert_idx--;
      k++;
   }

   /* create the initial set of values which depend on current */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   std::fill(ctx.RAR_dependencies.begin(), ctx.RAR_dependencies.end(), false);
   for (const Definition& def : current->definitions) {
      if (def.isTemp())
         ctx.depends_on[def.tempId()] = true;
   }

   /* find the first instruction depending on current or find another MEM */
   insert_idx = idx + 1;
   moving_interaction = barrier_none;
   moving_spill = false;
   can_reorder_cur = true;

   bool found_dependency = false;
   /* second, check if we have instructions after current to move up */
   for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) {
      assert(candidate_idx < (int) block->instructions.size());
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
      bool can_reorder_candidate = can_reorder(candidate.get());

      if (candidate->opcode == aco_opcode::p_logical_end)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;

      const bool writes_exec = std::any_of(candidate->definitions.begin(), candidate->definitions.end(),
                                           [](const Definition& def) { return def.isFixed() && def.physReg() == exec; });
      if (writes_exec)
         break;

      /* check if candidate depends on current */
      bool is_dependency = std::any_of(candidate->operands.begin(), candidate->operands.end(),
                                       [&ctx](const Operand& op) { return op.isTemp() && ctx.depends_on[op.tempId()]; });
      /* no need to steal from following VMEM instructions */
      if (is_dependency && candidate->isVMEM())
         break;
      if (moving_spill && is_spill_reload(candidate))
         is_dependency = true;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         is_dependency = true;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (is_dependency) {
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.depends_on[def.tempId()] = true;
         }
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.RAR_dependencies[op.tempId()] = true;
         }
         if (!found_dependency) {
            insert_idx = candidate_idx;
            found_dependency = true;
            /* init register pressure */
            register_pressure = register_demand[insert_idx - 1];
         }
      }

      if (!can_reorder_candidate && !can_reorder_cur)
         break;

      if (!found_dependency) {
         k++;
         continue;
      }

      /* update register pressure */
      register_pressure.update(register_demand[candidate_idx - 1]);

      if (is_dependency) {
         can_reorder_cur &= can_reorder_candidate;
         continue;
      }
      assert(insert_idx != idx);

      // TODO: correctly calculate register pressure for this case
      bool register_pressure_unknown = false;
      /* check if candidate uses/kills an operand which is used by a dependency */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.RAR_dependencies[op.tempId()])
            register_pressure_unknown = true;
      }
      if (register_pressure_unknown) {
         if (candidate->isVMEM())
            break;
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.RAR_dependencies[def.tempId()] = true;
         }
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.RAR_dependencies[op.tempId()] = true;
         }
         can_reorder_cur &= can_reorder_candidate;
         continue;
      }

      /* check if register pressure is low enough: the diff is negative if register pressure is decreased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand temp = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure + candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + candidate_diff + temp;
      if (new_demand.exceeds(ctx.max_registers))
         break;

      /* move the candidate above the insert_idx */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = insert_idx + 1; i <= candidate_idx; i++) {
         register_demand[i] += candidate_diff;
      }
      register_demand[insert_idx] = new_demand;
      register_pressure += candidate_diff;
      insert_idx++;
      k++;
   }

   ctx.last_SMEM_dep_idx = found_dependency ? insert_idx : 0;
   ctx.last_SMEM_stall = 10 - ctx.num_waves - k;
}
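
/* Illustrative numbers for the stall heuristic above (hypothetical, assuming
 * num_waves = 8 and k = 0 moved instructions): last_SMEM_stall becomes
 * 10 - 8 - 0 = 2. Since that is >= 0, the downward loops will not move
 * candidates past this load's first user, on the assumption that fewer waves
 * leave less latency-hiding headroom for an SMEM stall. */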

void schedule_VMEM(sched_ctx& ctx, Block* block,
                   std::vector<RegisterDemand>& register_demand,
                   Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = VMEM_WINDOW_SIZE;
   int max_moves = VMEM_MAX_MOVES;
   int clause_max_grab_dist = VMEM_CLAUSE_MAX_GRAB_DIST;
   int16_t k = 0;
   /* initially true as we don't pull other VMEM instructions
    * through the current instruction */
   bool can_reorder_vmem = true;
   bool can_reorder_smem = true;

   /* create the initial set of values which current depends on */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   std::fill(ctx.RAR_dependencies.begin(), ctx.RAR_dependencies.end(), false);
   std::fill(ctx.new_RAR_dependencies.begin(), ctx.new_RAR_dependencies.end(), false);
   for (const Operand& op : current->operands) {
      if (op.isTemp()) {
         ctx.depends_on[op.tempId()] = true;
         if (op.isFirstKill())
            ctx.RAR_dependencies[op.tempId()] = true;
      }
   }

   /* maintain how many registers remain free when moving instructions */
   RegisterDemand register_pressure_indep = register_demand[idx];
   RegisterDemand register_pressure_clause = register_demand[idx];

   /* first, check if we have instructions before current to move down */
   int indep_insert_idx = idx + 1;
   int clause_insert_idx = idx;
   int moving_interaction = barrier_none;
   bool moving_spill = false;

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
      bool can_reorder_candidate = can_reorder(candidate.get());

      /* break when encountering another VMEM instruction, logical_start or barriers */
      if (!can_reorder_smem && candidate->format == Format::SMEM && !can_reorder_candidate)
         break;
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
      if (candidate->opcode == aco_opcode::p_exit_early_if)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;

      /* break if we'd make the previous SMEM instruction stall */
      bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
      if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0)
         break;
      register_pressure_indep.update(register_demand[candidate_idx]);

      bool part_of_clause = false;
      if (candidate->isVMEM()) {
         bool same_resource = candidate->operands[1].tempId() == current->operands[1].tempId();
         bool can_reorder = can_reorder_vmem || can_reorder_candidate;
         int grab_dist = clause_insert_idx - candidate_idx;
         /* We can't easily tell how much this will decrease the def-to-use
          * distances, so just use how far it will be moved as a heuristic. */
         part_of_clause = can_reorder && same_resource && grab_dist < clause_max_grab_dist;
      }
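
      /* Illustrative clause example (hypothetical, assuming num_waves = 8):
       * clause_max_grab_dist is (8 - 1) * 8 = 56, so a VMEM candidate that
       * uses the same resource descriptor as `current` can be pulled at most
       * 56 instructions down to join the clause. */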

      /* if current depends on candidate, add additional dependencies and continue */
      bool can_move_down = !candidate->isVMEM() || part_of_clause;
      bool writes_exec = false;
      for (const Definition& def : candidate->definitions) {
         if (def.isTemp() && ctx.depends_on[def.tempId()])
            can_move_down = false;
         if (def.isFixed() && def.physReg() == exec)
            writes_exec = true;
      }
      if (writes_exec)
         break;

      if (moving_spill && is_spill_reload(candidate))
         can_move_down = false;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         can_move_down = false;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (!can_move_down) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp()) {
               ctx.depends_on[op.tempId()] = true;
               if (op.isFirstKill()) {
                  ctx.RAR_dependencies[op.tempId()] = true;
                  ctx.new_RAR_dependencies[op.tempId()] = true;
               }
            }
         }
         register_pressure_clause.update(register_demand[candidate_idx]);
         can_reorder_smem &= candidate->format != Format::SMEM || can_reorder_candidate;
         can_reorder_vmem &= !candidate->isVMEM() || can_reorder_candidate;
         continue;
      }

      if (part_of_clause) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp()) {
               ctx.depends_on[op.tempId()] = true;
               if (op.isFirstKill())
                  ctx.RAR_dependencies[op.tempId()] = true;
            }
         }
      }

      bool register_pressure_unknown = false;
      std::vector<bool>& RAR_deps = part_of_clause ? ctx.new_RAR_dependencies : ctx.RAR_dependencies;
      /* check if one of candidate's operands is killed by a depending instruction */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && RAR_deps[op.tempId()]) {
            // FIXME: account for difference in register pressure
            register_pressure_unknown = true;
         }
      }
      if (register_pressure_unknown) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp()) {
               ctx.depends_on[op.tempId()] = true;
               if (op.isFirstKill()) {
                  ctx.RAR_dependencies[op.tempId()] = true;
                  ctx.new_RAR_dependencies[op.tempId()] = true;
               }
            }
         }
         register_pressure_clause.update(register_demand[candidate_idx]);
         can_reorder_smem &= candidate->format != Format::SMEM || can_reorder_candidate;
         can_reorder_vmem &= !candidate->isVMEM() || can_reorder_candidate;
         continue;
      }

      int insert_idx = part_of_clause ? clause_insert_idx : indep_insert_idx;
      RegisterDemand register_pressure = part_of_clause ? register_pressure_clause : register_pressure_indep;

      /* check if register pressure is low enough: the diff is negative if register pressure is increased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand temp = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure - candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + temp;
      if (new_demand.exceeds(ctx.max_registers))
         break;
      // TODO: we might want to look further to find a sequence of instructions to move down which doesn't exceed reg pressure

      /* move the candidate below the memory load */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = candidate_idx; i < insert_idx - 1; i++) {
         register_demand[i] -= candidate_diff;
      }
      register_demand[insert_idx - 1] = new_demand;
      register_pressure_clause -= candidate_diff;
      clause_insert_idx--;
      if (!part_of_clause) {
         register_pressure_indep -= candidate_diff;
         indep_insert_idx--;
      }
      k++;
      if (candidate_idx < ctx.last_SMEM_dep_idx)
         ctx.last_SMEM_stall++;
   }

   /* create the initial set of values which depend on current */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   std::fill(ctx.RAR_dependencies.begin(), ctx.RAR_dependencies.end(), false);
   for (const Definition& def : current->definitions) {
      if (def.isTemp())
         ctx.depends_on[def.tempId()] = true;
   }

   /* find the first instruction depending on current or find another VMEM */
   RegisterDemand register_pressure;
   int insert_idx = idx;
   moving_interaction = barrier_none;
   moving_spill = false;
   // TODO: differentiate between loads and stores (load-load can always reorder)
   can_reorder_vmem = true;
   can_reorder_smem = true;

   bool found_dependency = false;
   /* second, check if we have instructions after current to move up */
   for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) {
      assert(candidate_idx < (int) block->instructions.size());
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
      bool can_reorder_candidate = can_reorder(candidate.get());

      if (candidate->opcode == aco_opcode::p_logical_end)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;

      const bool writes_exec = std::any_of(candidate->definitions.begin(), candidate->definitions.end(),
                                           [](const Definition& def) { return def.isFixed() && def.physReg() == exec; });
      if (writes_exec)
         break;

      /* check if candidate depends on current */
      bool is_dependency = false;
      if (candidate->format == Format::SMEM)
         is_dependency = !can_reorder_smem && !can_reorder_candidate;
      if (candidate->isVMEM())
         is_dependency = !can_reorder_vmem && !can_reorder_candidate;
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.depends_on[op.tempId()]) {
            is_dependency = true;
            break;
         }
      }
      if (moving_spill && is_spill_reload(candidate))
         is_dependency = true;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         is_dependency = true;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (is_dependency) {
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.depends_on[def.tempId()] = true;
         }
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.RAR_dependencies[op.tempId()] = true;
         }
         /* update flag whether we can reorder other memory instructions */
         can_reorder_smem &= candidate->format != Format::SMEM || can_reorder_candidate;
         can_reorder_vmem &= !candidate->isVMEM() || can_reorder_candidate;

         if (!found_dependency) {
            insert_idx = candidate_idx;
            found_dependency = true;
            /* init register pressure */
            register_pressure = register_demand[insert_idx - 1];
            continue;
         }

      } else if (candidate->isVMEM()) {
         /* don't move up dependencies of other VMEM instructions */
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.depends_on[def.tempId()] = true;
         }
      }

      /* update register pressure */
      register_pressure.update(register_demand[candidate_idx - 1]);

      if (is_dependency || !found_dependency)
         continue;
      assert(insert_idx != idx);

      bool register_pressure_unknown = false;
      /* check if candidate uses/kills an operand which is used by a dependency */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && op.isFirstKill() && ctx.RAR_dependencies[op.tempId()])
            register_pressure_unknown = true;
      }
      if (register_pressure_unknown) {
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.depends_on[def.tempId()] = true;
         }
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.RAR_dependencies[op.tempId()] = true;
         }
         can_reorder_smem &= candidate->format != Format::SMEM || can_reorder_candidate;
         can_reorder_vmem &= !candidate->isVMEM() || can_reorder_candidate;
         continue;
      }

      /* check if register pressure is low enough: the diff is negative if register pressure is decreased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand temp = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure + candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + candidate_diff + temp;
      if (new_demand.exceeds(ctx.max_registers))
         break;

      /* move the candidate above the insert_idx */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = insert_idx + 1; i <= candidate_idx; i++) {
         register_demand[i] += candidate_diff;
      }
      register_demand[insert_idx] = new_demand;
      register_pressure += candidate_diff;
      insert_idx++;
      k++;
   }
}

void schedule_position_export(sched_ctx& ctx, Block* block,
                              std::vector<RegisterDemand>& register_demand,
                              Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = POS_EXP_WINDOW_SIZE;
   int max_moves = POS_EXP_MAX_MOVES;
   int16_t k = 0;

   /* create the initial set of values which current depends on */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   std::fill(ctx.RAR_dependencies.begin(), ctx.RAR_dependencies.end(), false);
   for (const Operand& op : current->operands) {
      if (op.isTemp()) {
         ctx.depends_on[op.tempId()] = true;
         if (op.isFirstKill())
            ctx.RAR_dependencies[op.tempId()] = true;
      }
   }

   /* maintain how many registers remain free when moving instructions */
   RegisterDemand register_pressure = register_demand[idx];

   /* first, check if we have instructions before current to move down */
   int insert_idx = idx + 1;
   int moving_interaction = barrier_none;
   bool moving_spill = false;

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      /* break when encountering logical_start or barriers */
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
      if (candidate->opcode == aco_opcode::p_exit_early_if)
         break;
      if (candidate->isVMEM() || candidate->format == Format::SMEM)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;

      register_pressure.update(register_demand[candidate_idx]);

      /* if current depends on candidate, add additional dependencies and continue */
      bool can_move_down = true;
      bool writes_exec = false;
      for (unsigned i = 0; i < candidate->definitions.size(); i++) {
         if (candidate->definitions[i].isTemp() && ctx.depends_on[candidate->definitions[i].tempId()])
            can_move_down = false;
         if (candidate->definitions[i].isFixed() && candidate->definitions[i].physReg() == exec)
            writes_exec = true;
      }
      if (writes_exec)
         break;

      if (moving_spill && is_spill_reload(candidate))
         can_move_down = false;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         can_move_down = false;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (!can_move_down) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp()) {
               ctx.depends_on[op.tempId()] = true;
               if (op.isFirstKill())
                  ctx.RAR_dependencies[op.tempId()] = true;
            }
         }
         continue;
      }

      bool register_pressure_unknown = false;
      /* check if one of candidate's operands is killed by a depending instruction */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.RAR_dependencies[op.tempId()]) {
            // FIXME: account for difference in register pressure
            register_pressure_unknown = true;
         }
      }
      if (register_pressure_unknown) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp()) {
               ctx.depends_on[op.tempId()] = true;
               if (op.isFirstKill())
                  ctx.RAR_dependencies[op.tempId()] = true;
            }
         }
         continue;
      }

      /* check if register pressure is low enough: the diff is negative if register pressure is increased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand temp = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure - candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + temp;
      if (new_demand.exceeds(ctx.max_registers))
         break;
      // TODO: we might want to look further to find a sequence of instructions to move down which doesn't exceed reg pressure

      /* move the candidate below the export */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = candidate_idx; i < insert_idx - 1; i++) {
         register_demand[i] -= candidate_diff;
      }
      register_demand[insert_idx - 1] = new_demand;
      register_pressure -= candidate_diff;
      insert_idx--;
      k++;
   }
}

void schedule_block(sched_ctx& ctx, Program *program, Block* block, live& live_vars)
{
   ctx.last_SMEM_dep_idx = 0;
   ctx.last_SMEM_stall = INT16_MIN;

   /* go through all instructions and find memory loads */
   for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
      Instruction* current = block->instructions[idx].get();

      if (current->definitions.empty())
         continue;

      if (current->isVMEM())
         schedule_VMEM(ctx, block, live_vars.register_demand[block->index], current, idx);
      if (current->format == Format::SMEM)
         schedule_SMEM(ctx, block, live_vars.register_demand[block->index], current, idx);
   }

   if ((program->stage & hw_vs) && block->index == program->blocks.size() - 1) {
      /* Try to move position exports as far up as possible, to reduce register
       * usage and because ISA reference guides say so. */
      for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
         Instruction* current = block->instructions[idx].get();

         if (current->format == Format::EXP) {
            unsigned target = static_cast<Export_instruction*>(current)->dest;
            if (target >= V_008DFC_SQ_EXP_POS && target < V_008DFC_SQ_EXP_PARAM)
               schedule_position_export(ctx, block, live_vars.register_demand[block->index], current, idx);
         }
      }
   }

   /* resummarize the block's register demand */
   block->register_demand = RegisterDemand();
   for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
      block->register_demand.update(live_vars.register_demand[block->index][idx]);
   }
}


void schedule_program(Program *program, live& live_vars)
{
   sched_ctx ctx;
   ctx.depends_on.resize(program->peekAllocationId());
   ctx.RAR_dependencies.resize(program->peekAllocationId());
   ctx.new_RAR_dependencies.resize(program->peekAllocationId());
   /* Allowing the scheduler to reduce the number of waves to as low as 5
    * improves performance of Thrones of Britannia significantly and doesn't
    * seem to hurt anything else. */
   if (program->num_waves <= 5)
      ctx.num_waves = program->num_waves;
   else if (program->max_reg_demand.vgpr >= 32)
      ctx.num_waves = 5;
   else if (program->max_reg_demand.vgpr >= 28)
      ctx.num_waves = 6;
   else if (program->max_reg_demand.vgpr >= 24)
      ctx.num_waves = 7;
   else
      ctx.num_waves = 8;

   assert(ctx.num_waves > 0 && ctx.num_waves <= program->num_waves);
   ctx.max_registers = { int16_t(((256 / ctx.num_waves) & ~3) - 2), int16_t(get_addr_sgpr_from_waves(program, ctx.num_waves)) };
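   /* Worked example (illustrative): with num_waves = 8 the VGPR bound is
    * ((256 / 8) & ~3) - 2 = 30, i.e. two registers below the aligned
    * per-wave budget, leaving headroom for the scheduler's moves; the SGPR
    * bound comes from get_addr_sgpr_from_waves(). */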

   for (Block& block : program->blocks)
      schedule_block(ctx, program, &block, live_vars);

   /* update max_reg_demand and num_waves */
   RegisterDemand new_demand;
   for (Block& block : program->blocks) {
      new_demand.update(block.register_demand);
   }
   update_vgpr_sgpr_demand(program, new_demand);

   /* if enabled, this code asserts that register_demand is updated correctly */
#if 0
   int prev_num_waves = program->num_waves;
   const RegisterDemand prev_max_demand = program->max_reg_demand;

   std::vector<RegisterDemand> demands(program->blocks.size());
   for (unsigned j = 0; j < program->blocks.size(); j++) {
      demands[j] = program->blocks[j].register_demand;
   }

   struct radv_nir_compiler_options options;
   options.chip_class = program->chip_class;
   live live_vars2 = aco::live_var_analysis(program, &options);

   for (unsigned j = 0; j < program->blocks.size(); j++) {
      Block &b = program->blocks[j];
      for (unsigned i = 0; i < b.instructions.size(); i++)
         assert(live_vars.register_demand[b.index][i] == live_vars2.register_demand[b.index][i]);
      assert(b.register_demand == demands[j]);
   }

   assert(program->max_reg_demand == prev_max_demand);
   assert(program->num_waves == prev_num_waves);
#endif
}

}