/* mesa.git: src/amd/compiler/aco_scheduler.cpp */
/*
 * Copyright © 2018 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include "aco_ir.h"
#include "aco_builder.h"
#include <unordered_set>
#include <algorithm>

#include "vulkan/radv_shader.h" // for radv_nir_compiler_options
#include "amdgfxregs.h"

#define SMEM_WINDOW_SIZE (350 - ctx.num_waves * 35)
#define VMEM_WINDOW_SIZE (1024 - ctx.num_waves * 64)
#define POS_EXP_WINDOW_SIZE 512
#define SMEM_MAX_MOVES (64 - ctx.num_waves * 4)
#define VMEM_MAX_MOVES (128 - ctx.num_waves * 8)
/* creating clauses decreases def-use distances, so be less aggressive when num_waves is low */
#define VMEM_CLAUSE_MAX_GRAB_DIST ((ctx.num_waves - 1) * 8)
#define POS_EXP_MAX_MOVES 512
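
/* For example, at num_waves == 8 these evaluate to windows of 70 (SMEM) and
 * 512 (VMEM) instructions, at most 32/64 moves and a clause grab distance of
 * 56; fewer waves widen the windows and raise the move limits, but shrink
 * the clause grab distance. */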

namespace aco {

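/* Scratch state for the pass. The boolean vectors are indexed by temp id
 * and reset at the start of each schedule_* invocation. */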
struct sched_ctx {
   std::vector<bool> depends_on;
   std::vector<bool> RAR_dependencies;
   /* For downwards VMEM scheduling, same as RAR_dependencies but excludes the
    * instructions in the clause, since new instructions in the clause are not
    * moved past any other instructions in the clause. */
   std::vector<bool> new_RAR_dependencies;

   RegisterDemand max_registers;
   int16_t num_waves;
   int16_t last_SMEM_stall;
   int last_SMEM_dep_idx;
};

/* This scheduler is a simple bottom-up pass based on ideas from
 * "A Novel Lightweight Instruction Scheduling Algorithm for Just-In-Time Compiler"
 * by Xiaohua Shi and Peng Guo.
 * The basic approach is to iterate over all instructions. When a memory instruction
 * is encountered, it tries to move independent instructions from above and below
 * into the gap between the memory instruction and its first user.
 * The novelty is that this scheduler takes the current register pressure into account:
 * instructions will only be moved if the register pressure won't exceed a certain bound.
 */

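/* Illustration on a hypothetical instruction stream (not taken from real
 * shader output): given
 *
 *    v0 = v_mul_f32 v1, v2
 *    s2 = s_load_dwordx2 ...        <-- current memory instruction
 *    s3 = s_add_u32 s2, 0           <-- first user of s2
 *
 * the independent v_mul_f32 can be moved below the load, so useful work
 * happens while the load is in flight instead of leaving the gap empty. */
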
template <typename T>
void move_element(T& list, size_t idx, size_t before) {
   if (idx < before) {
      auto begin = std::next(list.begin(), idx);
      auto end = std::next(list.begin(), before);
      std::rotate(begin, begin + 1, end);
   } else if (idx > before) {
      auto begin = std::next(list.begin(), before);
      auto end = std::next(list.begin(), idx + 1);
      std::rotate(begin, end - 1, end);
   }
}
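
/* A minimal sanity sketch of move_element() (an illustrative, hypothetical
 * helper, kept behind #if 0 like the verification code at the end of this
 * file): moving an element up or down rotates the elements in between. */
#if 0
static void test_move_element()
{
   std::vector<int> v = {0, 1, 2, 3, 4};
   move_element(v, 3, 1); /* the element at index 3 now sits at index 1 */
   assert((v == std::vector<int>{0, 3, 1, 2, 4}));
   move_element(v, 1, 4); /* moved back: it ends up just before index 4 */
   assert((v == std::vector<int>{0, 1, 2, 3, 4}));
}
#endif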
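/* Net change in register demand caused by this instruction: values it
 * defines which stay live afterwards, minus operands it kills. */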
static RegisterDemand getLiveChanges(aco_ptr<Instruction>& instr)
{
   RegisterDemand changes;
   for (const Definition& def : instr->definitions) {
      if (!def.isTemp() || def.isKill())
         continue;
      changes += def.getTemp();
   }

   for (const Operand& op : instr->operands) {
      if (!op.isTemp() || !op.isFirstKill())
         continue;
      changes -= op.getTemp();
   }

   return changes;
}

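/* Registers which are live only at this instruction itself: definitions
 * that are immediately dead (kill flag set). */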
static RegisterDemand getTempRegisters(aco_ptr<Instruction>& instr)
{
   RegisterDemand temp_registers;
   for (const Definition& def : instr->definitions) {
      if (!def.isTemp() || !def.isKill())
         continue;
      temp_registers += def.getTemp();
   }
   return temp_registers;
}

static bool is_spill_reload(aco_ptr<Instruction>& instr)
{
   return instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload;
}

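/* Whether a memory instruction may be reordered with respect to other memory
 * instructions; formats without a can_reorder flag default to reorderable. */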
bool can_reorder(Instruction* candidate)
{
   switch (candidate->format) {
   case Format::SMEM:
      return static_cast<SMEM_instruction*>(candidate)->can_reorder;
   case Format::MUBUF:
      return static_cast<MUBUF_instruction*>(candidate)->can_reorder;
   case Format::MIMG:
      return static_cast<MIMG_instruction*>(candidate)->can_reorder;
   case Format::MTBUF:
      return static_cast<MTBUF_instruction*>(candidate)->can_reorder;
   case Format::FLAT:
   case Format::GLOBAL:
   case Format::SCRATCH:
      return static_cast<FLAT_instruction*>(candidate)->can_reorder;
   default:
      return true;
   }
}

bool is_gs_or_done_sendmsg(Instruction *instr)
{
   if (instr->opcode == aco_opcode::s_sendmsg) {
      uint16_t imm = static_cast<SOPP_instruction*>(instr)->imm;
      return (imm & sendmsg_id_mask) == _sendmsg_gs ||
             (imm & sendmsg_id_mask) == _sendmsg_gs_done;
   }
   return false;
}

bool is_done_sendmsg(Instruction *instr)
{
   if (instr->opcode == aco_opcode::s_sendmsg) {
      uint16_t imm = static_cast<SOPP_instruction*>(instr)->imm;
      return (imm & sendmsg_id_mask) == _sendmsg_gs_done;
   }
   return false;
}

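/* Classifies which barrier kinds an instruction interacts with, as a bitmask
 * that can_move_instr() tests against the p_memory_barrier_* pseudo ops. */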
barrier_interaction get_barrier_interaction(Instruction* instr)
{
   switch (instr->format) {
   case Format::SMEM:
      return static_cast<SMEM_instruction*>(instr)->barrier;
   case Format::MUBUF:
      return static_cast<MUBUF_instruction*>(instr)->barrier;
   case Format::MIMG:
      return static_cast<MIMG_instruction*>(instr)->barrier;
   case Format::MTBUF:
      return static_cast<MTBUF_instruction*>(instr)->barrier;
   case Format::FLAT:
   case Format::GLOBAL:
   case Format::SCRATCH:
      return static_cast<FLAT_instruction*>(instr)->barrier;
   case Format::DS:
      return barrier_shared;
   case Format::SOPP:
      if (is_done_sendmsg(instr))
         return (barrier_interaction)(barrier_gs_data | barrier_gs_sendmsg);
      else if (is_gs_or_done_sendmsg(instr))
         return barrier_gs_sendmsg;
      else
         return barrier_none;
   default:
      return barrier_none;
   }
}

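/* Returns true if the candidate may be moved past `current` and past the
 * already-moved instructions summarized in moving_interaction. */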
bool can_move_instr(aco_ptr<Instruction>& instr, Instruction* current, int moving_interaction)
{
   /* don't move exports so that they stay closer together */
   if (instr->format == Format::EXP)
      return false;

   /* don't move s_memtime/s_memrealtime */
   if (instr->opcode == aco_opcode::s_memtime || instr->opcode == aco_opcode::s_memrealtime)
      return false;

   /* handle barriers */

   /* TODO: instead of stopping, maybe try to move the barriers and any
    * instructions interacting with them instead? */
   if (instr->format != Format::PSEUDO_BARRIER) {
      if (instr->opcode == aco_opcode::s_barrier) {
         return can_reorder(current) && moving_interaction == barrier_none;
      } else if (is_gs_or_done_sendmsg(instr.get())) {
         int interaction = get_barrier_interaction(current);
         interaction |= moving_interaction;
         return !(interaction & get_barrier_interaction(instr.get()));
      } else {
         return true;
      }
   }

   int interaction = get_barrier_interaction(current);
   interaction |= moving_interaction;

   switch (instr->opcode) {
   case aco_opcode::p_memory_barrier_atomic:
      return !(interaction & barrier_atomic);
   /* For now, buffer and image barriers are treated the same. This is because of
    * dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.device.payload_nonlocal.buffer.guard_nonlocal.image.comp
    * which seems to use an image load to determine if the result of a buffer load is valid. So the ordering of the two loads is important.
    * I /think/ we should probably eventually expand the meaning of a buffer barrier so that all buffer operations before it must stay before it,
    * and both image and buffer operations after it must stay after it. We should also do the same for image barriers.
    * Or perhaps the problem is that we don't have a combined barrier instruction for both buffers and images, but the CTS test expects us to?
    * Either way, this solution should work. */
   case aco_opcode::p_memory_barrier_buffer:
   case aco_opcode::p_memory_barrier_image:
      return !(interaction & (barrier_image | barrier_buffer));
   case aco_opcode::p_memory_barrier_shared:
      return !(interaction & barrier_shared);
   case aco_opcode::p_memory_barrier_common:
      return !(interaction & (barrier_image | barrier_buffer | barrier_shared | barrier_atomic));
   case aco_opcode::p_memory_barrier_gs_data:
      return !(interaction & barrier_gs_data);
   case aco_opcode::p_memory_barrier_gs_sendmsg:
      return !(interaction & barrier_gs_sendmsg);
   default:
      return false;
   }
}

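/* Tries to hide the latency of the SMEM load `current` in two passes: first
 * move independent instructions from above it to below it, then move
 * instructions from below it up to just before its first user, staying
 * within SMEM_WINDOW_SIZE and SMEM_MAX_MOVES moves in total. */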
void schedule_SMEM(sched_ctx& ctx, Block* block,
                   std::vector<RegisterDemand>& register_demand,
                   Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = SMEM_WINDOW_SIZE;
   int max_moves = SMEM_MAX_MOVES;
   int16_t k = 0;
   bool can_reorder_cur = can_reorder(current);

   /* don't move s_memtime/s_memrealtime */
   if (current->opcode == aco_opcode::s_memtime || current->opcode == aco_opcode::s_memrealtime)
      return;

   /* create the initial set of values which current depends on */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   for (const Operand& op : current->operands) {
      if (op.isTemp())
         ctx.depends_on[op.tempId()] = true;
   }

   /* maintain how many registers remain free when moving instructions */
   RegisterDemand register_pressure = register_demand[idx];

   /* first, check if we have instructions before current to move down */
   int insert_idx = idx + 1;
   int moving_interaction = barrier_none;
   bool moving_spill = false;

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
      bool can_reorder_candidate = can_reorder(candidate.get());

      /* break if we'd make the previous SMEM instruction stall */
      bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
      if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0)
         break;

      /* break when encountering another MEM instruction, logical_start or barriers */
      if (!can_reorder_candidate && !can_reorder_cur)
         break;
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
      if (candidate->opcode == aco_opcode::p_exit_early_if)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;
      if (candidate->isVMEM())
         break;
      register_pressure.update(register_demand[candidate_idx]);

      /* if current depends on candidate, add additional dependencies and continue */
      bool can_move_down = true;
      bool writes_exec = false;
      for (const Definition& def : candidate->definitions) {
         if (def.isTemp() && ctx.depends_on[def.tempId()])
            can_move_down = false;
         if (def.isFixed() && def.physReg() == exec)
            writes_exec = true;
      }
      if (writes_exec)
         break;

      if (moving_spill && is_spill_reload(candidate))
         can_move_down = false;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         can_move_down = false;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (!can_move_down) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.depends_on[op.tempId()] = true;
         }
         can_reorder_cur &= can_reorder_candidate;
         continue;
      }

      bool register_pressure_unknown = false;
      /* check if one of candidate's operands is killed by a depending instruction */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.depends_on[op.tempId()]) {
            // FIXME: account for difference in register pressure
            register_pressure_unknown = true;
         }
      }
      if (register_pressure_unknown) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.depends_on[op.tempId()] = true;
         }
         can_reorder_cur &= can_reorder_candidate;
         continue;
      }

      /* check if register pressure is low enough: the diff is negative if register pressure is increased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand tempDemand = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure - candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand tempDemand2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - tempDemand2 + tempDemand;
      if (new_demand.exceeds(ctx.max_registers))
         break;
      // TODO: we might want to look further to find a sequence of instructions to move down which doesn't exceed reg pressure

      /* move the candidate below the memory load */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = candidate_idx; i < insert_idx - 1; i++) {
         register_demand[i] -= candidate_diff;
      }
      register_demand[insert_idx - 1] = new_demand;
      register_pressure -= candidate_diff;

      if (candidate_idx < ctx.last_SMEM_dep_idx)
         ctx.last_SMEM_stall++;
      insert_idx--;
      k++;
   }

   /* create the initial set of values which depend on current */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   std::fill(ctx.RAR_dependencies.begin(), ctx.RAR_dependencies.end(), false);
   for (const Definition& def : current->definitions) {
      if (def.isTemp())
         ctx.depends_on[def.tempId()] = true;
   }

   /* find the first instruction depending on current or find another MEM */
   insert_idx = idx + 1;
   moving_interaction = barrier_none;
   moving_spill = false;
   can_reorder_cur = true;

   bool found_dependency = false;
   /* second, check if we have instructions after current to move up */
   for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) {
      assert(candidate_idx < (int) block->instructions.size());
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
      bool can_reorder_candidate = can_reorder(candidate.get());

      if (candidate->opcode == aco_opcode::p_logical_end)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;

      const bool writes_exec = std::any_of(candidate->definitions.begin(), candidate->definitions.end(),
                                           [](const Definition& def) { return def.isFixed() && def.physReg() == exec; });
      if (writes_exec)
         break;

      /* check if candidate depends on current */
      bool is_dependency = std::any_of(candidate->operands.begin(), candidate->operands.end(),
                                       [&ctx](const Operand& op) { return op.isTemp() && ctx.depends_on[op.tempId()]; });
      /* no need to steal from following VMEM instructions */
      if (is_dependency && candidate->isVMEM())
         break;
      if (moving_spill && is_spill_reload(candidate))
         is_dependency = true;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         is_dependency = true;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (is_dependency) {
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.depends_on[def.tempId()] = true;
         }
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.RAR_dependencies[op.tempId()] = true;
         }
         if (!found_dependency) {
            insert_idx = candidate_idx;
            found_dependency = true;
            /* init register pressure */
            register_pressure = register_demand[insert_idx - 1];
         }
      }

      if (!can_reorder_candidate && !can_reorder_cur)
         break;

      if (!found_dependency) {
         k++;
         continue;
      }

      /* update register pressure */
      register_pressure.update(register_demand[candidate_idx - 1]);

      if (is_dependency) {
         can_reorder_cur &= can_reorder_candidate;
         continue;
      }
      assert(insert_idx != idx);

      // TODO: correctly calculate register pressure for this case
      bool register_pressure_unknown = false;
      /* check if candidate uses/kills an operand which is used by a dependency */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.RAR_dependencies[op.tempId()])
            register_pressure_unknown = true;
      }
      if (register_pressure_unknown) {
         if (candidate->isVMEM())
            break;
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.RAR_dependencies[def.tempId()] = true;
         }
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.RAR_dependencies[op.tempId()] = true;
         }
         can_reorder_cur &= can_reorder_candidate;
         continue;
      }

      /* check if register pressure is low enough: the diff is negative if register pressure is decreased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand temp = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure + candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + candidate_diff + temp;
      if (new_demand.exceeds(ctx.max_registers))
         break;

      /* move the candidate above the insert_idx */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = insert_idx + 1; i <= candidate_idx; i++) {
         register_demand[i] += candidate_diff;
      }
      register_demand[insert_idx] = new_demand;
      register_pressure += candidate_diff;
      insert_idx++;
      k++;
   }

   ctx.last_SMEM_dep_idx = found_dependency ? insert_idx : 0;
   ctx.last_SMEM_stall = 10 - ctx.num_waves - k;
}

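/* Same two-pass scheme as schedule_SMEM, but additionally tries to group
 * VMEM instructions accessing the same resource into clauses while moving
 * candidates down (see VMEM_CLAUSE_MAX_GRAB_DIST above). */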
void schedule_VMEM(sched_ctx& ctx, Block* block,
                   std::vector<RegisterDemand>& register_demand,
                   Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = VMEM_WINDOW_SIZE;
   int max_moves = VMEM_MAX_MOVES;
   int clause_max_grab_dist = VMEM_CLAUSE_MAX_GRAB_DIST;
   int16_t k = 0;
   /* initially true as we don't pull other VMEM instructions
    * through the current instruction */
   bool can_reorder_vmem = true;
   bool can_reorder_smem = true;

   /* create the initial set of values which current depends on */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   std::fill(ctx.RAR_dependencies.begin(), ctx.RAR_dependencies.end(), false);
   std::fill(ctx.new_RAR_dependencies.begin(), ctx.new_RAR_dependencies.end(), false);
   for (const Operand& op : current->operands) {
      if (op.isTemp()) {
         ctx.depends_on[op.tempId()] = true;
         if (op.isFirstKill())
            ctx.RAR_dependencies[op.tempId()] = true;
      }
   }

   /* maintain how many registers remain free when moving instructions */
   RegisterDemand register_pressure_indep = register_demand[idx];
   RegisterDemand register_pressure_clause = register_demand[idx];

   /* first, check if we have instructions before current to move down */
   int indep_insert_idx = idx + 1;
   int clause_insert_idx = idx;
   int moving_interaction = barrier_none;
   bool moving_spill = false;

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
      bool can_reorder_candidate = can_reorder(candidate.get());
      bool is_vmem = candidate->isVMEM() || candidate->isFlatOrGlobal();

      /* break when encountering another VMEM instruction, logical_start or barriers */
      if (!can_reorder_smem && candidate->format == Format::SMEM && !can_reorder_candidate)
         break;
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
      if (candidate->opcode == aco_opcode::p_exit_early_if)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;

      /* break if we'd make the previous SMEM instruction stall */
      bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
      if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0)
         break;
      register_pressure_indep.update(register_demand[candidate_idx]);

      bool part_of_clause = false;
      if (current->isVMEM() == candidate->isVMEM()) {
         bool same_resource = true;
         if (current->isVMEM())
            same_resource = candidate->operands[1].tempId() == current->operands[1].tempId();
         bool can_reorder = can_reorder_vmem || can_reorder_candidate;
         int grab_dist = clause_insert_idx - candidate_idx;
         /* We can't easily tell how much this will decrease the def-to-use
          * distances, so just use how far it will be moved as a heuristic. */
         part_of_clause = can_reorder && same_resource && grab_dist < clause_max_grab_dist;
      }

      /* if current depends on candidate, add additional dependencies and continue */
      bool can_move_down = !is_vmem || part_of_clause;
      bool writes_exec = false;
      for (const Definition& def : candidate->definitions) {
         if (def.isTemp() && ctx.depends_on[def.tempId()])
            can_move_down = false;
         if (def.isFixed() && def.physReg() == exec)
            writes_exec = true;
      }
      if (writes_exec)
         break;

      if (moving_spill && is_spill_reload(candidate))
         can_move_down = false;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         can_move_down = false;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (!can_move_down) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp()) {
               ctx.depends_on[op.tempId()] = true;
               if (op.isFirstKill()) {
                  ctx.RAR_dependencies[op.tempId()] = true;
                  ctx.new_RAR_dependencies[op.tempId()] = true;
               }
            }
         }
         register_pressure_clause.update(register_demand[candidate_idx]);
         can_reorder_smem &= candidate->format != Format::SMEM || can_reorder_candidate;
         can_reorder_vmem &= !is_vmem || can_reorder_candidate;
         continue;
      }

      if (part_of_clause) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp()) {
               ctx.depends_on[op.tempId()] = true;
               if (op.isFirstKill())
                  ctx.RAR_dependencies[op.tempId()] = true;
            }
         }
      }

      bool register_pressure_unknown = false;
      std::vector<bool>& RAR_deps = part_of_clause ? ctx.new_RAR_dependencies : ctx.RAR_dependencies;
      /* check if one of candidate's operands is killed by a depending instruction */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && RAR_deps[op.tempId()]) {
            // FIXME: account for difference in register pressure
            register_pressure_unknown = true;
         }
      }
      if (register_pressure_unknown) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp()) {
               ctx.depends_on[op.tempId()] = true;
               if (op.isFirstKill()) {
                  ctx.RAR_dependencies[op.tempId()] = true;
                  ctx.new_RAR_dependencies[op.tempId()] = true;
               }
            }
         }
         register_pressure_clause.update(register_demand[candidate_idx]);
         can_reorder_smem &= candidate->format != Format::SMEM || can_reorder_candidate;
         can_reorder_vmem &= !is_vmem || can_reorder_candidate;
         continue;
      }

      int insert_idx = part_of_clause ? clause_insert_idx : indep_insert_idx;
      RegisterDemand register_pressure = part_of_clause ? register_pressure_clause : register_pressure_indep;

      /* check if register pressure is low enough: the diff is negative if register pressure is increased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand temp = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure - candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + temp;
      if (new_demand.exceeds(ctx.max_registers))
         break;
      // TODO: we might want to look further to find a sequence of instructions to move down which doesn't exceed reg pressure

      /* move the candidate below the memory load */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = candidate_idx; i < insert_idx - 1; i++) {
         register_demand[i] -= candidate_diff;
      }
      register_demand[insert_idx - 1] = new_demand;
      register_pressure_clause -= candidate_diff;
      clause_insert_idx--;
      if (!part_of_clause) {
         register_pressure_indep -= candidate_diff;
         indep_insert_idx--;
      }
      k++;
      if (candidate_idx < ctx.last_SMEM_dep_idx)
         ctx.last_SMEM_stall++;
   }

   /* create the initial set of values which depend on current */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   std::fill(ctx.RAR_dependencies.begin(), ctx.RAR_dependencies.end(), false);
   for (const Definition& def : current->definitions) {
      if (def.isTemp())
         ctx.depends_on[def.tempId()] = true;
   }

   /* find the first instruction depending on current or find another VMEM */
   RegisterDemand register_pressure;
   int insert_idx = idx;
   moving_interaction = barrier_none;
   moving_spill = false;
   // TODO: differentiate between loads and stores (load-load can always reorder)
   can_reorder_vmem = true;
   can_reorder_smem = true;

   bool found_dependency = false;
   /* second, check if we have instructions after current to move up */
   for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) {
      assert(candidate_idx < (int) block->instructions.size());
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
      bool can_reorder_candidate = can_reorder(candidate.get());
      bool is_vmem = candidate->isVMEM() || candidate->isFlatOrGlobal();

      if (candidate->opcode == aco_opcode::p_logical_end)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;

      const bool writes_exec = std::any_of(candidate->definitions.begin(), candidate->definitions.end(),
                                           [](const Definition& def) { return def.isFixed() && def.physReg() == exec; });
      if (writes_exec)
         break;

      /* check if candidate depends on current */
      bool is_dependency = false;
      if (candidate->format == Format::SMEM)
         is_dependency = !can_reorder_smem && !can_reorder_candidate;
      if (is_vmem)
         is_dependency = !can_reorder_vmem && !can_reorder_candidate;
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.depends_on[op.tempId()]) {
            is_dependency = true;
            break;
         }
      }
      if (moving_spill && is_spill_reload(candidate))
         is_dependency = true;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         is_dependency = true;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (is_dependency) {
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.depends_on[def.tempId()] = true;
         }
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.RAR_dependencies[op.tempId()] = true;
         }
         /* update flag whether we can reorder other memory instructions */
         can_reorder_smem &= candidate->format != Format::SMEM || can_reorder_candidate;
         can_reorder_vmem &= !is_vmem || can_reorder_candidate;

         if (!found_dependency) {
            insert_idx = candidate_idx;
            found_dependency = true;
            /* init register pressure */
            register_pressure = register_demand[insert_idx - 1];
            continue;
         }

      } else if (is_vmem) {
         /* don't move up dependencies of other VMEM instructions */
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.depends_on[def.tempId()] = true;
         }
      }

      /* update register pressure */
      register_pressure.update(register_demand[candidate_idx - 1]);

      if (is_dependency || !found_dependency)
         continue;
      assert(insert_idx != idx);

      bool register_pressure_unknown = false;
      /* check if candidate uses/kills an operand which is used by a dependency */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && op.isFirstKill() && ctx.RAR_dependencies[op.tempId()])
            register_pressure_unknown = true;
      }
      if (register_pressure_unknown) {
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.depends_on[def.tempId()] = true;
         }
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.RAR_dependencies[op.tempId()] = true;
         }
         can_reorder_smem &= candidate->format != Format::SMEM || can_reorder_candidate;
         can_reorder_vmem &= !is_vmem || can_reorder_candidate;
         continue;
      }

      /* check if register pressure is low enough: the diff is negative if register pressure is decreased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand temp = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure + candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + candidate_diff + temp;
      if (new_demand.exceeds(ctx.max_registers))
         break;

      /* move the candidate above the insert_idx */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = insert_idx + 1; i <= candidate_idx; i++) {
         register_demand[i] += candidate_diff;
      }
      register_demand[insert_idx] = new_demand;
      register_pressure += candidate_diff;
      insert_idx++;
      k++;
   }
}

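/* Single downwards pass: moves independent instructions from above the
 * position export to below it, which effectively moves the export up. */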
void schedule_position_export(sched_ctx& ctx, Block* block,
                              std::vector<RegisterDemand>& register_demand,
                              Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = POS_EXP_WINDOW_SIZE;
   int max_moves = POS_EXP_MAX_MOVES;
   int16_t k = 0;

   /* create the initial set of values which current depends on */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   std::fill(ctx.RAR_dependencies.begin(), ctx.RAR_dependencies.end(), false);
   for (const Operand& op : current->operands) {
      if (op.isTemp()) {
         ctx.depends_on[op.tempId()] = true;
         if (op.isFirstKill())
            ctx.RAR_dependencies[op.tempId()] = true;
      }
   }

   /* maintain how many registers remain free when moving instructions */
   RegisterDemand register_pressure = register_demand[idx];

   /* first, check if we have instructions before current to move down */
   int insert_idx = idx + 1;
   int moving_interaction = barrier_none;
   bool moving_spill = false;

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      /* break when encountering logical_start or barriers */
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
      if (candidate->opcode == aco_opcode::p_exit_early_if)
         break;
      if (candidate->isVMEM() || candidate->format == Format::SMEM || candidate->isFlatOrGlobal())
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;

      register_pressure.update(register_demand[candidate_idx]);

      /* if current depends on candidate, add additional dependencies and continue */
      bool can_move_down = true;
      bool writes_exec = false;
      for (unsigned i = 0; i < candidate->definitions.size(); i++) {
         if (candidate->definitions[i].isTemp() && ctx.depends_on[candidate->definitions[i].tempId()])
            can_move_down = false;
         if (candidate->definitions[i].isFixed() && candidate->definitions[i].physReg() == exec)
            writes_exec = true;
      }
      if (writes_exec)
         break;

      if (moving_spill && is_spill_reload(candidate))
         can_move_down = false;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         can_move_down = false;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (!can_move_down) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp()) {
               ctx.depends_on[op.tempId()] = true;
               if (op.isFirstKill())
                  ctx.RAR_dependencies[op.tempId()] = true;
            }
         }
         continue;
      }

      bool register_pressure_unknown = false;
      /* check if one of candidate's operands is killed by a depending instruction */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.RAR_dependencies[op.tempId()]) {
            // FIXME: account for difference in register pressure
            register_pressure_unknown = true;
         }
      }
      if (register_pressure_unknown) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp()) {
               ctx.depends_on[op.tempId()] = true;
               if (op.isFirstKill())
                  ctx.RAR_dependencies[op.tempId()] = true;
            }
         }
         continue;
      }

      /* check if register pressure is low enough: the diff is negative if register pressure is increased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand temp = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure - candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + temp;
      if (new_demand.exceeds(ctx.max_registers))
         break;
      // TODO: we might want to look further to find a sequence of instructions to move down which doesn't exceed reg pressure

      /* move the candidate below the export */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = candidate_idx; i < insert_idx - 1; i++) {
         register_demand[i] -= candidate_diff;
      }
      register_demand[insert_idx - 1] = new_demand;
      register_pressure -= candidate_diff;
      insert_idx--;
      k++;
   }
}

void schedule_block(sched_ctx& ctx, Program *program, Block* block, live& live_vars)
{
   ctx.last_SMEM_dep_idx = 0;
   ctx.last_SMEM_stall = INT16_MIN;

   /* go through all instructions and find memory loads */
   for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
      Instruction* current = block->instructions[idx].get();

      if (current->definitions.empty())
         continue;

      if (current->isVMEM() || current->isFlatOrGlobal())
         schedule_VMEM(ctx, block, live_vars.register_demand[block->index], current, idx);
      if (current->format == Format::SMEM)
         schedule_SMEM(ctx, block, live_vars.register_demand[block->index], current, idx);
   }

   if ((program->stage & hw_vs) && block->index == program->blocks.size() - 1) {
      /* Try to move position exports as far up as possible, to reduce register
       * usage and because ISA reference guides say so. */
      for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
         Instruction* current = block->instructions[idx].get();

         if (current->format == Format::EXP) {
            unsigned target = static_cast<Export_instruction*>(current)->dest;
            if (target >= V_008DFC_SQ_EXP_POS && target < V_008DFC_SQ_EXP_PARAM)
               schedule_position_export(ctx, block, live_vars.register_demand[block->index], current, idx);
         }
      }
   }

   /* resummarize the block's register demand */
   block->register_demand = RegisterDemand();
   for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
      block->register_demand.update(live_vars.register_demand[block->index][idx]);
   }
}


void schedule_program(Program *program, live& live_vars)
{
   sched_ctx ctx;
   ctx.depends_on.resize(program->peekAllocationId());
   ctx.RAR_dependencies.resize(program->peekAllocationId());
   ctx.new_RAR_dependencies.resize(program->peekAllocationId());
   /* Allowing the scheduler to reduce the number of waves to as low as 5
    * improves performance of Thrones of Britannia significantly and doesn't
    * seem to hurt anything else. */
   if (program->num_waves <= 5)
      ctx.num_waves = program->num_waves;
   else if (program->max_reg_demand.vgpr >= 32)
      ctx.num_waves = 5;
   else if (program->max_reg_demand.vgpr >= 28)
      ctx.num_waves = 6;
   else if (program->max_reg_demand.vgpr >= 24)
      ctx.num_waves = 7;
   else
      ctx.num_waves = 8;
   ctx.num_waves = std::max<uint16_t>(ctx.num_waves, program->min_waves);

   assert(ctx.num_waves > 0 && ctx.num_waves <= program->num_waves);
   ctx.max_registers = { int16_t(get_addr_vgpr_from_waves(program, ctx.num_waves) - 2),
                         int16_t(get_addr_sgpr_from_waves(program, ctx.num_waves)) };

   for (Block& block : program->blocks)
      schedule_block(ctx, program, &block, live_vars);

   /* update max_reg_demand and num_waves */
   RegisterDemand new_demand;
   for (Block& block : program->blocks) {
      new_demand.update(block.register_demand);
   }
   update_vgpr_sgpr_demand(program, new_demand);

   /* if enabled, this code asserts that register_demand is updated correctly */
   #if 0
   int prev_num_waves = program->num_waves;
   const RegisterDemand prev_max_demand = program->max_reg_demand;

   std::vector<RegisterDemand> demands(program->blocks.size());
   for (unsigned j = 0; j < program->blocks.size(); j++) {
      demands[j] = program->blocks[j].register_demand;
   }

   struct radv_nir_compiler_options options;
   options.chip_class = program->chip_class;
   live live_vars2 = aco::live_var_analysis(program, &options);

   for (unsigned j = 0; j < program->blocks.size(); j++) {
      Block &b = program->blocks[j];
      for (unsigned i = 0; i < b.instructions.size(); i++)
         assert(live_vars.register_demand[b.index][i] == live_vars2.register_demand[b.index][i]);
      assert(b.register_demand == demands[j]);
   }

   assert(program->max_reg_demand == prev_max_demand);
   assert(program->num_waves == prev_num_waves);
   #endif
}

}