aco: improve FLAT/GLOBAL scheduling
[mesa.git] / src / amd / compiler / aco_scheduler.cpp
/*
 * Copyright © 2018 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include "aco_ir.h"
#include <unordered_set>
#include <algorithm>

#include "vulkan/radv_shader.h" // for radv_nir_compiler_options
#include "amdgfxregs.h"

#define SMEM_WINDOW_SIZE (350 - ctx.num_waves * 35)
#define VMEM_WINDOW_SIZE (1024 - ctx.num_waves * 64)
#define POS_EXP_WINDOW_SIZE 512
#define SMEM_MAX_MOVES (64 - ctx.num_waves * 4)
#define VMEM_MAX_MOVES (128 - ctx.num_waves * 8)
/* creating clauses decreases def-use distances, so make it less aggressive the lower num_waves is */
#define VMEM_CLAUSE_MAX_GRAB_DIST ((ctx.num_waves - 1) * 8)
#define POS_EXP_MAX_MOVES 512
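/* For illustration (values follow from the formulas above, not separate tuning):
 * with num_waves = 8 the SMEM window is 350 - 8*35 = 70 instructions with at
 * most 64 - 8*4 = 32 moves, and the VMEM window is 1024 - 8*64 = 512
 * instructions with at most 128 - 8*8 = 64 moves. At num_waves = 5 the windows
 * widen to 175 and 704 with 44 and 88 moves, while the clause grab distance
 * shrinks from 56 to 32: clauses shorten def-use distances, so they are less
 * desirable when few waves are available to hide latency. */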

namespace aco {

struct sched_ctx {
   std::vector<bool> depends_on;
   std::vector<bool> RAR_dependencies;
   /* For downwards VMEM scheduling, same as RAR_dependencies but excludes the
    * instructions in the clause, since new instructions in the clause are not
    * moved past any other instructions in the clause. */
   std::vector<bool> new_RAR_dependencies;

   RegisterDemand max_registers;
   int16_t num_waves;
   int16_t last_SMEM_stall;
   int last_SMEM_dep_idx;
};

/* This scheduler is a simple bottom-up pass based on ideas from
 * "A Novel Lightweight Instruction Scheduling Algorithm for Just-In-Time Compiler"
 * by Xiaohua Shi and Peng Guo.
 * The basic approach is to iterate over all instructions. When a memory instruction
 * is encountered, the pass tries to move independent instructions from above and below
 * between the memory instruction and its first user.
 * The novelty is that this scheduler takes the current register pressure into account:
 * instructions will only be moved if the register pressure won't exceed a certain bound.
 */
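
/* A sketch of the intended effect, using hypothetical instructions:
 *
 *    before:                               after:
 *      s_load_dwordx4  s[0:3], ...           s_load_dwordx4  s[0:3], ...
 *      s_buffer_load_dword s4, s[0:3], 0     v_mul_f32 v0, v1, v2
 *      v_mul_f32 v0, v1, v2                  s_buffer_load_dword s4, s[0:3], 0
 *
 * The independent v_mul_f32 is moved between the load and its first user, where
 * it can hide some of the load's latency, but only as long as the resulting
 * register demand stays within sched_ctx::max_registers. */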

/* Moves the element at index idx so that it ends up immediately before the
 * element that was originally at index before. */
template <typename T>
void move_element(T& list, size_t idx, size_t before) {
   if (idx < before) {
      auto begin = std::next(list.begin(), idx);
      auto end = std::next(list.begin(), before);
      std::rotate(begin, begin + 1, end);
   } else if (idx > before) {
      auto begin = std::next(list.begin(), before);
      auto end = std::next(list.begin(), idx + 1);
      std::rotate(begin, end - 1, end);
   }
}
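
/* Example: for a vector {a, b, c, d, e},
 *    move_element(v, 4, 1) yields {a, e, b, c, d}   (e moved up before b)
 *    move_element(v, 1, 4) yields {a, c, d, b, e}   (b moved down before e)
 * Each direction is a single std::rotate, so a move costs O(distance). */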

/* Returns the net change in register demand caused by the instruction:
 * definitions which stay live minus operands which die here. */
static RegisterDemand getLiveChanges(aco_ptr<Instruction>& instr)
{
   RegisterDemand changes;
   for (const Definition& def : instr->definitions) {
      if (!def.isTemp() || def.isKill())
         continue;
      changes += def.getTemp();
   }

   for (const Operand& op : instr->operands) {
      if (!op.isTemp() || !op.isFirstKill())
         continue;
      changes -= op.getTemp();
   }

   return changes;
}

/* Returns the registers occupied only at the instruction itself:
 * definitions which are immediately dead. */
static RegisterDemand getTempRegisters(aco_ptr<Instruction>& instr)
{
   RegisterDemand temp_registers;
   for (const Definition& def : instr->definitions) {
      if (!def.isTemp() || !def.isKill())
         continue;
      temp_registers += def.getTemp();
   }
   return temp_registers;
}
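
/* Example for the two helpers above: for a hypothetical "v0 = v_add_f32 v1, v2"
 * where v1 dies at this instruction and v2 stays live, getLiveChanges() returns
 * +1 VGPR (surviving def) - 1 VGPR (first-killed operand) = 0, i.e. the demand
 * is unchanged across the instruction. getTempRegisters() instead collects
 * definitions that are immediately dead: they occupy a register only at the
 * instruction itself, which is why e.g. the downward passes recompute the
 * demand at a moved instruction's new position as
 * register_demand[insert_idx - 1] minus the old temporaries plus its own. */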

static bool is_spill_reload(aco_ptr<Instruction>& instr)
{
   return instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload;
}

bool can_move_instr(aco_ptr<Instruction>& instr, Instruction* current, int moving_interaction)
{
   /* don't move exports so that they stay closer together */
   if (instr->format == Format::EXP)
      return false;

   /* don't move s_memtime/s_memrealtime */
   if (instr->opcode == aco_opcode::s_memtime || instr->opcode == aco_opcode::s_memrealtime)
      return false;

   /* handle barriers */

   /* TODO: instead of stopping, maybe try to move the barriers and any
    * instructions interacting with them instead? */
   if (instr->format != Format::PSEUDO_BARRIER) {
      if (instr->opcode == aco_opcode::s_barrier) {
         bool can_reorder = false;
         switch (current->format) {
         case Format::SMEM:
            can_reorder = static_cast<SMEM_instruction*>(current)->can_reorder;
            break;
         case Format::MUBUF:
            can_reorder = static_cast<MUBUF_instruction*>(current)->can_reorder;
            break;
         case Format::MIMG:
            can_reorder = static_cast<MIMG_instruction*>(current)->can_reorder;
            break;
         case Format::FLAT:
         case Format::GLOBAL:
         case Format::SCRATCH:
            can_reorder = static_cast<FLAT_instruction*>(current)->can_reorder;
            break;
         default:
            break;
         }
         return can_reorder && moving_interaction == barrier_none;
      } else {
         return true;
      }
   }

   int interaction = get_barrier_interaction(current);
   interaction |= moving_interaction;

   switch (instr->opcode) {
   case aco_opcode::p_memory_barrier_atomic:
      return !(interaction & barrier_atomic);
   /* For now, buffer and image barriers are treated the same. This is because of
    * dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.device.payload_nonlocal.buffer.guard_nonlocal.image.comp
    * which seems to use an image load to determine whether the result of a buffer load is valid, so the ordering of the two loads is important.
    * I /think/ we should probably eventually expand the meaning of a buffer barrier so that all buffer operations before it must stay before it,
    * and both image and buffer operations after it must stay after it. We should also do the same for image barriers.
    * Or perhaps the problem is that we don't have a combined barrier instruction for both buffers and images, but the CTS test expects us to?
    * Either way, this solution should work. */
   case aco_opcode::p_memory_barrier_buffer:
   case aco_opcode::p_memory_barrier_image:
      return !(interaction & (barrier_image | barrier_buffer));
   case aco_opcode::p_memory_barrier_shared:
      return !(interaction & barrier_shared);
   case aco_opcode::p_memory_barrier_all:
      return interaction == barrier_none;
   default:
      return false;
   }
}
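
/* Example of the interaction check above: once a DS (LDS) instruction is part
 * of the moving set, moving_interaction contains barrier_shared, so a
 * p_memory_barrier_shared candidate returns false here and the caller stops
 * scheduling rather than reorder shared-memory accesses across their barrier. */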

bool can_reorder(Instruction* candidate)
{
   switch (candidate->format) {
   case Format::SMEM:
      return static_cast<SMEM_instruction*>(candidate)->can_reorder;
   case Format::MUBUF:
      return static_cast<MUBUF_instruction*>(candidate)->can_reorder;
   case Format::MIMG:
      return static_cast<MIMG_instruction*>(candidate)->can_reorder;
   case Format::MTBUF:
      return static_cast<MTBUF_instruction*>(candidate)->can_reorder;
   case Format::FLAT:
   case Format::GLOBAL:
   case Format::SCRATCH:
      return static_cast<FLAT_instruction*>(candidate)->can_reorder;
   default:
      return true;
   }
}

void schedule_SMEM(sched_ctx& ctx, Block* block,
                   std::vector<RegisterDemand>& register_demand,
                   Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = SMEM_WINDOW_SIZE;
   int max_moves = SMEM_MAX_MOVES;
   int16_t k = 0;
   bool can_reorder_cur = can_reorder(current);

   /* don't move s_memtime/s_memrealtime */
   if (current->opcode == aco_opcode::s_memtime || current->opcode == aco_opcode::s_memrealtime)
      return;

   /* create the initial set of values which current depends on */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   for (const Operand& op : current->operands) {
      if (op.isTemp())
         ctx.depends_on[op.tempId()] = true;
   }

   /* maintain how many registers remain free when moving instructions */
   RegisterDemand register_pressure = register_demand[idx];

   /* first, check if we have instructions before current to move down */
   int insert_idx = idx + 1;
   int moving_interaction = barrier_none;
   bool moving_spill = false;

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
      bool can_reorder_candidate = can_reorder(candidate.get());

      /* break if we'd make the previous SMEM instruction stall */
      bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
      if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0)
         break;

      /* break when encountering another MEM instruction, logical_start or barriers */
      if (!can_reorder_candidate && !can_reorder_cur)
         break;
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
      if (candidate->opcode == aco_opcode::p_exit_early_if)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;
      if (candidate->isVMEM())
         break;
      register_pressure.update(register_demand[candidate_idx]);

      /* if current depends on candidate, add additional dependencies and continue */
      bool can_move_down = true;
      bool writes_exec = false;
      for (const Definition& def : candidate->definitions) {
         if (def.isTemp() && ctx.depends_on[def.tempId()])
            can_move_down = false;
         if (def.isFixed() && def.physReg() == exec)
            writes_exec = true;
      }
      if (writes_exec)
         break;

      if (moving_spill && is_spill_reload(candidate))
         can_move_down = false;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         can_move_down = false;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (!can_move_down) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.depends_on[op.tempId()] = true;
         }
         can_reorder_cur &= can_reorder_candidate;
         continue;
      }

      bool register_pressure_unknown = false;
      /* check if one of candidate's operands is killed by a depending instruction */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.depends_on[op.tempId()]) {
            // FIXME: account for difference in register pressure
            register_pressure_unknown = true;
         }
      }
      if (register_pressure_unknown) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.depends_on[op.tempId()] = true;
         }
         can_reorder_cur &= can_reorder_candidate;
         continue;
      }

      /* check if register pressure is low enough: the diff is negative if register pressure is increased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand tempDemand = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure - candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand tempDemand2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - tempDemand2 + tempDemand;
      if (new_demand.exceeds(ctx.max_registers))
         break;
      // TODO: we might want to look further to find a sequence of instructions to move down which doesn't exceed reg pressure

      /* move the candidate below the memory load */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = candidate_idx; i < insert_idx - 1; i++) {
         register_demand[i] -= candidate_diff;
      }
      register_demand[insert_idx - 1] = new_demand;
      register_pressure -= candidate_diff;

      if (candidate_idx < ctx.last_SMEM_dep_idx)
         ctx.last_SMEM_stall++;
      insert_idx--;
      k++;
   }

   /* create the initial set of values which depend on current */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   std::fill(ctx.RAR_dependencies.begin(), ctx.RAR_dependencies.end(), false);
   for (const Definition& def : current->definitions) {
      if (def.isTemp())
         ctx.depends_on[def.tempId()] = true;
   }

   /* find the first instruction depending on current or find another MEM */
   insert_idx = idx + 1;
   moving_interaction = barrier_none;
   moving_spill = false;
   can_reorder_cur = true;

   bool found_dependency = false;
   /* second, check if we have instructions after current to move up */
   for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) {
      assert(candidate_idx < (int) block->instructions.size());
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
      bool can_reorder_candidate = can_reorder(candidate.get());

      if (candidate->opcode == aco_opcode::p_logical_end)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;

      const bool writes_exec = std::any_of(candidate->definitions.begin(), candidate->definitions.end(),
                                           [](const Definition& def) { return def.isFixed() && def.physReg() == exec; });
      if (writes_exec)
         break;

      /* check if candidate depends on current */
      bool is_dependency = std::any_of(candidate->operands.begin(), candidate->operands.end(),
                                       [&ctx](const Operand& op) { return op.isTemp() && ctx.depends_on[op.tempId()]; });
      /* no need to steal from following VMEM instructions */
      if (is_dependency && candidate->isVMEM())
         break;
      if (moving_spill && is_spill_reload(candidate))
         is_dependency = true;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         is_dependency = true;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (is_dependency) {
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.depends_on[def.tempId()] = true;
         }
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.RAR_dependencies[op.tempId()] = true;
         }
         if (!found_dependency) {
            insert_idx = candidate_idx;
            found_dependency = true;
            /* init register pressure */
            register_pressure = register_demand[insert_idx - 1];
         }
      }

      if (!can_reorder_candidate && !can_reorder_cur)
         break;

      if (!found_dependency) {
         k++;
         continue;
      }

      /* update register pressure */
      register_pressure.update(register_demand[candidate_idx - 1]);

      if (is_dependency) {
         can_reorder_cur &= can_reorder_candidate;
         continue;
      }
      assert(insert_idx != idx);

      // TODO: correctly calculate register pressure for this case
      bool register_pressure_unknown = false;
      /* check if candidate uses/kills an operand which is used by a dependency */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.RAR_dependencies[op.tempId()])
            register_pressure_unknown = true;
      }
      if (register_pressure_unknown) {
         if (candidate->isVMEM())
            break;
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.RAR_dependencies[def.tempId()] = true;
         }
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.RAR_dependencies[op.tempId()] = true;
         }
         can_reorder_cur &= can_reorder_candidate;
         continue;
      }

      /* check if register pressure is low enough: the diff is negative if register pressure is decreased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand temp = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure + candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + candidate_diff + temp;
      if (new_demand.exceeds(ctx.max_registers))
         break;

      /* move the candidate above the insert_idx */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = insert_idx + 1; i <= candidate_idx; i++) {
         register_demand[i] += candidate_diff;
      }
      register_demand[insert_idx] = new_demand;
      register_pressure += candidate_diff;
      insert_idx++;
      k++;
   }

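   /* Bookkeeping for the SMEM stall heuristic: the counter starts at
    * 10 - num_waves - k, so more waves in flight and more instructions moved
    * next to this load mean a more negative value, i.e. more latency-hiding
    * slack. Both schedulers increment it whenever they later pull an
    * instruction out of the region between this load and its first user, and
    * they stop once it reaches zero, as further moves would likely make the
    * load stall. */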
   ctx.last_SMEM_dep_idx = found_dependency ? insert_idx : 0;
   ctx.last_SMEM_stall = 10 - ctx.num_waves - k;
}

void schedule_VMEM(sched_ctx& ctx, Block* block,
                   std::vector<RegisterDemand>& register_demand,
                   Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = VMEM_WINDOW_SIZE;
   int max_moves = VMEM_MAX_MOVES;
   int clause_max_grab_dist = VMEM_CLAUSE_MAX_GRAB_DIST;
   int16_t k = 0;
   /* initially true as we don't pull other VMEM instructions
    * through the current instruction */
   bool can_reorder_vmem = true;
   bool can_reorder_smem = true;

   /* create the initial set of values which current depends on */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   std::fill(ctx.RAR_dependencies.begin(), ctx.RAR_dependencies.end(), false);
   std::fill(ctx.new_RAR_dependencies.begin(), ctx.new_RAR_dependencies.end(), false);
   for (const Operand& op : current->operands) {
      if (op.isTemp()) {
         ctx.depends_on[op.tempId()] = true;
         if (op.isFirstKill())
            ctx.RAR_dependencies[op.tempId()] = true;
      }
   }

   /* maintain how many registers remain free when moving instructions */
   RegisterDemand register_pressure_indep = register_demand[idx];
   RegisterDemand register_pressure_clause = register_demand[idx];

   /* first, check if we have instructions before current to move down */
   int indep_insert_idx = idx + 1;
   int clause_insert_idx = idx;
   int moving_interaction = barrier_none;
   bool moving_spill = false;

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
      bool can_reorder_candidate = can_reorder(candidate.get());
      bool is_vmem = candidate->isVMEM() || candidate->isFlatOrGlobal();

      /* break when encountering a non-reorderable SMEM instruction, logical_start or barriers */
      if (!can_reorder_smem && candidate->format == Format::SMEM && !can_reorder_candidate)
         break;
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
      if (candidate->opcode == aco_opcode::p_exit_early_if)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;

      /* break if we'd make the previous SMEM instruction stall */
      bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
      if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0)
         break;
      register_pressure_indep.update(register_demand[candidate_idx]);

      bool part_of_clause = false;
      if (current->isVMEM() == candidate->isVMEM()) {
         bool same_resource = true;
         if (current->isVMEM())
            same_resource = candidate->operands[1].tempId() == current->operands[1].tempId();
         bool can_reorder = can_reorder_vmem || can_reorder_candidate;
         int grab_dist = clause_insert_idx - candidate_idx;
         /* We can't easily tell how much this will decrease the def-to-use
          * distances, so just use how far it will be moved as a heuristic. */
         part_of_clause = can_reorder && same_resource && grab_dist < clause_max_grab_dist;
      }
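      /* Example: with num_waves = 8 the grab distance limit is (8-1)*8 = 56, so
       * a buffer load using the same resource descriptor (operand 1) as current
       * may be pulled down across up to 55 instructions to join the clause; at
       * num_waves = 2 the limit is only 8, since the shortened def-use
       * distances inside a clause hurt more at low wave counts. */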

      /* if current depends on candidate, add additional dependencies and continue */
      bool can_move_down = !is_vmem || part_of_clause;
      bool writes_exec = false;
      for (const Definition& def : candidate->definitions) {
         if (def.isTemp() && ctx.depends_on[def.tempId()])
            can_move_down = false;
         if (def.isFixed() && def.physReg() == exec)
            writes_exec = true;
      }
      if (writes_exec)
         break;

      if (moving_spill && is_spill_reload(candidate))
         can_move_down = false;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         can_move_down = false;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (!can_move_down) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp()) {
               ctx.depends_on[op.tempId()] = true;
               if (op.isFirstKill()) {
                  ctx.RAR_dependencies[op.tempId()] = true;
                  ctx.new_RAR_dependencies[op.tempId()] = true;
               }
            }
         }
         register_pressure_clause.update(register_demand[candidate_idx]);
         can_reorder_smem &= candidate->format != Format::SMEM || can_reorder_candidate;
         can_reorder_vmem &= !is_vmem || can_reorder_candidate;
         continue;
      }

      if (part_of_clause) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp()) {
               ctx.depends_on[op.tempId()] = true;
               if (op.isFirstKill())
                  ctx.RAR_dependencies[op.tempId()] = true;
            }
         }
      }

      bool register_pressure_unknown = false;
      std::vector<bool>& RAR_deps = part_of_clause ? ctx.new_RAR_dependencies : ctx.RAR_dependencies;
      /* check if one of candidate's operands is killed by a depending instruction */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && RAR_deps[op.tempId()]) {
            // FIXME: account for difference in register pressure
            register_pressure_unknown = true;
         }
      }
      if (register_pressure_unknown) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp()) {
               ctx.depends_on[op.tempId()] = true;
               if (op.isFirstKill()) {
                  ctx.RAR_dependencies[op.tempId()] = true;
                  ctx.new_RAR_dependencies[op.tempId()] = true;
               }
            }
         }
         register_pressure_clause.update(register_demand[candidate_idx]);
         can_reorder_smem &= candidate->format != Format::SMEM || can_reorder_candidate;
         can_reorder_vmem &= !is_vmem || can_reorder_candidate;
         continue;
      }

      int insert_idx = part_of_clause ? clause_insert_idx : indep_insert_idx;
      RegisterDemand register_pressure = part_of_clause ? register_pressure_clause : register_pressure_indep;

      /* check if register pressure is low enough: the diff is negative if register pressure is increased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand temp = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure - candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + temp;
      if (new_demand.exceeds(ctx.max_registers))
         break;
      // TODO: we might want to look further to find a sequence of instructions to move down which doesn't exceed reg pressure

      /* move the candidate below the memory load */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = candidate_idx; i < insert_idx - 1; i++) {
         register_demand[i] -= candidate_diff;
      }
      register_demand[insert_idx - 1] = new_demand;
      register_pressure_clause -= candidate_diff;
      clause_insert_idx--;
      if (!part_of_clause) {
         register_pressure_indep -= candidate_diff;
         indep_insert_idx--;
      }
      k++;
      if (candidate_idx < ctx.last_SMEM_dep_idx)
         ctx.last_SMEM_stall++;
   }

   /* create the initial set of values which depend on current */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   std::fill(ctx.RAR_dependencies.begin(), ctx.RAR_dependencies.end(), false);
   for (const Definition& def : current->definitions) {
      if (def.isTemp())
         ctx.depends_on[def.tempId()] = true;
   }

   /* find the first instruction depending on current or find another VMEM */
   RegisterDemand register_pressure;
   int insert_idx = idx;
   moving_interaction = barrier_none;
   moving_spill = false;
   // TODO: differentiate between loads and stores (load-load can always reorder)
   can_reorder_vmem = true;
   can_reorder_smem = true;

   bool found_dependency = false;
   /* second, check if we have instructions after current to move up */
   for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) {
      assert(candidate_idx < (int) block->instructions.size());
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
      bool can_reorder_candidate = can_reorder(candidate.get());
      bool is_vmem = candidate->isVMEM() || candidate->isFlatOrGlobal();

      if (candidate->opcode == aco_opcode::p_logical_end)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;

      const bool writes_exec = std::any_of(candidate->definitions.begin(), candidate->definitions.end(),
                                           [](const Definition& def) { return def.isFixed() && def.physReg() == exec; });
      if (writes_exec)
         break;

      /* check if candidate depends on current */
      bool is_dependency = false;
      if (candidate->format == Format::SMEM)
         is_dependency = !can_reorder_smem && !can_reorder_candidate;
      if (is_vmem)
         is_dependency = !can_reorder_vmem && !can_reorder_candidate;
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.depends_on[op.tempId()]) {
            is_dependency = true;
            break;
         }
      }
      if (moving_spill && is_spill_reload(candidate))
         is_dependency = true;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         is_dependency = true;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (is_dependency) {
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.depends_on[def.tempId()] = true;
         }
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.RAR_dependencies[op.tempId()] = true;
         }
         /* update flag whether we can reorder other memory instructions */
         can_reorder_smem &= candidate->format != Format::SMEM || can_reorder_candidate;
         can_reorder_vmem &= !is_vmem || can_reorder_candidate;

         if (!found_dependency) {
            insert_idx = candidate_idx;
            found_dependency = true;
            /* init register pressure */
            register_pressure = register_demand[insert_idx - 1];
            continue;
         }

      } else if (is_vmem) {
         /* don't move up dependencies of other VMEM instructions */
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.depends_on[def.tempId()] = true;
         }
      }

      /* update register pressure */
      register_pressure.update(register_demand[candidate_idx - 1]);

      if (is_dependency || !found_dependency)
         continue;
      assert(insert_idx != idx);

      bool register_pressure_unknown = false;
      /* check if candidate uses/kills an operand which is used by a dependency */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && op.isFirstKill() && ctx.RAR_dependencies[op.tempId()])
            register_pressure_unknown = true;
      }
      if (register_pressure_unknown) {
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.depends_on[def.tempId()] = true;
         }
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.RAR_dependencies[op.tempId()] = true;
         }
         can_reorder_smem &= candidate->format != Format::SMEM || can_reorder_candidate;
         can_reorder_vmem &= !is_vmem || can_reorder_candidate;
         continue;
      }

      /* check if register pressure is low enough: the diff is negative if register pressure is decreased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand temp = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure + candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + candidate_diff + temp;
      if (new_demand.exceeds(ctx.max_registers))
         break;

      /* move the candidate above the insert_idx */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = insert_idx + 1; i <= candidate_idx; i++) {
         register_demand[i] += candidate_diff;
      }
      register_demand[insert_idx] = new_demand;
      register_pressure += candidate_diff;
      insert_idx++;
      k++;
   }
}

void schedule_position_export(sched_ctx& ctx, Block* block,
                              std::vector<RegisterDemand>& register_demand,
                              Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = POS_EXP_WINDOW_SIZE;
   int max_moves = POS_EXP_MAX_MOVES;
   int16_t k = 0;

   /* create the initial set of values which current depends on */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   std::fill(ctx.RAR_dependencies.begin(), ctx.RAR_dependencies.end(), false);
   for (const Operand& op : current->operands) {
      if (op.isTemp()) {
         ctx.depends_on[op.tempId()] = true;
         if (op.isFirstKill())
            ctx.RAR_dependencies[op.tempId()] = true;
      }
   }

   /* maintain how many registers remain free when moving instructions */
   RegisterDemand register_pressure = register_demand[idx];

   /* first, check if we have instructions before current to move down */
   int insert_idx = idx + 1;
   int moving_interaction = barrier_none;
   bool moving_spill = false;

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      /* break when encountering logical_start or barriers */
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
      if (candidate->opcode == aco_opcode::p_exit_early_if)
         break;
      if (candidate->isVMEM() || candidate->format == Format::SMEM || candidate->isFlatOrGlobal())
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;

      register_pressure.update(register_demand[candidate_idx]);

      /* if current depends on candidate, add additional dependencies and continue */
      bool can_move_down = true;
      bool writes_exec = false;
      for (unsigned i = 0; i < candidate->definitions.size(); i++) {
         if (candidate->definitions[i].isTemp() && ctx.depends_on[candidate->definitions[i].tempId()])
            can_move_down = false;
         if (candidate->definitions[i].isFixed() && candidate->definitions[i].physReg() == exec)
            writes_exec = true;
      }
      if (writes_exec)
         break;

      if (moving_spill && is_spill_reload(candidate))
         can_move_down = false;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         can_move_down = false;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (!can_move_down) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp()) {
               ctx.depends_on[op.tempId()] = true;
               if (op.isFirstKill())
                  ctx.RAR_dependencies[op.tempId()] = true;
            }
         }
         continue;
      }

      bool register_pressure_unknown = false;
      /* check if one of candidate's operands is killed by a depending instruction */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.RAR_dependencies[op.tempId()]) {
            // FIXME: account for difference in register pressure
            register_pressure_unknown = true;
         }
      }
      if (register_pressure_unknown) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp()) {
               ctx.depends_on[op.tempId()] = true;
               if (op.isFirstKill())
                  ctx.RAR_dependencies[op.tempId()] = true;
            }
         }
         continue;
      }

      /* check if register pressure is low enough: the diff is negative if register pressure is increased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand temp = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure - candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + temp;
      if (new_demand.exceeds(ctx.max_registers))
         break;
      // TODO: we might want to look further to find a sequence of instructions to move down which doesn't exceed reg pressure

      /* move the candidate below the export */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = candidate_idx; i < insert_idx - 1; i++) {
         register_demand[i] -= candidate_diff;
      }
      register_demand[insert_idx - 1] = new_demand;
      register_pressure -= candidate_diff;
      insert_idx--;
      k++;
   }
}

void schedule_block(sched_ctx& ctx, Program *program, Block* block, live& live_vars)
{
   ctx.last_SMEM_dep_idx = 0;
   ctx.last_SMEM_stall = INT16_MIN;

   /* go through all instructions and find memory loads */
   for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
      Instruction* current = block->instructions[idx].get();

      if (current->definitions.empty())
         continue;

      if (current->isVMEM() || current->isFlatOrGlobal())
         schedule_VMEM(ctx, block, live_vars.register_demand[block->index], current, idx);
      if (current->format == Format::SMEM)
         schedule_SMEM(ctx, block, live_vars.register_demand[block->index], current, idx);
   }

   if ((program->stage & hw_vs) && block->index == program->blocks.size() - 1) {
      /* Try to move position exports as far up as possible, to reduce register
       * usage and because ISA reference guides say so. */
      for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
         Instruction* current = block->instructions[idx].get();

         if (current->format == Format::EXP) {
            unsigned target = static_cast<Export_instruction*>(current)->dest;
            if (target >= V_008DFC_SQ_EXP_POS && target < V_008DFC_SQ_EXP_PARAM)
               schedule_position_export(ctx, block, live_vars.register_demand[block->index], current, idx);
         }
      }
   }

   /* resummarize the block's register demand */
   block->register_demand = RegisterDemand();
   for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
      block->register_demand.update(live_vars.register_demand[block->index][idx]);
   }
}


void schedule_program(Program *program, live& live_vars)
{
   sched_ctx ctx;
   ctx.depends_on.resize(program->peekAllocationId());
   ctx.RAR_dependencies.resize(program->peekAllocationId());
   ctx.new_RAR_dependencies.resize(program->peekAllocationId());
   /* Allowing the scheduler to reduce the number of waves to as low as 5
    * improves performance of Thrones of Britannia significantly and doesn't
    * seem to hurt anything else. */
   if (program->num_waves <= 5)
      ctx.num_waves = program->num_waves;
   else if (program->max_reg_demand.vgpr >= 32)
      ctx.num_waves = 5;
   else if (program->max_reg_demand.vgpr >= 28)
      ctx.num_waves = 6;
   else if (program->max_reg_demand.vgpr >= 24)
      ctx.num_waves = 7;
   else
      ctx.num_waves = 8;
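   /* Example: a shader running 8 waves with max_reg_demand.vgpr of 30 falls
    * into the ">= 28" bucket, so the scheduler may raise pressure until only
    * 6 waves fit, while a shader already limited to 4 waves keeps its 4; the
    * scheduler never lowers the target below 5 waves on its own. */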

   assert(ctx.num_waves > 0 && ctx.num_waves <= program->num_waves);
   ctx.max_registers = { int16_t(((256 / ctx.num_waves) & ~3) - 2), int16_t(get_addr_sgpr_from_waves(program, ctx.num_waves))};
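   /* Example of the VGPR bound above: ctx.num_waves = 8 gives
    * (256/8 & ~3) - 2 = 30 VGPRs and ctx.num_waves = 5 gives
    * (256/5 & ~3) - 2 = 46 VGPRs (51 rounded down to a multiple of 4, minus 2);
    * the SGPR bound comes from get_addr_sgpr_from_waves(). */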

   for (Block& block : program->blocks)
      schedule_block(ctx, program, &block, live_vars);

   /* update max_reg_demand and num_waves */
   RegisterDemand new_demand;
   for (Block& block : program->blocks) {
      new_demand.update(block.register_demand);
   }
   update_vgpr_sgpr_demand(program, new_demand);

   /* if enabled, this code asserts that register_demand is updated correctly */
   #if 0
   int prev_num_waves = program->num_waves;
   const RegisterDemand prev_max_demand = program->max_reg_demand;

   std::vector<RegisterDemand> demands(program->blocks.size());
   for (unsigned j = 0; j < program->blocks.size(); j++) {
      demands[j] = program->blocks[j].register_demand;
   }

   struct radv_nir_compiler_options options;
   options.chip_class = program->chip_class;
   live live_vars2 = aco::live_var_analysis(program, &options);

   for (unsigned j = 0; j < program->blocks.size(); j++) {
      Block &b = program->blocks[j];
      for (unsigned i = 0; i < b.instructions.size(); i++)
         assert(live_vars.register_demand[b.index][i] == live_vars2.register_demand[b.index][i]);
      assert(b.register_demand == demands[j]);
   }

   assert(program->max_reg_demand == prev_max_demand);
   assert(program->num_waves == prev_num_waves);
   #endif
}

}