/*
 * Copyright © 2018 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include "aco_ir.h"
#include "aco_builder.h"
#include <unordered_set>
#include <algorithm>

#include "vulkan/radv_shader.h" // for radv_nir_compiler_options
#include "amdgfxregs.h"

#define SMEM_WINDOW_SIZE (350 - ctx.num_waves * 35)
#define VMEM_WINDOW_SIZE (1024 - ctx.num_waves * 64)
#define POS_EXP_WINDOW_SIZE 512
#define SMEM_MAX_MOVES (64 - ctx.num_waves * 4)
#define VMEM_MAX_MOVES (128 - ctx.num_waves * 8)
/* creating clauses decreases def-use distances, so make clause creation less aggressive the fewer waves we have */
#define VMEM_CLAUSE_MAX_GRAB_DIST ((ctx.num_waves - 1) * 8)
#define POS_EXP_MAX_MOVES 512
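/* Illustrative example of the heuristics above (assuming ctx.num_waves == 7):
 * the SMEM pass scans a window of 350 - 7 * 35 = 105 instructions and performs
 * at most 64 - 7 * 4 = 36 moves, while the VMEM pass gets a window of
 * 1024 - 7 * 64 = 576 instructions, at most 128 - 7 * 8 = 72 moves and a clause
 * grab distance of (7 - 1) * 8 = 48. Fewer waves mean less latency hiding from
 * wave switching, so the windows and move budgets grow as num_waves drops. */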

namespace aco {

enum MoveResult {
   move_success,
   move_fail_ssa,
   move_fail_rar,
   move_fail_pressure,
};

struct MoveState {
   RegisterDemand max_registers;

   Block *block;
   Instruction *current;
   RegisterDemand *register_demand;
   bool improved_rar;

   std::vector<bool> depends_on;
   /* Two are needed because, for downwards VMEM scheduling, one needs to
    * exclude the instructions in the clause, since new instructions in the
    * clause are not moved past any other instructions in the clause. */
   std::vector<bool> RAR_dependencies;
   std::vector<bool> RAR_dependencies_clause;

   int source_idx;
   int insert_idx, insert_idx_clause;
   RegisterDemand total_demand, total_demand_clause;

   /* for moving instructions before the current instruction to after it */
   void downwards_init(int current_idx, bool improved_rar, bool may_form_clauses);
   MoveResult downwards_move(bool clause);
   void downwards_skip();

   /* for moving instructions after the first use of the current instruction upwards */
   void upwards_init(int source_idx, bool improved_rar);
   bool upwards_check_deps();
   void upwards_set_insert_idx(int before);
   MoveResult upwards_move();
   void upwards_skip();

private:
   void downwards_advance_helper();
};

struct sched_ctx {
   int16_t num_waves;
   int16_t last_SMEM_stall;
   int last_SMEM_dep_idx;
   MoveState mv;
};

/* This scheduler is a simple bottom-up pass based on ideas from
 * "A Novel Lightweight Instruction Scheduling Algorithm for Just-In-Time Compiler"
 * by Xiaohua Shi and Peng Guo.
 * The basic approach is to iterate over all instructions. When a memory instruction
 * is encountered, it tries to move independent instructions from above and below
 * between the memory instruction and its first user.
 * The novelty is that this scheduler takes the current register pressure into account:
 * instructions will only be moved if the register pressure won't exceed a certain bound.
 */

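/* Moves the element at index idx so that it ends up immediately before the
 * element that was at index "before", shifting everything in between by one
 * position. For example (illustrative only), with a vector {a, b, c, d, e},
 * move_element(v.begin(), 3, 1) yields {a, d, b, c, e}. */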
template <typename T>
void move_element(T begin_it, size_t idx, size_t before) {
   if (idx < before) {
      auto begin = std::next(begin_it, idx);
      auto end = std::next(begin_it, before);
      std::rotate(begin, begin + 1, end);
   } else if (idx > before) {
      auto begin = std::next(begin_it, before);
      auto end = std::next(begin_it, idx + 1);
      std::rotate(begin, end - 1, end);
   }
}

void MoveState::downwards_advance_helper()
{
   source_idx--;
   total_demand.update(register_demand[source_idx]);
}

void MoveState::downwards_init(int current_idx, bool improved_rar_, bool may_form_clauses)
{
   improved_rar = improved_rar_;
   source_idx = current_idx;

   insert_idx = current_idx + 1;
   insert_idx_clause = current_idx;

   total_demand = total_demand_clause = register_demand[current_idx];

   std::fill(depends_on.begin(), depends_on.end(), false);
   if (improved_rar) {
      std::fill(RAR_dependencies.begin(), RAR_dependencies.end(), false);
      if (may_form_clauses)
         std::fill(RAR_dependencies_clause.begin(), RAR_dependencies_clause.end(), false);
   }

   for (const Operand& op : current->operands) {
      if (op.isTemp()) {
         depends_on[op.tempId()] = true;
         if (improved_rar && op.isFirstKill())
            RAR_dependencies[op.tempId()] = true;
      }
   }

   /* update total_demand/source_idx */
   downwards_advance_helper();
}

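/* Tries to move the candidate at source_idx down to the current insertion
 * point (the clause insertion point if "clause" is set). Fails with
 * move_fail_ssa if the candidate defines a value that a dependency reads,
 * move_fail_rar if it reads a value that a dependency kills, and
 * move_fail_pressure if the move would push register demand above
 * max_registers. On success, the instruction and the register-demand array
 * are updated and source_idx advances to the next candidate. */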
MoveResult MoveState::downwards_move(bool clause)
{
   aco_ptr<Instruction>& instr = block->instructions[source_idx];

   for (const Definition& def : instr->definitions)
      if (def.isTemp() && depends_on[def.tempId()])
         return move_fail_ssa;

   /* check if one of the candidate's operands is killed by a depending instruction */
   std::vector<bool>& RAR_deps = improved_rar ? (clause ? RAR_dependencies_clause : RAR_dependencies) : depends_on;
   for (const Operand& op : instr->operands) {
      if (op.isTemp() && RAR_deps[op.tempId()]) {
         // FIXME: account for difference in register pressure
         return move_fail_rar;
      }
   }

   if (clause) {
      for (const Operand& op : instr->operands) {
         if (op.isTemp()) {
            depends_on[op.tempId()] = true;
            if (op.isFirstKill())
               RAR_dependencies[op.tempId()] = true;
         }
      }
   }

   int dest_insert_idx = clause ? insert_idx_clause : insert_idx;
   RegisterDemand register_pressure = clause ? total_demand_clause : total_demand;

   const RegisterDemand candidate_diff = get_live_changes(instr);
   const RegisterDemand temp = get_temp_registers(instr);
   if (RegisterDemand(register_pressure - candidate_diff).exceeds(max_registers))
      return move_fail_pressure;
   const RegisterDemand temp2 = get_temp_registers(block->instructions[dest_insert_idx - 1]);
   const RegisterDemand new_demand = register_demand[dest_insert_idx - 1] - temp2 + temp;
   if (new_demand.exceeds(max_registers))
      return move_fail_pressure;

   /* move the candidate below the memory load */
   move_element(block->instructions.begin(), source_idx, dest_insert_idx);

   /* update register pressure */
   move_element(register_demand, source_idx, dest_insert_idx);
   for (int i = source_idx; i < dest_insert_idx - 1; i++)
      register_demand[i] -= candidate_diff;
   register_demand[dest_insert_idx - 1] = new_demand;
   total_demand_clause -= candidate_diff;
   insert_idx_clause--;
   if (!clause) {
      total_demand -= candidate_diff;
      insert_idx--;
   }

   downwards_advance_helper();
   return move_success;
}

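/* Leaves the candidate at source_idx in place and records it as a dependency:
 * its operands are added to depends_on (and to the RAR sets when first-killed),
 * which prevents the instructions producing those operands from being moved
 * below it later. */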
void MoveState::downwards_skip()
{
   aco_ptr<Instruction>& instr = block->instructions[source_idx];

   for (const Operand& op : instr->operands) {
      if (op.isTemp()) {
         depends_on[op.tempId()] = true;
         if (improved_rar && op.isFirstKill()) {
            RAR_dependencies[op.tempId()] = true;
            RAR_dependencies_clause[op.tempId()] = true;
         }
      }
   }
   total_demand_clause.update(register_demand[source_idx]);

   downwards_advance_helper();
}

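/* The upwards helpers are used as follows by the schedulers below:
 * upwards_init() is given the index right after "current"; candidates are
 * skipped (upwards_skip) until the first one that depends on current
 * (upwards_check_deps), which becomes the insertion point via
 * upwards_set_insert_idx(); from then on, independent candidates are moved up
 * to that point with upwards_move(). */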
void MoveState::upwards_init(int source_idx_, bool improved_rar_)
{
   source_idx = source_idx_;
   improved_rar = improved_rar_;

   insert_idx = -1;

   std::fill(depends_on.begin(), depends_on.end(), false);
   std::fill(RAR_dependencies.begin(), RAR_dependencies.end(), false);

   for (const Definition& def : current->definitions) {
      if (def.isTemp())
         depends_on[def.tempId()] = true;
   }
}

bool MoveState::upwards_check_deps()
{
   aco_ptr<Instruction>& instr = block->instructions[source_idx];
   for (const Operand& op : instr->operands) {
      if (op.isTemp() && depends_on[op.tempId()])
         return false;
   }
   return true;
}

void MoveState::upwards_set_insert_idx(int before)
{
   insert_idx = before;
   total_demand = register_demand[before - 1];
}

MoveResult MoveState::upwards_move()
{
   assert(insert_idx >= 0);

   aco_ptr<Instruction>& instr = block->instructions[source_idx];
   for (const Operand& op : instr->operands) {
      if (op.isTemp() && depends_on[op.tempId()])
         return move_fail_ssa;
   }

   /* check if candidate uses/kills an operand which is used by a dependency */
   for (const Operand& op : instr->operands) {
      if (op.isTemp() && (!improved_rar || op.isFirstKill()) && RAR_dependencies[op.tempId()])
         return move_fail_rar;
   }

   /* check if register pressure is low enough: the diff is negative if register pressure is decreased */
   const RegisterDemand candidate_diff = get_live_changes(instr);
   const RegisterDemand temp = get_temp_registers(instr);
   if (RegisterDemand(total_demand + candidate_diff).exceeds(max_registers))
      return move_fail_pressure;
   const RegisterDemand temp2 = get_temp_registers(block->instructions[insert_idx - 1]);
   const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + candidate_diff + temp;
   if (new_demand.exceeds(max_registers))
      return move_fail_pressure;

   /* move the candidate above the insert_idx */
   move_element(block->instructions.begin(), source_idx, insert_idx);

   /* update register pressure */
   move_element(register_demand, source_idx, insert_idx);
   for (int i = insert_idx + 1; i <= source_idx; i++)
      register_demand[i] += candidate_diff;
   register_demand[insert_idx] = new_demand;
   total_demand += candidate_diff;

   insert_idx++;

   total_demand.update(register_demand[source_idx]);
   source_idx++;

   return move_success;
}

void MoveState::upwards_skip()
{
   if (insert_idx >= 0) {
      aco_ptr<Instruction>& instr = block->instructions[source_idx];
      for (const Definition& def : instr->definitions) {
         if (def.isTemp())
            depends_on[def.tempId()] = true;
      }
      for (const Operand& op : instr->operands) {
         if (op.isTemp())
            RAR_dependencies[op.tempId()] = true;
      }
      total_demand.update(register_demand[source_idx]);
   }

   source_idx++;
}

bool is_gs_or_done_sendmsg(const Instruction *instr)
{
   if (instr->opcode == aco_opcode::s_sendmsg) {
      uint16_t imm = static_cast<const SOPP_instruction*>(instr)->imm;
      return (imm & sendmsg_id_mask) == _sendmsg_gs ||
             (imm & sendmsg_id_mask) == _sendmsg_gs_done;
   }
   return false;
}

bool is_done_sendmsg(const Instruction *instr)
{
   if (instr->opcode == aco_opcode::s_sendmsg) {
      uint16_t imm = static_cast<const SOPP_instruction*>(instr)->imm;
      return (imm & sendmsg_id_mask) == _sendmsg_gs_done;
   }
   return false;
}

memory_sync_info get_sync_info_with_hack(const Instruction* instr)
{
   memory_sync_info sync = get_sync_info(instr);
   if (instr->format == Format::SMEM && !instr->operands.empty() && instr->operands[0].bytes() == 16) {
      // FIXME: currently, it doesn't seem beneficial to omit this due to how our scheduler works
      sync.storage = (storage_class)(sync.storage | storage_buffer);
      sync.semantics = (memory_semantics)(sync.semantics | semantic_private);
   }
   return sync;
}

struct memory_event_set {
   bool has_control_barrier;

   unsigned bar_acquire;
   unsigned bar_release;
   unsigned bar_classes;

   unsigned access_acquire;
   unsigned access_release;
   unsigned access_relaxed;
   unsigned access_atomic;
};

struct hazard_query {
   bool contains_spill;
   bool contains_sendmsg;
   memory_event_set mem_events;
   unsigned aliasing_storage; /* storage classes which are accessed (non-SMEM) */
   unsigned aliasing_storage_smem; /* storage classes which are accessed (SMEM) */
};

void init_hazard_query(hazard_query *query) {
   query->contains_spill = false;
   query->contains_sendmsg = false;
   memset(&query->mem_events, 0, sizeof(query->mem_events));
   query->aliasing_storage = 0;
   query->aliasing_storage_smem = 0;
}

void add_memory_event(memory_event_set *set, Instruction *instr, memory_sync_info *sync)
{
   set->has_control_barrier |= is_done_sendmsg(instr);
   if (instr->opcode == aco_opcode::p_barrier) {
      Pseudo_barrier_instruction *bar = static_cast<Pseudo_barrier_instruction*>(instr);
      if (bar->sync.semantics & semantic_acquire)
         set->bar_acquire |= bar->sync.storage;
      if (bar->sync.semantics & semantic_release)
         set->bar_release |= bar->sync.storage;
      set->bar_classes |= bar->sync.storage;

      set->has_control_barrier |= bar->exec_scope > scope_invocation;
   }

   if (!sync->storage)
      return;

   if (sync->semantics & semantic_acquire)
      set->access_acquire |= sync->storage;
   if (sync->semantics & semantic_release)
      set->access_release |= sync->storage;

   if (!(sync->semantics & semantic_private)) {
      if (sync->semantics & semantic_atomic)
         set->access_atomic |= sync->storage;
      else
         set->access_relaxed |= sync->storage;
   }
}

void add_to_hazard_query(hazard_query *query, Instruction *instr)
{
   if (instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload)
      query->contains_spill = true;
   query->contains_sendmsg |= instr->opcode == aco_opcode::s_sendmsg;

   memory_sync_info sync = get_sync_info_with_hack(instr);

   add_memory_event(&query->mem_events, instr, &sync);

   if (!(sync.semantics & semantic_can_reorder)) {
      unsigned storage = sync.storage;
      /* images and buffer/global memory can alias */ //TODO: more precisely, buffer images and buffer/global memory can alias
      if (storage & (storage_buffer | storage_image))
         storage |= storage_buffer | storage_image;
      if (instr->format == Format::SMEM)
         query->aliasing_storage_smem |= storage;
      else
         query->aliasing_storage |= storage;
   }
}

enum HazardResult {
   hazard_success,
   hazard_fail_reorder_vmem_smem,
   hazard_fail_reorder_ds,
   hazard_fail_reorder_sendmsg,
   hazard_fail_spill,
   hazard_fail_export,
   hazard_fail_barrier,
   /* Must stop at these failures: the hazard query code doesn't consider them
    * when instructions are added to the query. */
   hazard_fail_exec,
   hazard_fail_unreorderable,
};

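/* Checks whether "instr" may be reordered across all instructions recorded in
 * the query so far. "upwards" selects the direction: it decides which side of
 * the pair acts as the earlier event when the acquire/release rules below are
 * applied. Returns hazard_success if the reordering is safe. */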
HazardResult perform_hazard_query(hazard_query *query, Instruction *instr, bool upwards)
{
   if (instr->opcode == aco_opcode::p_exit_early_if)
      return hazard_fail_exec;
   for (const Definition& def : instr->definitions) {
      if (def.isFixed() && def.physReg() == exec)
         return hazard_fail_exec;
   }

   /* don't move exports so that they stay closer together */
   if (instr->format == Format::EXP)
      return hazard_fail_export;

   /* don't move non-reorderable instructions */
   if (instr->opcode == aco_opcode::s_memtime ||
       instr->opcode == aco_opcode::s_memrealtime ||
       instr->opcode == aco_opcode::s_setprio ||
       instr->opcode == aco_opcode::s_getreg_b32)
      return hazard_fail_unreorderable;

   memory_event_set instr_set;
   memset(&instr_set, 0, sizeof(instr_set));
   memory_sync_info sync = get_sync_info_with_hack(instr);
   add_memory_event(&instr_set, instr, &sync);

   memory_event_set *first = &instr_set;
   memory_event_set *second = &query->mem_events;
   if (upwards)
      std::swap(first, second);

   /* everything after barrier(acquire) happens after the preceding atomics/control_barriers;
    * everything after load(acquire) happens after the load
    */
   if ((first->has_control_barrier || first->access_atomic) && second->bar_acquire)
      return hazard_fail_barrier;
   if (((first->access_acquire || first->bar_acquire) && second->bar_classes) ||
       ((first->access_acquire | first->bar_acquire) & (second->access_relaxed | second->access_atomic)))
      return hazard_fail_barrier;

   /* everything before barrier(release) happens before the following atomics/control_barriers;
    * everything before store(release) happens before the store
    */
   if (first->bar_release && (second->has_control_barrier || second->access_atomic))
      return hazard_fail_barrier;
   if ((first->bar_classes && (second->bar_release || second->access_release)) ||
       ((first->access_relaxed | first->access_atomic) & (second->bar_release | second->access_release)))
      return hazard_fail_barrier;

   /* don't move memory barriers around other memory barriers */
   if (first->bar_classes && second->bar_classes)
      return hazard_fail_barrier;

   /* Don't move memory accesses to before control barriers. I don't think
    * this is necessary for the Vulkan memory model, but it might be for GLSL450. */
   unsigned control_classes = storage_buffer | storage_atomic_counter | storage_image | storage_shared;
   if (first->has_control_barrier && ((second->access_atomic | second->access_relaxed) & control_classes))
      return hazard_fail_barrier;

   /* don't move memory loads/stores past potentially aliasing loads/stores */
   unsigned aliasing_storage = instr->format == Format::SMEM ?
                               query->aliasing_storage_smem :
                               query->aliasing_storage;
   if ((sync.storage & aliasing_storage) && !(sync.semantics & semantic_can_reorder)) {
      unsigned intersect = sync.storage & aliasing_storage;
      if (intersect & storage_shared)
         return hazard_fail_reorder_ds;
      return hazard_fail_reorder_vmem_smem;
   }

   if ((instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload) &&
       query->contains_spill)
      return hazard_fail_spill;

   if (instr->opcode == aco_opcode::s_sendmsg && query->contains_sendmsg)
      return hazard_fail_reorder_sendmsg;

   return hazard_success;
}

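/* Schedules around the SMEM instruction "current" at index idx: it first tries
 * to move independent instructions from above "current" to below it, then moves
 * independent instructions that follow it up to just before its first user, so
 * that the load's latency can be hidden. last_SMEM_dep_idx/last_SMEM_stall are
 * updated so later calls avoid re-introducing a stall on this load. */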
void schedule_SMEM(sched_ctx& ctx, Block* block,
                   std::vector<RegisterDemand>& register_demand,
                   Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = SMEM_WINDOW_SIZE;
   int max_moves = SMEM_MAX_MOVES;
   int16_t k = 0;

   /* don't move s_memtime/s_memrealtime */
   if (current->opcode == aco_opcode::s_memtime || current->opcode == aco_opcode::s_memrealtime)
      return;

   /* first, check if we have instructions before current to move down */
   hazard_query hq;
   init_hazard_query(&hq);
   add_to_hazard_query(&hq, current);

   ctx.mv.downwards_init(idx, false, false);

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      assert(candidate_idx == ctx.mv.source_idx);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      /* break if we'd make the previous SMEM instruction stall */
      bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
      if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0)
         break;

      /* break when encountering another MEM instruction, logical_start or barriers */
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
      if (candidate->isVMEM())
         break;

      bool can_move_down = true;

      HazardResult haz = perform_hazard_query(&hq, candidate.get(), false);
      if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill || haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier || haz == hazard_fail_export)
         can_move_down = false;
      else if (haz != hazard_success)
         break;

      /* don't use LDS/GDS instructions to hide latency since it can
       * significantly worsen LDS scheduling */
      if (candidate->format == Format::DS || !can_move_down) {
         add_to_hazard_query(&hq, candidate.get());
         ctx.mv.downwards_skip();
         continue;
      }

      MoveResult res = ctx.mv.downwards_move(false);
      if (res == move_fail_ssa || res == move_fail_rar) {
         add_to_hazard_query(&hq, candidate.get());
         ctx.mv.downwards_skip();
         continue;
      } else if (res == move_fail_pressure) {
         break;
      }

      if (candidate_idx < ctx.last_SMEM_dep_idx)
         ctx.last_SMEM_stall++;
      k++;
   }

   /* find the first instruction depending on current or find another MEM */
   ctx.mv.upwards_init(idx + 1, false);

   bool found_dependency = false;
   /* second, check if we have instructions after current to move up */
   for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) {
      assert(candidate_idx == ctx.mv.source_idx);
      assert(candidate_idx < (int) block->instructions.size());
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      if (candidate->opcode == aco_opcode::p_logical_end)
         break;

      /* check if candidate depends on current */
      bool is_dependency = !found_dependency && !ctx.mv.upwards_check_deps();
      /* no need to steal from following VMEM instructions */
      if (is_dependency && candidate->isVMEM())
         break;

      if (found_dependency) {
         HazardResult haz = perform_hazard_query(&hq, candidate.get(), true);
         if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill ||
             haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier ||
             haz == hazard_fail_export)
            is_dependency = true;
         else if (haz != hazard_success)
            break;
      }

      if (is_dependency) {
         if (!found_dependency) {
            ctx.mv.upwards_set_insert_idx(candidate_idx);
            init_hazard_query(&hq);
            found_dependency = true;
         }
      }

      if (is_dependency || !found_dependency) {
         if (found_dependency)
            add_to_hazard_query(&hq, candidate.get());
         else
            k++;
         ctx.mv.upwards_skip();
         continue;
      }

      MoveResult res = ctx.mv.upwards_move();
      if (res == move_fail_ssa || res == move_fail_rar) {
         /* no need to steal from following VMEM instructions */
         if (res == move_fail_ssa && candidate->isVMEM())
            break;
         add_to_hazard_query(&hq, candidate.get());
         ctx.mv.upwards_skip();
         continue;
      } else if (res == move_fail_pressure) {
         break;
      }
      k++;
   }

   ctx.last_SMEM_dep_idx = found_dependency ? ctx.mv.insert_idx : 0;
   ctx.last_SMEM_stall = 10 - ctx.num_waves - k;
}

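/* Same idea as schedule_SMEM, but for VMEM/FLAT/GLOBAL instructions. Besides
 * hiding latency, the downwards pass also tries to form clauses: candidates
 * that access the same resource (for VMEM) can be pulled right next to
 * "current", tracked with a separate hazard query and insertion point and
 * limited by VMEM_CLAUSE_MAX_GRAB_DIST. */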
void schedule_VMEM(sched_ctx& ctx, Block* block,
                   std::vector<RegisterDemand>& register_demand,
                   Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = VMEM_WINDOW_SIZE;
   int max_moves = VMEM_MAX_MOVES;
   int clause_max_grab_dist = VMEM_CLAUSE_MAX_GRAB_DIST;
   int16_t k = 0;

   /* first, check if we have instructions before current to move down */
   hazard_query indep_hq;
   hazard_query clause_hq;
   init_hazard_query(&indep_hq);
   init_hazard_query(&clause_hq);
   add_to_hazard_query(&indep_hq, current);

   ctx.mv.downwards_init(idx, true, true);

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx == ctx.mv.source_idx);
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
      bool is_vmem = candidate->isVMEM() || candidate->isFlatOrGlobal();

      /* break when encountering another VMEM instruction, logical_start or barriers */
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;

      /* break if we'd make the previous SMEM instruction stall */
      bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
      if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0)
         break;

      bool part_of_clause = false;
      if (current->isVMEM() == candidate->isVMEM()) {
         bool same_resource = true;
         if (current->isVMEM())
            same_resource = candidate->operands[0].tempId() == current->operands[0].tempId();
         int grab_dist = ctx.mv.insert_idx_clause - candidate_idx;
         /* We can't easily tell how much this will decrease the def-to-use
          * distances, so just use how far it will be moved as a heuristic. */
         part_of_clause = same_resource && grab_dist < clause_max_grab_dist;
      }

      /* if current depends on candidate, add additional dependencies and continue */
      bool can_move_down = !is_vmem || part_of_clause;

      HazardResult haz = perform_hazard_query(part_of_clause ? &clause_hq : &indep_hq, candidate.get(), false);
      if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill ||
          haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier ||
          haz == hazard_fail_export)
         can_move_down = false;
      else if (haz != hazard_success)
         break;

      if (!can_move_down) {
         add_to_hazard_query(&indep_hq, candidate.get());
         add_to_hazard_query(&clause_hq, candidate.get());
         ctx.mv.downwards_skip();
         continue;
      }

      Instruction *candidate_ptr = candidate.get();
      MoveResult res = ctx.mv.downwards_move(part_of_clause);
      if (res == move_fail_ssa || res == move_fail_rar) {
         add_to_hazard_query(&indep_hq, candidate.get());
         add_to_hazard_query(&clause_hq, candidate.get());
         ctx.mv.downwards_skip();
         continue;
      } else if (res == move_fail_pressure) {
         break;
      }
      if (part_of_clause)
         add_to_hazard_query(&indep_hq, candidate_ptr);
      k++;
      if (candidate_idx < ctx.last_SMEM_dep_idx)
         ctx.last_SMEM_stall++;
   }

   /* find the first instruction depending on current or find another VMEM */
   ctx.mv.upwards_init(idx + 1, true);

   bool found_dependency = false;
   /* second, check if we have instructions after current to move up */
   for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) {
      assert(candidate_idx == ctx.mv.source_idx);
      assert(candidate_idx < (int) block->instructions.size());
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
      bool is_vmem = candidate->isVMEM() || candidate->isFlatOrGlobal();

      if (candidate->opcode == aco_opcode::p_logical_end)
         break;

      /* check if candidate depends on current */
      bool is_dependency = false;
      if (found_dependency) {
         HazardResult haz = perform_hazard_query(&indep_hq, candidate.get(), true);
         if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill ||
             haz == hazard_fail_reorder_vmem_smem || haz == hazard_fail_reorder_sendmsg ||
             haz == hazard_fail_barrier || haz == hazard_fail_export)
            is_dependency = true;
         else if (haz != hazard_success)
            break;
      }

      is_dependency |= !found_dependency && !ctx.mv.upwards_check_deps();
      if (is_dependency) {
         if (!found_dependency) {
            ctx.mv.upwards_set_insert_idx(candidate_idx);
            init_hazard_query(&indep_hq);
            found_dependency = true;
         }
      } else if (is_vmem) {
         /* don't move up dependencies of other VMEM instructions */
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.mv.depends_on[def.tempId()] = true;
         }
      }

      if (is_dependency || !found_dependency) {
         if (found_dependency)
            add_to_hazard_query(&indep_hq, candidate.get());
         ctx.mv.upwards_skip();
         continue;
      }

      MoveResult res = ctx.mv.upwards_move();
      if (res == move_fail_ssa || res == move_fail_rar) {
         add_to_hazard_query(&indep_hq, candidate.get());
         ctx.mv.upwards_skip();
         continue;
      } else if (res == move_fail_pressure) {
         break;
      }
      k++;
   }
}

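/* Tries to let the position export "current" execute as early as possible by
 * moving independent instructions that precede it to below it. */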
void schedule_position_export(sched_ctx& ctx, Block* block,
                              std::vector<RegisterDemand>& register_demand,
                              Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = POS_EXP_WINDOW_SIZE;
   int max_moves = POS_EXP_MAX_MOVES;
   int16_t k = 0;

   ctx.mv.downwards_init(idx, true, false);

   hazard_query hq;
   init_hazard_query(&hq);
   add_to_hazard_query(&hq, current);

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
      if (candidate->isVMEM() || candidate->format == Format::SMEM || candidate->isFlatOrGlobal())
         break;

      HazardResult haz = perform_hazard_query(&hq, candidate.get(), false);
      if (haz == hazard_fail_exec || haz == hazard_fail_unreorderable)
         break;

      if (haz != hazard_success) {
         add_to_hazard_query(&hq, candidate.get());
         ctx.mv.downwards_skip();
         continue;
      }

      MoveResult res = ctx.mv.downwards_move(false);
      if (res == move_fail_ssa || res == move_fail_rar) {
         add_to_hazard_query(&hq, candidate.get());
         ctx.mv.downwards_skip();
         continue;
      } else if (res == move_fail_pressure) {
         break;
      }
      k++;
   }
}

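/* Runs the memory-instruction scheduling passes over a single block and then
 * recomputes the block's register demand from the updated per-instruction
 * live-range information. */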
void schedule_block(sched_ctx& ctx, Program *program, Block* block, live& live_vars)
{
   ctx.last_SMEM_dep_idx = 0;
   ctx.last_SMEM_stall = INT16_MIN;
   ctx.mv.block = block;
   ctx.mv.register_demand = live_vars.register_demand[block->index].data();

   /* go through all instructions and find memory loads */
   for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
      Instruction* current = block->instructions[idx].get();

      if (current->definitions.empty())
         continue;

      if (current->isVMEM() || current->isFlatOrGlobal()) {
         ctx.mv.current = current;
         schedule_VMEM(ctx, block, live_vars.register_demand[block->index], current, idx);
      }

      if (current->format == Format::SMEM) {
         ctx.mv.current = current;
         schedule_SMEM(ctx, block, live_vars.register_demand[block->index], current, idx);
      }
   }

   if ((program->stage & (hw_vs | hw_ngg_gs)) && (block->kind & block_kind_export_end)) {
      /* Try to move position exports as far up as possible, to reduce register
       * usage and because ISA reference guides say so. */
      for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
         Instruction* current = block->instructions[idx].get();

         if (current->format == Format::EXP) {
            unsigned target = static_cast<Export_instruction*>(current)->dest;
            if (target >= V_008DFC_SQ_EXP_POS && target < V_008DFC_SQ_EXP_PARAM) {
               ctx.mv.current = current;
               schedule_position_export(ctx, block, live_vars.register_demand[block->index], current, idx);
            }
         }
      }
   }

   /* resummarize the block's register demand */
   block->register_demand = RegisterDemand();
   for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
      block->register_demand.update(live_vars.register_demand[block->index][idx]);
   }
}

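/* Entry point: picks a wave-count target (and thus a register budget) based on
 * the program's current register demand, schedules every block, and then
 * updates the program's VGPR/SGPR demand to reflect the new schedule. */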
void schedule_program(Program *program, live& live_vars)
{
   /* don't use program->max_reg_demand because that is affected by max_waves_per_simd */
   RegisterDemand demand;
   for (Block& block : program->blocks)
      demand.update(block.register_demand);

   sched_ctx ctx;
   ctx.mv.depends_on.resize(program->peekAllocationId());
   ctx.mv.RAR_dependencies.resize(program->peekAllocationId());
   ctx.mv.RAR_dependencies_clause.resize(program->peekAllocationId());
   /* Allowing the scheduler to reduce the number of waves to as low as 5
    * improves performance of Thrones of Britannia significantly and doesn't
    * seem to hurt anything else. */
   if (program->num_waves <= 5)
      ctx.num_waves = program->num_waves;
   else if (demand.vgpr >= 29)
      ctx.num_waves = 5;
   else if (demand.vgpr >= 25)
      ctx.num_waves = 6;
   else
      ctx.num_waves = 7;
   ctx.num_waves = std::max<uint16_t>(ctx.num_waves, program->min_waves);
   ctx.num_waves = std::min<uint16_t>(ctx.num_waves, program->max_waves);

   assert(ctx.num_waves > 0 && ctx.num_waves <= program->num_waves);
   ctx.mv.max_registers = { int16_t(get_addr_vgpr_from_waves(program, ctx.num_waves) - 2),
                            int16_t(get_addr_sgpr_from_waves(program, ctx.num_waves))};

   for (Block& block : program->blocks)
      schedule_block(ctx, program, &block, live_vars);

   /* update max_reg_demand and num_waves */
   RegisterDemand new_demand;
   for (Block& block : program->blocks) {
      new_demand.update(block.register_demand);
   }
   update_vgpr_sgpr_demand(program, new_demand);

   /* if enabled, this code asserts that register_demand is updated correctly */
#if 0
   int prev_num_waves = program->num_waves;
   const RegisterDemand prev_max_demand = program->max_reg_demand;

   std::vector<RegisterDemand> demands(program->blocks.size());
   for (unsigned j = 0; j < program->blocks.size(); j++) {
      demands[j] = program->blocks[j].register_demand;
   }

   struct radv_nir_compiler_options options;
   options.chip_class = program->chip_class;
   live live_vars2 = aco::live_var_analysis(program, &options);

   for (unsigned j = 0; j < program->blocks.size(); j++) {
      Block &b = program->blocks[j];
      for (unsigned i = 0; i < b.instructions.size(); i++)
         assert(live_vars.register_demand[b.index][i] == live_vars2.register_demand[b.index][i]);
      assert(b.register_demand == demands[j]);
   }

   assert(program->max_reg_demand == prev_max_demand);
   assert(program->num_waves == prev_num_waves);
#endif
}

}