/*
 * Copyright © 2018 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "aco_ir.h"
#include "aco_builder.h"
#include <unordered_set>
#include <algorithm>

#include "vulkan/radv_shader.h" // for radv_nir_compiler_options
#include "amdgfxregs.h"

#define SMEM_WINDOW_SIZE (350 - ctx.num_waves * 35)
#define VMEM_WINDOW_SIZE (1024 - ctx.num_waves * 64)
#define POS_EXP_WINDOW_SIZE 512
#define SMEM_MAX_MOVES (64 - ctx.num_waves * 4)
#define VMEM_MAX_MOVES (128 - ctx.num_waves * 8)
/* creating clauses decreases def-use distances, so make it less aggressive the lower num_waves is */
#define VMEM_CLAUSE_MAX_GRAB_DIST ((ctx.num_waves - 1) * 8)
#define POS_EXP_MAX_MOVES 512
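
/* Note: these heuristics expand ctx.num_waves from a sched_ctx named `ctx` in
 * the enclosing scope. The search windows and move budgets shrink as the
 * targeted wave count grows, while the clause grab distance grows with it
 * (see the comment above). */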

namespace aco {

enum MoveResult {
   move_success,
   move_fail_ssa,
   move_fail_rar,
   move_fail_pressure,
};

struct MoveState {
   RegisterDemand max_registers;

   Block *block;
   Instruction *current;
   RegisterDemand *register_demand;
   bool improved_rar;

   int source_idx;
   std::vector<bool> depends_on;
   /* Two are needed because, for downwards VMEM scheduling, one needs to
    * exclude the instructions in the clause, since new instructions in the
    * clause are not moved past any other instructions in the clause. */
   std::vector<bool> RAR_dependencies;
   std::vector<bool> RAR_dependencies_clause;

   int insert_idx, insert_idx_clause;
   RegisterDemand total_demand, total_demand_clause;

   /* for moving instructions before the current instruction to after it */
   void downwards_init(int current_idx, bool improved_rar, bool may_form_clauses);
   MoveResult downwards_move(bool clause);
   void downwards_skip();

   /* for moving instructions after the first use of the current instruction upwards */
   void upwards_init(int source_idx, bool improved_rar);
   bool upwards_check_deps();
   void upwards_set_insert_idx(int before);
   MoveResult upwards_move();
   void upwards_skip();

private:
   void downwards_advance_helper();
};

struct sched_ctx {
   MoveState mv;
   int16_t num_waves;
   int16_t last_SMEM_stall;
   int last_SMEM_dep_idx;
};

/* This scheduler is a simple bottom-up pass based on ideas from
 * "A Novel Lightweight Instruction Scheduling Algorithm for Just-In-Time Compiler"
 * by Xiaohua Shi and Peng Guo.
 * The basic approach is to iterate over all instructions. When a memory instruction
 * is encountered, it tries to move independent instructions from above and below
 * between the memory instruction and its first user.
 * The novelty is that this scheduler cares about the current register pressure:
 * instructions will only be moved if the register pressure won't exceed a certain bound.
 */
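
/* A rough sketch of one scheduling step (not real ISA, just an illustration):
 *
 *    v_mul_f32 ...            <- independent: can be moved below the load
 *    s_load_dword s0, ...     <- current (the memory instruction)
 *    v_add_f32 ...            <- independent: can be moved above the first user
 *    v_sub_f32 v0, s0, ...    <- first user of s0: the upwards insert point
 *
 * Both directions increase the distance between the load and its first user,
 * giving the hardware more time to hide the memory latency. */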

template <typename T>
void move_element(T begin_it, size_t idx, size_t before) {
   if (idx < before) {
      auto begin = std::next(begin_it, idx);
      auto end = std::next(begin_it, before);
      std::rotate(begin, begin + 1, end);
   } else if (idx > before) {
      auto begin = std::next(begin_it, before);
      auto end = std::next(begin_it, idx + 1);
      std::rotate(begin, end - 1, end);
   }
}
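
/* Example: for {a, b, c, d, e}, move_element(begin, 3, 1) moves 'd' directly
 * in front of 'b', giving {a, d, b, c, e}, while move_element(begin, 1, 4)
 * moves 'b' directly in front of 'e', giving {a, c, d, b, e}. The relative
 * order of all other elements is preserved, which is why std::rotate is used. */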

void MoveState::downwards_advance_helper()
{
   source_idx--;
   total_demand.update(register_demand[source_idx]);
}

void MoveState::downwards_init(int current_idx, bool improved_rar_, bool may_form_clauses)
{
   improved_rar = improved_rar_;
   source_idx = current_idx;

   insert_idx = current_idx + 1;
   insert_idx_clause = current_idx;

   total_demand = total_demand_clause = register_demand[current_idx];

   std::fill(depends_on.begin(), depends_on.end(), false);
   if (improved_rar) {
      std::fill(RAR_dependencies.begin(), RAR_dependencies.end(), false);
      if (may_form_clauses)
         std::fill(RAR_dependencies_clause.begin(), RAR_dependencies_clause.end(), false);
   }

   for (const Operand& op : current->operands) {
      if (op.isTemp()) {
         depends_on[op.tempId()] = true;
         if (improved_rar && op.isFirstKill())
            RAR_dependencies[op.tempId()] = true;
      }
   }

   /* update total_demand/source_idx */
   downwards_advance_helper();
}
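
/* After downwards_init(), source_idx points at the instruction directly above
 * `current`. Moved candidates are inserted at insert_idx (directly below
 * `current`), while clause members are inserted at insert_idx_clause (directly
 * above it), so the clause stays contiguous with `current`. */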

MoveResult MoveState::downwards_move(bool clause)
{
   aco_ptr<Instruction>& instr = block->instructions[source_idx];

   for (const Definition& def : instr->definitions)
      if (def.isTemp() && depends_on[def.tempId()])
         return move_fail_ssa;

   /* check if one of candidate's operands is killed by depending instruction */
   std::vector<bool>& RAR_deps = improved_rar ? (clause ? RAR_dependencies_clause : RAR_dependencies) : depends_on;
   for (const Operand& op : instr->operands) {
      if (op.isTemp() && RAR_deps[op.tempId()]) {
         // FIXME: account for difference in register pressure
         return move_fail_rar;
      }
   }

   if (clause) {
      for (const Operand& op : instr->operands) {
         if (op.isTemp()) {
            depends_on[op.tempId()] = true;
            if (op.isFirstKill())
               RAR_dependencies[op.tempId()] = true;
         }
      }
   }

   int dest_insert_idx = clause ? insert_idx_clause : insert_idx;
   RegisterDemand register_pressure = clause ? total_demand_clause : total_demand;

   const RegisterDemand candidate_diff = get_live_changes(instr);
   const RegisterDemand temp = get_temp_registers(instr);
   if (RegisterDemand(register_pressure - candidate_diff).exceeds(max_registers))
      return move_fail_pressure;
   const RegisterDemand temp2 = get_temp_registers(block->instructions[dest_insert_idx - 1]);
   const RegisterDemand new_demand = register_demand[dest_insert_idx - 1] - temp2 + temp;
   if (new_demand.exceeds(max_registers))
      return move_fail_pressure;

   /* move the candidate below the memory load */
   move_element(block->instructions.begin(), source_idx, dest_insert_idx);

   /* update register pressure */
   move_element(register_demand, source_idx, dest_insert_idx);
   for (int i = source_idx; i < dest_insert_idx - 1; i++)
      register_demand[i] -= candidate_diff;
   register_demand[dest_insert_idx - 1] = new_demand;
   total_demand_clause -= candidate_diff;
   insert_idx_clause--;
   if (!clause) {
      total_demand -= candidate_diff;
      insert_idx--;
   }

   downwards_advance_helper();
   return move_success;
}
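
/* Note that downwards_move() succeeds only if the demand stays within
 * max_registers both across the region the candidate moves over
 * (register_pressure - candidate_diff) and at the insertion point itself
 * (new_demand); otherwise the caller leaves the candidate where it is. */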

void MoveState::downwards_skip()
{
   aco_ptr<Instruction>& instr = block->instructions[source_idx];

   for (const Operand& op : instr->operands) {
      if (op.isTemp()) {
         depends_on[op.tempId()] = true;
         if (improved_rar && op.isFirstKill()) {
            RAR_dependencies[op.tempId()] = true;
            RAR_dependencies_clause[op.tempId()] = true;
         }
      }
   }
   total_demand_clause.update(register_demand[source_idx]);

   downwards_advance_helper();
}

void MoveState::upwards_init(int source_idx_, bool improved_rar_)
{
   source_idx = source_idx_;
   improved_rar = improved_rar_;

   insert_idx = -1;

   std::fill(depends_on.begin(), depends_on.end(), false);
   std::fill(RAR_dependencies.begin(), RAR_dependencies.end(), false);

   for (const Definition& def : current->definitions) {
      if (def.isTemp())
         depends_on[def.tempId()] = true;
   }
}

bool MoveState::upwards_check_deps()
{
   aco_ptr<Instruction>& instr = block->instructions[source_idx];
   for (const Operand& op : instr->operands) {
      if (op.isTemp() && depends_on[op.tempId()])
         return false;
   }
   return true;
}

void MoveState::upwards_set_insert_idx(int before)
{
   insert_idx = before;
   total_demand = register_demand[before - 1];
}

MoveResult MoveState::upwards_move()
{
   assert(insert_idx >= 0);

   aco_ptr<Instruction>& instr = block->instructions[source_idx];
   for (const Operand& op : instr->operands) {
      if (op.isTemp() && depends_on[op.tempId()])
         return move_fail_ssa;
   }

   /* check if candidate uses/kills an operand which is used by a dependency */
   for (const Operand& op : instr->operands) {
      if (op.isTemp() && (!improved_rar || op.isFirstKill()) && RAR_dependencies[op.tempId()])
         return move_fail_rar;
   }

   /* check if register pressure is low enough: the diff is negative if register pressure is decreased */
   const RegisterDemand candidate_diff = get_live_changes(instr);
   const RegisterDemand temp = get_temp_registers(instr);
   if (RegisterDemand(total_demand + candidate_diff).exceeds(max_registers))
      return move_fail_pressure;
   const RegisterDemand temp2 = get_temp_registers(block->instructions[insert_idx - 1]);
   const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + candidate_diff + temp;
   if (new_demand.exceeds(max_registers))
      return move_fail_pressure;

   /* move the candidate above the insert_idx */
   move_element(block->instructions.begin(), source_idx, insert_idx);

   /* update register pressure */
   move_element(register_demand, source_idx, insert_idx);
   for (int i = insert_idx + 1; i <= source_idx; i++)
      register_demand[i] += candidate_diff;
   register_demand[insert_idx] = new_demand;
   total_demand += candidate_diff;

   insert_idx++;

   total_demand.update(register_demand[source_idx]);
   source_idx++;

   return move_success;
}

void MoveState::upwards_skip()
{
   if (insert_idx >= 0) {
      aco_ptr<Instruction>& instr = block->instructions[source_idx];
      for (const Definition& def : instr->definitions) {
         if (def.isTemp())
            depends_on[def.tempId()] = true;
      }
      for (const Operand& op : instr->operands) {
         if (op.isTemp())
            RAR_dependencies[op.tempId()] = true;
      }
      total_demand.update(register_demand[source_idx]);
   }

   source_idx++;
}

bool can_reorder(Instruction* candidate)
{
   switch (candidate->format) {
   case Format::SMEM:
      return static_cast<SMEM_instruction*>(candidate)->can_reorder;
   case Format::MUBUF:
      return static_cast<MUBUF_instruction*>(candidate)->can_reorder;
   case Format::MIMG:
      return static_cast<MIMG_instruction*>(candidate)->can_reorder;
   case Format::MTBUF:
      return static_cast<MTBUF_instruction*>(candidate)->can_reorder;
   case Format::FLAT:
   case Format::GLOBAL:
   case Format::SCRATCH:
      return static_cast<FLAT_instruction*>(candidate)->can_reorder;
   default:
      return true;
   }
}

bool is_gs_or_done_sendmsg(Instruction *instr)
{
   if (instr->opcode == aco_opcode::s_sendmsg) {
      uint16_t imm = static_cast<SOPP_instruction*>(instr)->imm;
      return (imm & sendmsg_id_mask) == _sendmsg_gs ||
             (imm & sendmsg_id_mask) == _sendmsg_gs_done;
   }
   return false;
}

bool is_done_sendmsg(Instruction *instr)
{
   if (instr->opcode == aco_opcode::s_sendmsg) {
      uint16_t imm = static_cast<SOPP_instruction*>(instr)->imm;
      return (imm & sendmsg_id_mask) == _sendmsg_gs_done;
   }
   return false;
}

barrier_interaction get_barrier_interaction(Instruction* instr)
{
   switch (instr->format) {
   case Format::SMEM:
      return static_cast<SMEM_instruction*>(instr)->barrier;
   case Format::MUBUF:
      return static_cast<MUBUF_instruction*>(instr)->barrier;
   case Format::MIMG:
      return static_cast<MIMG_instruction*>(instr)->barrier;
   case Format::MTBUF:
      return static_cast<MTBUF_instruction*>(instr)->barrier;
   case Format::FLAT:
   case Format::GLOBAL:
   case Format::SCRATCH:
      return static_cast<FLAT_instruction*>(instr)->barrier;
   case Format::DS:
      return barrier_shared;
   case Format::SOPP:
      if (is_done_sendmsg(instr))
         return (barrier_interaction)(barrier_gs_data | barrier_gs_sendmsg);
      else if (is_gs_or_done_sendmsg(instr))
         return barrier_gs_sendmsg;
      else
         return barrier_none;
   case Format::PSEUDO_BARRIER:
      return barrier_barrier;
   default:
      return barrier_none;
   }
}

barrier_interaction parse_barrier(Instruction *instr)
{
   if (instr->format == Format::PSEUDO_BARRIER) {
      switch (instr->opcode) {
      case aco_opcode::p_memory_barrier_atomic:
         return barrier_atomic;
      /* For now, buffer and image barriers are treated the same. This is because of
       * dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.device.payload_nonlocal.buffer.guard_nonlocal.image.comp
       * which seems to use an image load to determine if the result of a buffer load is valid. So the ordering of the two loads is important.
       * I /think/ we should probably eventually expand the meaning of a buffer barrier so that all buffer operations before it must stay before it
       * and that both image and buffer operations after it must stay after it. We should also do the same for image barriers.
       * Or perhaps the problem is that we don't have a combined barrier instruction for both buffers and images, but the CTS test expects us to?
       * Either way, this solution should work. */
      case aco_opcode::p_memory_barrier_buffer:
      case aco_opcode::p_memory_barrier_image:
         return (barrier_interaction)(barrier_image | barrier_buffer);
      case aco_opcode::p_memory_barrier_shared:
         return barrier_shared;
      case aco_opcode::p_memory_barrier_common:
         return (barrier_interaction)(barrier_image | barrier_buffer | barrier_shared | barrier_atomic);
      case aco_opcode::p_memory_barrier_gs_data:
         return barrier_gs_data;
      case aco_opcode::p_memory_barrier_gs_sendmsg:
         return barrier_gs_sendmsg;
      default:
         break;
      }
   } else if (instr->opcode == aco_opcode::s_barrier) {
      return (barrier_interaction)(barrier_barrier | barrier_image | barrier_buffer | barrier_shared | barrier_atomic);
   }
   return barrier_none;
}
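
/* Note the asymmetry between the two queries above: parse_barrier() returns
 * the memory classes whose ordering a barrier instruction enforces, while
 * get_barrier_interaction() returns the classes an instruction itself
 * participates in. perform_hazard_query() below checks both directions, so
 * neither can a barrier move past an affected access, nor an access past an
 * affecting barrier. */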

struct hazard_query {
   bool contains_spill;
   int barriers;
   int barrier_interaction;
   bool can_reorder_vmem;
   bool can_reorder_smem;
};

void init_hazard_query(hazard_query *query) {
   query->contains_spill = false;
   query->barriers = 0;
   query->barrier_interaction = 0;
   query->can_reorder_vmem = true;
   query->can_reorder_smem = true;
}

void add_to_hazard_query(hazard_query *query, Instruction *instr)
{
   query->barriers |= parse_barrier(instr);
   query->barrier_interaction |= get_barrier_interaction(instr);
   if (instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload)
      query->contains_spill = true;

   bool can_reorder_instr = can_reorder(instr);
   query->can_reorder_smem &= instr->format != Format::SMEM || can_reorder_instr;
   query->can_reorder_vmem &= !(instr->isVMEM() || instr->isFlatOrGlobal()) || can_reorder_instr;
}

enum HazardResult {
   hazard_success,
   hazard_fail_reorder_vmem_smem,
   hazard_fail_reorder_ds,
   hazard_fail_reorder_sendmsg,
   hazard_fail_spill,
   hazard_fail_export,
   hazard_fail_barrier,
   /* Must stop at these failures. The hazard query code doesn't consider them
    * when added. */
   hazard_fail_exec,
   hazard_fail_unreorderable,
};

HazardResult perform_hazard_query(hazard_query *query, Instruction *instr)
{
   bool can_reorder_candidate = can_reorder(instr);

   if (instr->opcode == aco_opcode::p_exit_early_if)
      return hazard_fail_exec;
   for (const Definition& def : instr->definitions) {
      if (def.isFixed() && def.physReg() == exec)
         return hazard_fail_exec;
   }

   /* don't move exports so that they stay closer together */
   if (instr->format == Format::EXP)
      return hazard_fail_export;

   /* don't move non-reorderable instructions */
   if (instr->opcode == aco_opcode::s_memtime ||
       instr->opcode == aco_opcode::s_memrealtime ||
       instr->opcode == aco_opcode::s_setprio)
      return hazard_fail_unreorderable;

   if (query->barrier_interaction && (query->barrier_interaction & parse_barrier(instr)))
      return hazard_fail_barrier;
   if (query->barriers && (query->barriers & get_barrier_interaction(instr)))
      return hazard_fail_barrier;

   if (!query->can_reorder_smem && instr->format == Format::SMEM && !can_reorder_candidate)
      return hazard_fail_reorder_vmem_smem;
   if (!query->can_reorder_vmem && (instr->isVMEM() || instr->isFlatOrGlobal()) && !can_reorder_candidate)
      return hazard_fail_reorder_vmem_smem;
   if ((query->barrier_interaction & barrier_shared) && instr->format == Format::DS)
      return hazard_fail_reorder_ds;
   if (is_gs_or_done_sendmsg(instr) && (query->barrier_interaction & get_barrier_interaction(instr)))
      return hazard_fail_reorder_sendmsg;

   if ((instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload) &&
       query->contains_spill)
      return hazard_fail_spill;

   return hazard_success;
}
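
/* Callers treat most hazards as "soft" failures: the candidate is added to
 * the hazard query and skipped rather than moved. hazard_fail_exec and
 * hazard_fail_unreorderable are "hard" failures that abort the scan entirely,
 * because the hazard query does not track them for later candidates. */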

void schedule_SMEM(sched_ctx& ctx, Block* block,
                   std::vector<RegisterDemand>& register_demand,
                   Instruction* current, int idx)
{
   int window_size = SMEM_WINDOW_SIZE;
   int max_moves = SMEM_MAX_MOVES;
   int16_t k = 0;

   /* don't move s_memtime/s_memrealtime */
   if (current->opcode == aco_opcode::s_memtime || current->opcode == aco_opcode::s_memrealtime)
      return;

   /* first, check if we have instructions before current to move down */
   hazard_query hq;
   init_hazard_query(&hq);
   add_to_hazard_query(&hq, current);

   ctx.mv.downwards_init(idx, false, false);

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      assert(candidate_idx == ctx.mv.source_idx);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      /* break if we'd make the previous SMEM instruction stall */
      bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
      if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0)
         break;

      /* break when encountering another MEM instruction, logical_start or barriers */
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
      if (candidate->isVMEM())
         break;

      bool can_move_down = true;

      HazardResult haz = perform_hazard_query(&hq, candidate.get());
      if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill ||
          haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier ||
          haz == hazard_fail_export)
         can_move_down = false;
      else if (haz != hazard_success)
         break;

      /* don't use LDS/GDS instructions to hide latency since it can
       * significantly worsen LDS scheduling */
      if (candidate->format == Format::DS || !can_move_down) {
         add_to_hazard_query(&hq, candidate.get());
         ctx.mv.downwards_skip();
         continue;
      }

      MoveResult res = ctx.mv.downwards_move(false);
      if (res == move_fail_ssa || res == move_fail_rar) {
         add_to_hazard_query(&hq, candidate.get());
         ctx.mv.downwards_skip();
         continue;
      } else if (res == move_fail_pressure) {
         break;
      }

      if (candidate_idx < ctx.last_SMEM_dep_idx)
         ctx.last_SMEM_stall++;
      k++;
   }

   /* find the first instruction depending on current or find another MEM */
   ctx.mv.upwards_init(idx + 1, false);

   bool found_dependency = false;
   /* second, check if we have instructions after current to move up */
   for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) {
      assert(candidate_idx == ctx.mv.source_idx);
      assert(candidate_idx < (int) block->instructions.size());
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      if (candidate->opcode == aco_opcode::p_logical_end)
         break;

      /* check if candidate depends on current */
      bool is_dependency = !found_dependency && !ctx.mv.upwards_check_deps();
      /* no need to steal from following VMEM instructions */
      if (is_dependency && candidate->isVMEM())
         break;

      if (found_dependency) {
         HazardResult haz = perform_hazard_query(&hq, candidate.get());
         if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill ||
             haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier ||
             haz == hazard_fail_export)
            is_dependency = true;
         else if (haz != hazard_success)
            break;
      }

      if (is_dependency) {
         if (!found_dependency) {
            ctx.mv.upwards_set_insert_idx(candidate_idx);
            init_hazard_query(&hq);
            found_dependency = true;
         }
      }

      if (is_dependency || !found_dependency) {
         if (found_dependency)
            add_to_hazard_query(&hq, candidate.get());
         else
            k++;
         ctx.mv.upwards_skip();
         continue;
      }

      MoveResult res = ctx.mv.upwards_move();
      if (res == move_fail_ssa || res == move_fail_rar) {
         /* no need to steal from following VMEM instructions */
         if (res == move_fail_ssa && candidate->isVMEM())
            break;
         add_to_hazard_query(&hq, candidate.get());
         ctx.mv.upwards_skip();
         continue;
      } else if (res == move_fail_pressure) {
         break;
      }
      k++;
   }

   ctx.last_SMEM_dep_idx = found_dependency ? ctx.mv.insert_idx : 0;
   ctx.last_SMEM_stall = 10 - ctx.num_waves - k;
}
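
/* last_SMEM_stall estimates how close the most recent SMEM load is to
 * stalling its first user: it starts at 10 - num_waves - k and is incremented
 * each time later scheduling removes an instruction from that load's
 * latency-hiding window. Once it reaches >= 0, the can_stall_prev_smem checks
 * stop further shrinking of the window. */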

void schedule_VMEM(sched_ctx& ctx, Block* block,
                   std::vector<RegisterDemand>& register_demand,
                   Instruction* current, int idx)
{
   int window_size = VMEM_WINDOW_SIZE;
   int max_moves = VMEM_MAX_MOVES;
   int clause_max_grab_dist = VMEM_CLAUSE_MAX_GRAB_DIST;
   int16_t k = 0;

   /* first, check if we have instructions before current to move down */
   hazard_query indep_hq;
   hazard_query clause_hq;
   init_hazard_query(&indep_hq);
   init_hazard_query(&clause_hq);
   add_to_hazard_query(&indep_hq, current);

   ctx.mv.downwards_init(idx, true, true);
= idx
- 1; k
< max_moves
&& candidate_idx
> (int) idx
- window_size
; candidate_idx
--) {
658 assert(candidate_idx
== ctx
.mv
.source_idx
);
659 assert(candidate_idx
>= 0);
660 aco_ptr
<Instruction
>& candidate
= block
->instructions
[candidate_idx
];
661 bool is_vmem
= candidate
->isVMEM() || candidate
->isFlatOrGlobal();
663 /* break when encountering another VMEM instruction, logical_start or barriers */
664 if (candidate
->opcode
== aco_opcode::p_logical_start
)
667 /* break if we'd make the previous SMEM instruction stall */
668 bool can_stall_prev_smem
= idx
<= ctx
.last_SMEM_dep_idx
&& candidate_idx
< ctx
.last_SMEM_dep_idx
;
669 if (can_stall_prev_smem
&& ctx
.last_SMEM_stall
>= 0)
672 bool part_of_clause
= false;
673 if (current
->isVMEM() == candidate
->isVMEM()) {
674 bool same_resource
= true;
675 if (current
->isVMEM())
676 same_resource
= candidate
->operands
[0].tempId() == current
->operands
[0].tempId();
677 int grab_dist
= ctx
.mv
.insert_idx_clause
- candidate_idx
;
678 /* We can't easily tell how much this will decrease the def-to-use
679 * distances, so just use how far it will be moved as a heuristic. */
680 part_of_clause
= same_resource
&& grab_dist
< clause_max_grab_dist
;
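
      /* Forming a clause keeps memory accesses to the same resource adjacent,
       * letting the hardware issue them back-to-back as a single memory
       * clause. The cost is that instructions pulled into the clause lose
       * their own latency-hiding distance, hence the grab-distance limit
       * above. */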

      /* if current depends on candidate, add additional dependencies and continue */
      bool can_move_down = !is_vmem || part_of_clause;

      HazardResult haz = perform_hazard_query(part_of_clause ? &clause_hq : &indep_hq, candidate.get());
      if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill ||
          haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier ||
          haz == hazard_fail_export)
         can_move_down = false;
      else if (haz != hazard_success)
         break;

      if (!can_move_down) {
         add_to_hazard_query(&indep_hq, candidate.get());
         add_to_hazard_query(&clause_hq, candidate.get());
         ctx.mv.downwards_skip();
         continue;
      }

      MoveResult res = ctx.mv.downwards_move(part_of_clause);
      if (res == move_fail_ssa || res == move_fail_rar) {
         add_to_hazard_query(&indep_hq, candidate.get());
         add_to_hazard_query(&clause_hq, candidate.get());
         ctx.mv.downwards_skip();
         continue;
      } else if (res == move_fail_pressure) {
         break;
      }

      if (candidate_idx < ctx.last_SMEM_dep_idx)
         ctx.last_SMEM_stall++;
      k++;
   }

   /* find the first instruction depending on current or find another VMEM */
   ctx.mv.upwards_init(idx + 1, true);

   bool found_dependency = false;
   /* second, check if we have instructions after current to move up */
   for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) {
      assert(candidate_idx == ctx.mv.source_idx);
      assert(candidate_idx < (int) block->instructions.size());
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
      bool is_vmem = candidate->isVMEM() || candidate->isFlatOrGlobal();

      if (candidate->opcode == aco_opcode::p_logical_end)
         break;

      /* check if candidate depends on current */
      bool is_dependency = false;
      if (found_dependency) {
         HazardResult haz = perform_hazard_query(&indep_hq, candidate.get());
         if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill ||
             haz == hazard_fail_reorder_vmem_smem || haz == hazard_fail_reorder_sendmsg ||
             haz == hazard_fail_barrier || haz == hazard_fail_export)
            is_dependency = true;
         else if (haz != hazard_success)
            break;
      }

      is_dependency |= !found_dependency && !ctx.mv.upwards_check_deps();
      if (is_dependency) {
         if (!found_dependency) {
            ctx.mv.upwards_set_insert_idx(candidate_idx);
            init_hazard_query(&indep_hq);
            found_dependency = true;
         }
      } else if (is_vmem) {
         /* don't move up dependencies of other VMEM instructions */
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.mv.depends_on[def.tempId()] = true;
         }
      }

      if (is_dependency || !found_dependency) {
         if (found_dependency)
            add_to_hazard_query(&indep_hq, candidate.get());
         ctx.mv.upwards_skip();
         continue;
      }

      MoveResult res = ctx.mv.upwards_move();
      if (res == move_fail_ssa || res == move_fail_rar) {
         add_to_hazard_query(&indep_hq, candidate.get());
         ctx.mv.upwards_skip();
         continue;
      } else if (res == move_fail_pressure) {
         break;
      }
      k++;
   }
}

void schedule_position_export(sched_ctx& ctx, Block* block,
                              std::vector<RegisterDemand>& register_demand,
                              Instruction* current, int idx)
{
   int window_size = POS_EXP_WINDOW_SIZE;
   int max_moves = POS_EXP_MAX_MOVES;
   int16_t k = 0;

   ctx.mv.downwards_init(idx, true, false);

   hazard_query hq;
   init_hazard_query(&hq);
   add_to_hazard_query(&hq, current);

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
      if (candidate->isVMEM() || candidate->format == Format::SMEM || candidate->isFlatOrGlobal())
         break;

      HazardResult haz = perform_hazard_query(&hq, candidate.get());
      if (haz == hazard_fail_exec || haz == hazard_fail_unreorderable)
         break;

      if (haz != hazard_success) {
         add_to_hazard_query(&hq, candidate.get());
         ctx.mv.downwards_skip();
         continue;
      }

      MoveResult res = ctx.mv.downwards_move(false);
      if (res == move_fail_ssa || res == move_fail_rar) {
         add_to_hazard_query(&hq, candidate.get());
         ctx.mv.downwards_skip();
         continue;
      } else if (res == move_fail_pressure) {
         break;
      }
      k++;
   }
}

void schedule_block(sched_ctx& ctx, Program *program, Block* block, live& live_vars)
{
   ctx.last_SMEM_dep_idx = 0;
   ctx.last_SMEM_stall = INT16_MIN;
   ctx.mv.block = block;
   ctx.mv.register_demand = live_vars.register_demand[block->index].data();

   /* go through all instructions and find memory loads */
   for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
      Instruction* current = block->instructions[idx].get();

      if (current->definitions.empty())
         continue;

      if (current->isVMEM() || current->isFlatOrGlobal()) {
         ctx.mv.current = current;
         schedule_VMEM(ctx, block, live_vars.register_demand[block->index], current, idx);
      }

      if (current->format == Format::SMEM) {
         ctx.mv.current = current;
         schedule_SMEM(ctx, block, live_vars.register_demand[block->index], current, idx);
      }
   }

   if ((program->stage & (hw_vs | hw_ngg_gs)) && (block->kind & block_kind_export_end)) {
      /* Try to move position exports as far up as possible, to reduce register
       * usage and because ISA reference guides say so. */
      for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
         Instruction* current = block->instructions[idx].get();

         if (current->format == Format::EXP) {
            unsigned target = static_cast<Export_instruction*>(current)->dest;
            if (target >= V_008DFC_SQ_EXP_POS && target < V_008DFC_SQ_EXP_PARAM) {
               ctx.mv.current = current;
               schedule_position_export(ctx, block, live_vars.register_demand[block->index], current, idx);
            }
         }
      }
   }

   /* resummarize the block's register demand */
   block->register_demand = RegisterDemand();
   for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
      block->register_demand.update(live_vars.register_demand[block->index][idx]);
   }
}

void schedule_program(Program *program, live& live_vars)
{
   sched_ctx ctx;
   ctx.mv.depends_on.resize(program->peekAllocationId());
   ctx.mv.RAR_dependencies.resize(program->peekAllocationId());
   ctx.mv.RAR_dependencies_clause.resize(program->peekAllocationId());
   /* Allowing the scheduler to reduce the number of waves to as low as 5
    * improves performance of Thrones of Britannia significantly and doesn't
    * seem to hurt anything else. */
   if (program->num_waves <= 5)
      ctx.num_waves = program->num_waves;
   else if (program->max_reg_demand.vgpr >= 32)
      ctx.num_waves = 5;
   else if (program->max_reg_demand.vgpr >= 28)
      ctx.num_waves = 6;
   else if (program->max_reg_demand.vgpr >= 24)
      ctx.num_waves = 7;
   else
      ctx.num_waves = 8;
   ctx.num_waves = std::max<uint16_t>(ctx.num_waves, program->min_waves);

   assert(ctx.num_waves > 0 && ctx.num_waves <= program->num_waves);
   ctx.mv.max_registers = { int16_t(get_addr_vgpr_from_waves(program, ctx.num_waves) - 2),
                            int16_t(get_addr_sgpr_from_waves(program, ctx.num_waves))};
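
   /* max_registers is the ceiling the scheduler may grow demand towards; the
    * -2 on the VGPR side presumably leaves a little headroom rather than
    * scheduling exactly to the limit addressable at ctx.num_waves. */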

   for (Block& block : program->blocks)
      schedule_block(ctx, program, &block, live_vars);

   /* update max_reg_demand and num_waves */
   RegisterDemand new_demand;
   for (Block& block : program->blocks) {
      new_demand.update(block.register_demand);
   }
   update_vgpr_sgpr_demand(program, new_demand);

   /* if enabled, this code asserts that register_demand is updated correctly */
   #if 0
   int prev_num_waves = program->num_waves;
   const RegisterDemand prev_max_demand = program->max_reg_demand;

   std::vector<RegisterDemand> demands(program->blocks.size());
   for (unsigned j = 0; j < program->blocks.size(); j++) {
      demands[j] = program->blocks[j].register_demand;
   }

   struct radv_nir_compiler_options options;
   options.chip_class = program->chip_class;
   live live_vars2 = aco::live_var_analysis(program, &options);

   for (unsigned j = 0; j < program->blocks.size(); j++) {
      Block &b = program->blocks[j];
      for (unsigned i = 0; i < b.instructions.size(); i++)
         assert(live_vars.register_demand[b.index][i] == live_vars2.register_demand[b.index][i]);
      assert(b.register_demand == demands[j]);
   }

   assert(program->max_reg_demand == prev_max_demand);
   assert(program->num_waves == prev_num_waves);
   #endif
}

} /* end namespace aco */