/* mesa.git: src/amd/compiler/aco_scheduler.cpp (at commit cb22491bf64a1b5a2c58b553d591e26ec89c7cda) */
/*
 * Copyright © 2018 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include "aco_ir.h"
#include "aco_builder.h"
#include <unordered_set>
#include <algorithm>

#include "vulkan/radv_shader.h" // for radv_nir_compiler_options
#include "amdgfxregs.h"

#define SMEM_WINDOW_SIZE (350 - ctx.num_waves * 35)
#define VMEM_WINDOW_SIZE (1024 - ctx.num_waves * 64)
#define POS_EXP_WINDOW_SIZE 512
#define SMEM_MAX_MOVES (64 - ctx.num_waves * 4)
#define VMEM_MAX_MOVES (128 - ctx.num_waves * 8)
/* creating clauses decreases def-use distances, so make it less aggressive the lower num_waves is */
#define VMEM_CLAUSE_MAX_GRAB_DIST ((ctx.num_waves - 1) * 8)
#define POS_EXP_MAX_MOVES 512
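
/* For a sense of scale (illustrative arithmetic only): with ctx.num_waves == 8 the
 * macros above give a 70-instruction SMEM window with at most 32 moves, a
 * 512-instruction VMEM window with at most 64 moves and a clause grab distance of
 * (8 - 1) * 8 = 56; with ctx.num_waves == 5 they give 175/44, 704/88 and 32.
 * The fewer waves we schedule for, the more aggressively instructions may be
 * reordered, but the less eagerly clauses are formed. */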

namespace aco {

enum MoveResult {
   move_success,
   move_fail_ssa,
   move_fail_rar,
   move_fail_pressure,
};

struct MoveState {
   RegisterDemand max_registers;

   Block *block;
   Instruction *current;
   RegisterDemand *register_demand;
   bool improved_rar;

   std::vector<bool> depends_on;
   /* Two are needed because, for downwards VMEM scheduling, one needs to
    * exclude the instructions in the clause, since new instructions in the
    * clause are not moved past any other instructions in the clause. */
   std::vector<bool> RAR_dependencies;
   std::vector<bool> RAR_dependencies_clause;

   int source_idx;
   int insert_idx, insert_idx_clause;
   RegisterDemand total_demand, total_demand_clause;

   /* for moving instructions before the current instruction to after it */
   void downwards_init(int current_idx, bool improved_rar, bool may_form_clauses);
   MoveResult downwards_move(bool clause);
   void downwards_skip();

   /* for moving instructions after the first use of the current instruction upwards */
   void upwards_init(int source_idx, bool improved_rar);
   bool upwards_check_deps();
   void upwards_set_insert_idx(int before);
   MoveResult upwards_move();
   void upwards_skip();

private:
   void downwards_advance_helper();
};

struct sched_ctx {
   int16_t num_waves;
   int16_t last_SMEM_stall;
   int last_SMEM_dep_idx;
   MoveState mv;
};

/* This scheduler is a simple bottom-up pass based on ideas from
 * "A Novel Lightweight Instruction Scheduling Algorithm for Just-In-Time Compiler"
 * by Xiaohua Shi and Peng Guo.
 * The basic approach is to iterate over all instructions. When a memory instruction
 * is encountered, the pass tries to move independent instructions from above and below
 * into the gap between the memory instruction and its first user.
 * The novelty is that this scheduler takes the current register pressure into account:
 * instructions are only moved if the register pressure stays within a certain bound.
 */

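/* Illustrative example of the transformation described above (pseudo-IR, not
 * actual ACO syntax): given
 *
 *    v1 = v_add ...            ; independent of the load
 *    v0 = buffer_load ...      ; memory instruction being scheduled
 *    v2 = v_fma v0, ...        ; first user of v0
 *    v3 = v_mul ...            ; independent of v0 and v2
 *
 * the pass tries to produce
 *
 *    v0 = buffer_load ...
 *    v1 = v_add ...
 *    v3 = v_mul ...
 *    v2 = v_fma v0, ...
 *
 * growing the load-to-use distance from one to three instructions, provided the
 * register demand at every intermediate point stays within MoveState::max_registers. */
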
template <typename T>
void move_element(T begin_it, size_t idx, size_t before) {
   if (idx < before) {
      auto begin = std::next(begin_it, idx);
      auto end = std::next(begin_it, before);
      std::rotate(begin, begin + 1, end);
   } else if (idx > before) {
      auto begin = std::next(begin_it, before);
      auto end = std::next(begin_it, idx + 1);
      std::rotate(begin, end - 1, end);
   }
}

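/* A minimal usage sketch of move_element() (compiled out; it would additionally
 * need <vector> and <cassert>). Moving an element to a later position shifts the
 * elements in between one slot earlier, moving it to an earlier position shifts
 * them one slot later: */
#if 0
static void move_element_example()
{
   std::vector<char> v = {'A', 'B', 'C', 'D', 'E'};
   move_element(v.begin(), 1, 4); /* v becomes {A, C, D, B, E} */
   assert(v[3] == 'B');

   std::vector<char> w = {'A', 'B', 'C', 'D', 'E'};
   move_element(w.begin(), 3, 1); /* w becomes {A, D, B, C, E} */
   assert(w[1] == 'D');
}
#endif
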
void MoveState::downwards_advance_helper()
{
   source_idx--;
   total_demand.update(register_demand[source_idx]);
}

void MoveState::downwards_init(int current_idx, bool improved_rar_, bool may_form_clauses)
{
   improved_rar = improved_rar_;
   source_idx = current_idx;

   insert_idx = current_idx + 1;
   insert_idx_clause = current_idx;

   total_demand = total_demand_clause = register_demand[current_idx];

   std::fill(depends_on.begin(), depends_on.end(), false);
   if (improved_rar) {
      std::fill(RAR_dependencies.begin(), RAR_dependencies.end(), false);
      if (may_form_clauses)
         std::fill(RAR_dependencies_clause.begin(), RAR_dependencies_clause.end(), false);
   }

   for (const Operand& op : current->operands) {
      if (op.isTemp()) {
         depends_on[op.tempId()] = true;
         if (improved_rar && op.isFirstKill())
            RAR_dependencies[op.tempId()] = true;
      }
   }

   /* update total_demand/source_idx */
   downwards_advance_helper();
}

MoveResult MoveState::downwards_move(bool clause)
{
   aco_ptr<Instruction>& instr = block->instructions[source_idx];

   for (const Definition& def : instr->definitions)
      if (def.isTemp() && depends_on[def.tempId()])
         return move_fail_ssa;

   /* check if one of the candidate's operands is killed by a depending instruction */
   std::vector<bool>& RAR_deps = improved_rar ? (clause ? RAR_dependencies_clause : RAR_dependencies) : depends_on;
   for (const Operand& op : instr->operands) {
      if (op.isTemp() && RAR_deps[op.tempId()]) {
         // FIXME: account for difference in register pressure
         return move_fail_rar;
      }
   }

   if (clause) {
      for (const Operand& op : instr->operands) {
         if (op.isTemp()) {
            depends_on[op.tempId()] = true;
            if (op.isFirstKill())
               RAR_dependencies[op.tempId()] = true;
         }
      }
   }

   int dest_insert_idx = clause ? insert_idx_clause : insert_idx;
   RegisterDemand register_pressure = clause ? total_demand_clause : total_demand;

   const RegisterDemand candidate_diff = get_live_changes(instr);
   const RegisterDemand temp = get_temp_registers(instr);
   if (RegisterDemand(register_pressure - candidate_diff).exceeds(max_registers))
      return move_fail_pressure;
   const RegisterDemand temp2 = get_temp_registers(block->instructions[dest_insert_idx - 1]);
   const RegisterDemand new_demand = register_demand[dest_insert_idx - 1] - temp2 + temp;
   if (new_demand.exceeds(max_registers))
      return move_fail_pressure;

   /* move the candidate below the memory load */
   move_element(block->instructions.begin(), source_idx, dest_insert_idx);

   /* update register pressure */
   move_element(register_demand, source_idx, dest_insert_idx);
   for (int i = source_idx; i < dest_insert_idx - 1; i++)
      register_demand[i] -= candidate_diff;
   register_demand[dest_insert_idx - 1] = new_demand;
   total_demand_clause -= candidate_diff;
   insert_idx_clause--;
   if (!clause) {
      total_demand -= candidate_diff;
      insert_idx--;
   }

   downwards_advance_helper();
   return move_success;
}

void MoveState::downwards_skip()
{
   aco_ptr<Instruction>& instr = block->instructions[source_idx];

   for (const Operand& op : instr->operands) {
      if (op.isTemp()) {
         depends_on[op.tempId()] = true;
         if (improved_rar && op.isFirstKill()) {
            RAR_dependencies[op.tempId()] = true;
            RAR_dependencies_clause[op.tempId()] = true;
         }
      }
   }
   total_demand_clause.update(register_demand[source_idx]);

   downwards_advance_helper();
}

void MoveState::upwards_init(int source_idx_, bool improved_rar_)
{
   source_idx = source_idx_;
   improved_rar = improved_rar_;

   insert_idx = -1;

   std::fill(depends_on.begin(), depends_on.end(), false);
   std::fill(RAR_dependencies.begin(), RAR_dependencies.end(), false);

   for (const Definition& def : current->definitions) {
      if (def.isTemp())
         depends_on[def.tempId()] = true;
   }
}

bool MoveState::upwards_check_deps()
{
   aco_ptr<Instruction>& instr = block->instructions[source_idx];
   for (const Operand& op : instr->operands) {
      if (op.isTemp() && depends_on[op.tempId()])
         return false;
   }
   return true;
}

void MoveState::upwards_set_insert_idx(int before)
{
   insert_idx = before;
   total_demand = register_demand[before - 1];
}

MoveResult MoveState::upwards_move()
{
   assert(insert_idx >= 0);

   aco_ptr<Instruction>& instr = block->instructions[source_idx];
   for (const Operand& op : instr->operands) {
      if (op.isTemp() && depends_on[op.tempId()])
         return move_fail_ssa;
   }

   /* check if candidate uses/kills an operand which is used by a dependency */
   for (const Operand& op : instr->operands) {
      if (op.isTemp() && (!improved_rar || op.isFirstKill()) && RAR_dependencies[op.tempId()])
         return move_fail_rar;
   }

   /* check if register pressure is low enough: the diff is negative if register pressure is decreased */
   const RegisterDemand candidate_diff = get_live_changes(instr);
   const RegisterDemand temp = get_temp_registers(instr);
   if (RegisterDemand(total_demand + candidate_diff).exceeds(max_registers))
      return move_fail_pressure;
   const RegisterDemand temp2 = get_temp_registers(block->instructions[insert_idx - 1]);
   const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + candidate_diff + temp;
   if (new_demand.exceeds(max_registers))
      return move_fail_pressure;

   /* move the candidate above the insert_idx */
   move_element(block->instructions.begin(), source_idx, insert_idx);

   /* update register pressure */
   move_element(register_demand, source_idx, insert_idx);
   for (int i = insert_idx + 1; i <= source_idx; i++)
      register_demand[i] += candidate_diff;
   register_demand[insert_idx] = new_demand;
   total_demand += candidate_diff;

   insert_idx++;

   total_demand.update(register_demand[source_idx]);
   source_idx++;

   return move_success;
}

void MoveState::upwards_skip()
{
   if (insert_idx >= 0) {
      aco_ptr<Instruction>& instr = block->instructions[source_idx];
      for (const Definition& def : instr->definitions) {
         if (def.isTemp())
            depends_on[def.tempId()] = true;
      }
      for (const Operand& op : instr->operands) {
         if (op.isTemp())
            RAR_dependencies[op.tempId()] = true;
      }
      total_demand.update(register_demand[source_idx]);
   }

   source_idx++;
}

bool can_reorder(Instruction* candidate)
{
   switch (candidate->format) {
   case Format::SMEM:
      return static_cast<SMEM_instruction*>(candidate)->can_reorder;
   case Format::MUBUF:
      return static_cast<MUBUF_instruction*>(candidate)->can_reorder;
   case Format::MIMG:
      return static_cast<MIMG_instruction*>(candidate)->can_reorder;
   case Format::MTBUF:
      return static_cast<MTBUF_instruction*>(candidate)->can_reorder;
   case Format::FLAT:
   case Format::GLOBAL:
   case Format::SCRATCH:
      return static_cast<FLAT_instruction*>(candidate)->can_reorder;
   default:
      return true;
   }
}

bool is_gs_or_done_sendmsg(const Instruction *instr)
{
   if (instr->opcode == aco_opcode::s_sendmsg) {
      uint16_t imm = static_cast<const SOPP_instruction*>(instr)->imm;
      return (imm & sendmsg_id_mask) == _sendmsg_gs ||
             (imm & sendmsg_id_mask) == _sendmsg_gs_done;
   }
   return false;
}

bool is_done_sendmsg(const Instruction *instr)
{
   if (instr->opcode == aco_opcode::s_sendmsg) {
      uint16_t imm = static_cast<const SOPP_instruction*>(instr)->imm;
      return (imm & sendmsg_id_mask) == _sendmsg_gs_done;
   }
   return false;
}

barrier_interaction get_barrier_interaction(const Instruction* instr)
{
   switch (instr->format) {
   case Format::SMEM:
      return static_cast<const SMEM_instruction*>(instr)->barrier;
   case Format::MUBUF:
      return static_cast<const MUBUF_instruction*>(instr)->barrier;
   case Format::MIMG:
      return static_cast<const MIMG_instruction*>(instr)->barrier;
   case Format::MTBUF:
      return static_cast<const MTBUF_instruction*>(instr)->barrier;
   case Format::FLAT:
   case Format::GLOBAL:
   case Format::SCRATCH:
      return static_cast<const FLAT_instruction*>(instr)->barrier;
   case Format::DS:
      return barrier_shared;
   case Format::SOPP:
      if (is_done_sendmsg(instr))
         return (barrier_interaction)(barrier_gs_data | barrier_gs_sendmsg);
      else if (is_gs_or_done_sendmsg(instr))
         return barrier_gs_sendmsg;
      else
         return barrier_none;
   case Format::PSEUDO_BARRIER:
      return barrier_barrier;
   default:
      return barrier_none;
   }
}

barrier_interaction parse_barrier(Instruction *instr)
{
   if (instr->format == Format::PSEUDO_BARRIER) {
      switch (instr->opcode) {
      case aco_opcode::p_memory_barrier_atomic:
         return barrier_atomic;
      /* For now, buffer and image barriers are treated the same. This is because of
       * dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.device.payload_nonlocal.buffer.guard_nonlocal.image.comp
       * which seems to use an image load to determine whether the result of a buffer load is valid, so the ordering of the two loads is important.
       * I /think/ we should probably eventually expand the meaning of a buffer barrier so that all buffer operations before it must stay before it,
       * and that both image and buffer operations after it must stay after it. We should also do the same for image barriers.
       * Or perhaps the problem is that we don't have a combined barrier instruction for both buffers and images, but the CTS test expects us to?
       * Either way, this solution should work. */
      case aco_opcode::p_memory_barrier_buffer:
      case aco_opcode::p_memory_barrier_image:
         return (barrier_interaction)(barrier_image | barrier_buffer);
      case aco_opcode::p_memory_barrier_shared:
         return barrier_shared;
      case aco_opcode::p_memory_barrier_common:
         return (barrier_interaction)(barrier_image | barrier_buffer | barrier_shared | barrier_atomic);
      case aco_opcode::p_memory_barrier_gs_data:
         return barrier_gs_data;
      case aco_opcode::p_memory_barrier_gs_sendmsg:
         return barrier_gs_sendmsg;
      default:
         break;
      }
   } else if (instr->opcode == aco_opcode::s_barrier) {
      return (barrier_interaction)(barrier_barrier | barrier_image | barrier_buffer | barrier_shared | barrier_atomic);
   }
   return barrier_none;
}

struct hazard_query {
   bool contains_spill;
   int barriers;
   int barrier_interaction;
   bool can_reorder_vmem;
   bool can_reorder_smem;
};

void init_hazard_query(hazard_query *query) {
   query->contains_spill = false;
   query->barriers = 0;
   query->barrier_interaction = 0;
   query->can_reorder_vmem = true;
   query->can_reorder_smem = true;
}

void add_to_hazard_query(hazard_query *query, Instruction *instr)
{
   query->barriers |= parse_barrier(instr);
   query->barrier_interaction |= get_barrier_interaction(instr);
   if (instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload)
      query->contains_spill = true;

   bool can_reorder_instr = can_reorder(instr);
   query->can_reorder_smem &= instr->format != Format::SMEM || can_reorder_instr;
   query->can_reorder_vmem &= !(instr->isVMEM() || instr->isFlatOrGlobal()) || can_reorder_instr;
}

enum HazardResult {
   hazard_success,
   hazard_fail_reorder_vmem_smem,
   hazard_fail_reorder_ds,
   hazard_fail_reorder_sendmsg,
   hazard_fail_spill,
   hazard_fail_export,
   hazard_fail_barrier,
   /* Must stop at these failures. The hazard query code doesn't consider them
    * when added. */
   hazard_fail_exec,
   hazard_fail_unreorderable,
};

HazardResult perform_hazard_query(hazard_query *query, Instruction *instr)
{
   bool can_reorder_candidate = can_reorder(instr);

   if (instr->opcode == aco_opcode::p_exit_early_if)
      return hazard_fail_exec;
   for (const Definition& def : instr->definitions) {
      if (def.isFixed() && def.physReg() == exec)
         return hazard_fail_exec;
   }

   /* don't move exports so that they stay closer together */
   if (instr->format == Format::EXP)
      return hazard_fail_export;

   /* don't move non-reorderable instructions */
   if (instr->opcode == aco_opcode::s_memtime ||
       instr->opcode == aco_opcode::s_memrealtime ||
       instr->opcode == aco_opcode::s_setprio)
      return hazard_fail_unreorderable;

   barrier_interaction bar = parse_barrier(instr);
   if (query->barrier_interaction && (query->barrier_interaction & bar))
      return hazard_fail_barrier;
   if (bar && query->barriers && (query->barriers & ~bar))
      return hazard_fail_barrier;
   if (query->barriers && (query->barriers & get_barrier_interaction(instr)))
      return hazard_fail_barrier;

   if (!query->can_reorder_smem && instr->format == Format::SMEM && !can_reorder_candidate)
      return hazard_fail_reorder_vmem_smem;
   if (!query->can_reorder_vmem && (instr->isVMEM() || instr->isFlatOrGlobal()) && !can_reorder_candidate)
      return hazard_fail_reorder_vmem_smem;
   if ((query->barrier_interaction & barrier_shared) && instr->format == Format::DS)
      return hazard_fail_reorder_ds;
   if (is_gs_or_done_sendmsg(instr) && (query->barrier_interaction & get_barrier_interaction(instr)))
      return hazard_fail_reorder_sendmsg;

   if ((instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload) &&
       query->contains_spill)
      return hazard_fail_spill;

   return hazard_success;
}
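
/* Sketch of the intended query protocol (compiled out; schedule_SMEM() and
 * schedule_VMEM() below are the real users): everything a candidate would have
 * to be moved across is accumulated into a query, which is then asked whether
 * the candidate may cross it. */
#if 0
static bool may_move_across(Instruction *moved, Instruction *crossed)
{
   hazard_query hq;
   init_hazard_query(&hq);
   add_to_hazard_query(&hq, crossed);
   /* hazard_success means "moved" can be reordered across "crossed". The
    * schedulers treat most failures as "keep the candidate where it is and add
    * it to the query", and stop scanning on the rest. */
   return perform_hazard_query(&hq, moved) == hazard_success;
}
#endif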

void schedule_SMEM(sched_ctx& ctx, Block* block,
                   std::vector<RegisterDemand>& register_demand,
                   Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = SMEM_WINDOW_SIZE;
   int max_moves = SMEM_MAX_MOVES;
   int16_t k = 0;

   /* don't move s_memtime/s_memrealtime */
   if (current->opcode == aco_opcode::s_memtime || current->opcode == aco_opcode::s_memrealtime)
      return;

   /* first, check if we have instructions before current to move down */
   hazard_query hq;
   init_hazard_query(&hq);
   add_to_hazard_query(&hq, current);

   ctx.mv.downwards_init(idx, false, false);

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      assert(candidate_idx == ctx.mv.source_idx);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      /* break if we'd make the previous SMEM instruction stall */
      bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
      if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0)
         break;

      /* break when encountering another MEM instruction, logical_start or barriers */
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
      if (candidate->isVMEM())
         break;

      bool can_move_down = true;

      HazardResult haz = perform_hazard_query(&hq, candidate.get());
      if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill || haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier || haz == hazard_fail_export)
         can_move_down = false;
      else if (haz != hazard_success)
         break;

      /* don't use LDS/GDS instructions to hide latency since it can
       * significantly worsen LDS scheduling */
      if (candidate->format == Format::DS || !can_move_down) {
         add_to_hazard_query(&hq, candidate.get());
         ctx.mv.downwards_skip();
         continue;
      }

      MoveResult res = ctx.mv.downwards_move(false);
      if (res == move_fail_ssa || res == move_fail_rar) {
         add_to_hazard_query(&hq, candidate.get());
         ctx.mv.downwards_skip();
         continue;
      } else if (res == move_fail_pressure) {
         break;
      }

      if (candidate_idx < ctx.last_SMEM_dep_idx)
         ctx.last_SMEM_stall++;
      k++;
   }

   /* find the first instruction depending on current or find another MEM */
   ctx.mv.upwards_init(idx + 1, false);

   bool found_dependency = false;
   /* second, check if we have instructions after current to move up */
   for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) {
      assert(candidate_idx == ctx.mv.source_idx);
      assert(candidate_idx < (int) block->instructions.size());
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      if (candidate->opcode == aco_opcode::p_logical_end)
         break;

      /* check if candidate depends on current */
      bool is_dependency = !found_dependency && !ctx.mv.upwards_check_deps();
      /* no need to steal from following VMEM instructions */
      if (is_dependency && candidate->isVMEM())
         break;

      if (found_dependency) {
         HazardResult haz = perform_hazard_query(&hq, candidate.get());
         if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill ||
             haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier ||
             haz == hazard_fail_export)
            is_dependency = true;
         else if (haz != hazard_success)
            break;
      }

      if (is_dependency) {
         if (!found_dependency) {
            ctx.mv.upwards_set_insert_idx(candidate_idx);
            init_hazard_query(&hq);
            found_dependency = true;
         }
      }

      if (is_dependency || !found_dependency) {
         if (found_dependency)
            add_to_hazard_query(&hq, candidate.get());
         else
            k++;
         ctx.mv.upwards_skip();
         continue;
      }

      MoveResult res = ctx.mv.upwards_move();
      if (res == move_fail_ssa || res == move_fail_rar) {
         /* no need to steal from following VMEM instructions */
         if (res == move_fail_ssa && candidate->isVMEM())
            break;
         add_to_hazard_query(&hq, candidate.get());
         ctx.mv.upwards_skip();
         continue;
      } else if (res == move_fail_pressure) {
         break;
      }
      k++;
   }

   ctx.last_SMEM_dep_idx = found_dependency ? ctx.mv.insert_idx : 0;
   ctx.last_SMEM_stall = 10 - ctx.num_waves - k;
}

void schedule_VMEM(sched_ctx& ctx, Block* block,
                   std::vector<RegisterDemand>& register_demand,
                   Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = VMEM_WINDOW_SIZE;
   int max_moves = VMEM_MAX_MOVES;
   int clause_max_grab_dist = VMEM_CLAUSE_MAX_GRAB_DIST;
   int16_t k = 0;

   /* first, check if we have instructions before current to move down */
   hazard_query indep_hq;
   hazard_query clause_hq;
   init_hazard_query(&indep_hq);
   init_hazard_query(&clause_hq);
   add_to_hazard_query(&indep_hq, current);

   ctx.mv.downwards_init(idx, true, true);

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx == ctx.mv.source_idx);
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
      bool is_vmem = candidate->isVMEM() || candidate->isFlatOrGlobal();

      /* break when encountering another VMEM instruction, logical_start or barriers */
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;

      /* break if we'd make the previous SMEM instruction stall */
      bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
      if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0)
         break;

      bool part_of_clause = false;
      if (current->isVMEM() == candidate->isVMEM()) {
         bool same_resource = true;
         if (current->isVMEM())
            same_resource = candidate->operands[0].tempId() == current->operands[0].tempId();
         int grab_dist = ctx.mv.insert_idx_clause - candidate_idx;
         /* We can't easily tell how much this will decrease the def-to-use
          * distances, so just use how far it will be moved as a heuristic. */
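         /* (For scale: at 8 waves, clause_max_grab_dist evaluates to (8 - 1) * 8 = 56.) */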
         part_of_clause = same_resource && grab_dist < clause_max_grab_dist;
      }

      /* if current depends on candidate, add additional dependencies and continue */
      bool can_move_down = !is_vmem || part_of_clause;

      HazardResult haz = perform_hazard_query(part_of_clause ? &clause_hq : &indep_hq, candidate.get());
      if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill ||
          haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier ||
          haz == hazard_fail_export)
         can_move_down = false;
      else if (haz != hazard_success)
         break;

      if (!can_move_down) {
         add_to_hazard_query(&indep_hq, candidate.get());
         add_to_hazard_query(&clause_hq, candidate.get());
         ctx.mv.downwards_skip();
         continue;
      }

      MoveResult res = ctx.mv.downwards_move(part_of_clause);
      if (res == move_fail_ssa || res == move_fail_rar) {
         add_to_hazard_query(&indep_hq, candidate.get());
         add_to_hazard_query(&clause_hq, candidate.get());
         ctx.mv.downwards_skip();
         continue;
      } else if (res == move_fail_pressure) {
         break;
      }
      k++;
      if (candidate_idx < ctx.last_SMEM_dep_idx)
         ctx.last_SMEM_stall++;
   }

   /* find the first instruction depending on current or find another VMEM */
   ctx.mv.upwards_init(idx + 1, true);

   bool found_dependency = false;
   /* second, check if we have instructions after current to move up */
   for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) {
      assert(candidate_idx == ctx.mv.source_idx);
      assert(candidate_idx < (int) block->instructions.size());
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
      bool is_vmem = candidate->isVMEM() || candidate->isFlatOrGlobal();

      if (candidate->opcode == aco_opcode::p_logical_end)
         break;

      /* check if candidate depends on current */
      bool is_dependency = false;
      if (found_dependency) {
         HazardResult haz = perform_hazard_query(&indep_hq, candidate.get());
         if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill ||
             haz == hazard_fail_reorder_vmem_smem || haz == hazard_fail_reorder_sendmsg ||
             haz == hazard_fail_barrier || haz == hazard_fail_export)
            is_dependency = true;
         else if (haz != hazard_success)
            break;
      }

      is_dependency |= !found_dependency && !ctx.mv.upwards_check_deps();
      if (is_dependency) {
         if (!found_dependency) {
            ctx.mv.upwards_set_insert_idx(candidate_idx);
            init_hazard_query(&indep_hq);
            found_dependency = true;
         }
      } else if (is_vmem) {
         /* don't move up dependencies of other VMEM instructions */
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.mv.depends_on[def.tempId()] = true;
         }
      }

      if (is_dependency || !found_dependency) {
         if (found_dependency)
            add_to_hazard_query(&indep_hq, candidate.get());
         ctx.mv.upwards_skip();
         continue;
      }

      MoveResult res = ctx.mv.upwards_move();
      if (res == move_fail_ssa || res == move_fail_rar) {
         add_to_hazard_query(&indep_hq, candidate.get());
         ctx.mv.upwards_skip();
         continue;
      } else if (res == move_fail_pressure) {
         break;
      }
      k++;
   }
}

void schedule_position_export(sched_ctx& ctx, Block* block,
                              std::vector<RegisterDemand>& register_demand,
                              Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = POS_EXP_WINDOW_SIZE;
   int max_moves = POS_EXP_MAX_MOVES;
   int16_t k = 0;

   ctx.mv.downwards_init(idx, true, false);

   hazard_query hq;
   init_hazard_query(&hq);
   add_to_hazard_query(&hq, current);

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
      if (candidate->isVMEM() || candidate->format == Format::SMEM || candidate->isFlatOrGlobal())
         break;

      HazardResult haz = perform_hazard_query(&hq, candidate.get());
      if (haz == hazard_fail_exec || haz == hazard_fail_unreorderable)
         break;

      if (haz != hazard_success) {
         add_to_hazard_query(&hq, candidate.get());
         ctx.mv.downwards_skip();
         continue;
      }

      MoveResult res = ctx.mv.downwards_move(false);
      if (res == move_fail_ssa || res == move_fail_rar) {
         add_to_hazard_query(&hq, candidate.get());
         ctx.mv.downwards_skip();
         continue;
      } else if (res == move_fail_pressure) {
         break;
      }
      k++;
   }
}

void schedule_block(sched_ctx& ctx, Program *program, Block* block, live& live_vars)
{
   ctx.last_SMEM_dep_idx = 0;
   ctx.last_SMEM_stall = INT16_MIN;
   ctx.mv.block = block;
   ctx.mv.register_demand = live_vars.register_demand[block->index].data();

   /* go through all instructions and find memory loads */
   for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
      Instruction* current = block->instructions[idx].get();

      if (current->definitions.empty())
         continue;

      if (current->isVMEM() || current->isFlatOrGlobal()) {
         ctx.mv.current = current;
         schedule_VMEM(ctx, block, live_vars.register_demand[block->index], current, idx);
      }

      if (current->format == Format::SMEM) {
         ctx.mv.current = current;
         schedule_SMEM(ctx, block, live_vars.register_demand[block->index], current, idx);
      }
   }

   if ((program->stage & (hw_vs | hw_ngg_gs)) && (block->kind & block_kind_export_end)) {
      /* Try to move position exports as far up as possible, to reduce register
       * usage and because ISA reference guides say so. */
      for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
         Instruction* current = block->instructions[idx].get();

         if (current->format == Format::EXP) {
            unsigned target = static_cast<Export_instruction*>(current)->dest;
            if (target >= V_008DFC_SQ_EXP_POS && target < V_008DFC_SQ_EXP_PARAM) {
               ctx.mv.current = current;
               schedule_position_export(ctx, block, live_vars.register_demand[block->index], current, idx);
            }
         }
      }
   }

   /* resummarize the block's register demand */
   block->register_demand = RegisterDemand();
   for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
      block->register_demand.update(live_vars.register_demand[block->index][idx]);
   }
}


void schedule_program(Program *program, live& live_vars)
{
   sched_ctx ctx;
   ctx.mv.depends_on.resize(program->peekAllocationId());
   ctx.mv.RAR_dependencies.resize(program->peekAllocationId());
   ctx.mv.RAR_dependencies_clause.resize(program->peekAllocationId());
   /* Allowing the scheduler to reduce the number of waves to as low as 5
    * improves performance of Thrones of Britannia significantly and doesn't
    * seem to hurt anything else. */
   if (program->num_waves <= 5)
      ctx.num_waves = program->num_waves;
   else if (program->max_reg_demand.vgpr >= 32)
      ctx.num_waves = 5;
   else if (program->max_reg_demand.vgpr >= 28)
      ctx.num_waves = 6;
   else if (program->max_reg_demand.vgpr >= 24)
      ctx.num_waves = 7;
   else
      ctx.num_waves = 8;
   ctx.num_waves = std::max<uint16_t>(ctx.num_waves, program->min_waves);
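   /* For example: a shader currently at 8 waves with a peak demand of 30 VGPRs is
    * scheduled for 6 waves here (assuming program->min_waves allows it), so
    * max_registers below gives the scheduler headroom up to the 6-wave VGPR limit
    * minus two spare VGPRs. */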

   assert(ctx.num_waves > 0 && ctx.num_waves <= program->num_waves);
   ctx.mv.max_registers = { int16_t(get_addr_vgpr_from_waves(program, ctx.num_waves) - 2),
                            int16_t(get_addr_sgpr_from_waves(program, ctx.num_waves))};

   for (Block& block : program->blocks)
      schedule_block(ctx, program, &block, live_vars);

   /* update max_reg_demand and num_waves */
   RegisterDemand new_demand;
   for (Block& block : program->blocks) {
      new_demand.update(block.register_demand);
   }
   update_vgpr_sgpr_demand(program, new_demand);

   /* if enabled, this code asserts that register_demand is updated correctly */
   #if 0
   int prev_num_waves = program->num_waves;
   const RegisterDemand prev_max_demand = program->max_reg_demand;

   std::vector<RegisterDemand> demands(program->blocks.size());
   for (unsigned j = 0; j < program->blocks.size(); j++) {
      demands[j] = program->blocks[j].register_demand;
   }

   struct radv_nir_compiler_options options;
   options.chip_class = program->chip_class;
   live live_vars2 = aco::live_var_analysis(program, &options);

   for (unsigned j = 0; j < program->blocks.size(); j++) {
      Block &b = program->blocks[j];
      for (unsigned i = 0; i < b.instructions.size(); i++)
         assert(live_vars.register_demand[b.index][i] == live_vars2.register_demand[b.index][i]);
      assert(b.register_demand == demands[j]);
   }

   assert(program->max_reg_demand == prev_max_demand);
   assert(program->num_waves == prev_num_waves);
   #endif
}

}