aco: Initial commit of independent AMD compiler
[mesa.git] / src / amd / compiler / aco_scheduler.cpp
/*
 * Copyright © 2018 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include "aco_ir.h"
#include <unordered_set>
#include <algorithm>

#include "vulkan/radv_shader.h" // for radv_nir_compiler_options
#include "amdgfxregs.h"

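/* Heuristic window sizes and move budgets. They shrink as the number of waves
 * grows: with more waves resident, memory latency is increasingly hidden by
 * multithreading, so aggressive static scheduling pays off less. */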
#define SMEM_WINDOW_SIZE (350 - ctx.num_waves * 35)
#define VMEM_WINDOW_SIZE (1024 - ctx.num_waves * 64)
#define POS_EXP_WINDOW_SIZE 512
#define SMEM_MAX_MOVES (80 - ctx.num_waves * 8)
#define VMEM_MAX_MOVES (128 - ctx.num_waves * 4)
#define POS_EXP_MAX_MOVES 512

namespace aco {

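/* Scratch state for one scheduling pass. depends_on and RAR_dependencies are
 * bitsets indexed by temp id; max_registers is the register-demand bound that
 * moved instructions must not push the block past; last_SMEM_dep_idx and
 * last_SMEM_stall track the most recent SMEM load so that later moves don't
 * reintroduce a stall on it. */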
struct sched_ctx {
   std::vector<bool> depends_on;
   std::vector<bool> RAR_dependencies;
   RegisterDemand max_registers;
   int16_t num_waves;
   int16_t last_SMEM_stall;
   int last_SMEM_dep_idx;
};

/* This scheduler is a simple bottom-up pass based on ideas from
 * "A Novel Lightweight Instruction Scheduling Algorithm for Just-In-Time Compiler"
 * by Xiaohua Shi and Peng Guo.
 * The basic approach is to iterate over all instructions. When a memory instruction
 * is encountered, the scheduler tries to move independent instructions from above
 * and below into the space between the memory instruction and its first user.
 * The novelty is that this scheduler takes the current register pressure into account:
 * instructions will only be moved if the register pressure won't exceed a certain bound.
 */
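
/* Hypothetical sketch of the transformation (not taken from real output):
 *
 *    s_load_dwordx4 s[4:7], ...            ; current (SMEM load)
 *    v_add_f32 v2, v0, v1                  ; independent of the load
 *    s_buffer_load_dword s8, s[4:7], 0x0   ; first user of s[4:7]
 *
 * Instructions above the load which don't feed it can be moved below it, and
 * independent instructions from below can be moved up in front of the first
 * user, so that the load's latency overlaps with useful ALU work. */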

template <typename T>
void move_element(T& list, size_t idx, size_t before) {
   if (idx < before) {
      auto begin = std::next(list.begin(), idx);
      auto end = std::next(list.begin(), before);
      std::rotate(begin, begin + 1, end);
   } else if (idx > before) {
      auto begin = std::next(list.begin(), before);
      auto end = std::next(list.begin(), idx + 1);
      std::rotate(begin, end - 1, end);
   }
}
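
/* e.g. move_element(v, 5, 2) moves v[5] directly in front of v[2], while
 * move_element(v, 2, 6) moves v[2] to just before the old v[6] (it ends up at
 * index 5). The relative order of all other elements is preserved. */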
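/* Computes by how much instr changes the register demand: definitions that
 * stay live afterwards increase it, operands that die here (first kill)
 * decrease it. */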
static RegisterDemand getLiveChanges(aco_ptr<Instruction>& instr)
{
   RegisterDemand changes;
   for (const Definition& def : instr->definitions) {
      if (!def.isTemp() || def.isKill())
         continue;
      changes += def.getTemp();
   }

   for (const Operand& op : instr->operands) {
      if (!op.isTemp() || !op.isFirstKill())
         continue;
      changes -= op.getTemp();
   }

   return changes;
}

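/* Returns the registers which are live only for the duration of instr itself:
 * definitions that are already dead again after this instruction. */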
static RegisterDemand getTempRegisters(aco_ptr<Instruction>& instr)
{
   RegisterDemand temp_registers;
   for (const Definition& def : instr->definitions) {
      if (!def.isTemp() || !def.isKill())
         continue;
      temp_registers += def.getTemp();
   }
   return temp_registers;
}

static bool is_spill_reload(aco_ptr<Instruction>& instr)
{
   return instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload;
}

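/* Returns whether instr can be moved past current without violating export
 * ordering or the memory-barrier semantics accumulated in moving_interaction. */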
bool can_move_instr(aco_ptr<Instruction>& instr, Instruction* current, int moving_interaction)
{
   /* don't move exports so that they stay closer together */
   if (instr->format == Format::EXP)
      return false;

   /* handle barriers */

   /* TODO: instead of stopping, maybe try to move the barriers and any
    * instructions interacting with them instead? */
   if (instr->format != Format::PSEUDO_BARRIER) {
      if (instr->opcode == aco_opcode::s_barrier) {
         bool can_reorder = false;
         switch (current->format) {
         case Format::SMEM:
            can_reorder = static_cast<SMEM_instruction*>(current)->can_reorder;
            break;
         case Format::MUBUF:
            can_reorder = static_cast<MUBUF_instruction*>(current)->can_reorder;
            break;
         case Format::MIMG:
            can_reorder = static_cast<MIMG_instruction*>(current)->can_reorder;
            break;
         default:
            break;
         }
         return can_reorder && moving_interaction == barrier_none;
      } else {
         return true;
      }
   }

   int interaction = get_barrier_interaction(current);
   interaction |= moving_interaction;

   switch (instr->opcode) {
   case aco_opcode::p_memory_barrier_atomic:
      return !(interaction & barrier_atomic);
   /* For now, buffer and image barriers are treated the same. This is because of
    * dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.device.payload_nonlocal.buffer.guard_nonlocal.image.comp
    * which seems to use an image load to determine if the result of a buffer load is valid,
    * so the ordering of the two loads is important.
    * I /think/ we should probably eventually expand the meaning of a buffer barrier so that
    * all buffer operations before it must stay before it, and both image and buffer operations
    * after it must stay after it. We should also do the same for image barriers.
    * Or perhaps the problem is that we don't have a combined barrier instruction for both
    * buffers and images, but the CTS test expects us to?
    * Either way, this solution should work. */
   case aco_opcode::p_memory_barrier_buffer:
   case aco_opcode::p_memory_barrier_image:
      return !(interaction & (barrier_image | barrier_buffer));
   case aco_opcode::p_memory_barrier_shared:
      return !(interaction & barrier_shared);
   case aco_opcode::p_memory_barrier_all:
      return interaction == barrier_none;
   default:
      return false;
   }
}

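/* Returns whether candidate may be reordered relative to other memory
 * instructions. FLAT/GLOBAL/SCRATCH accesses may alias anything, so they are
 * never reordered; buffer, image and (optionally) SMEM accesses carry an
 * explicit can_reorder flag. */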
bool can_reorder(Instruction* candidate, bool allow_smem)
{
   switch (candidate->format) {
   case Format::SMEM:
      return allow_smem || static_cast<SMEM_instruction*>(candidate)->can_reorder;
   case Format::MUBUF:
      return static_cast<MUBUF_instruction*>(candidate)->can_reorder;
   case Format::MIMG:
      return static_cast<MIMG_instruction*>(candidate)->can_reorder;
   case Format::MTBUF:
      return static_cast<MTBUF_instruction*>(candidate)->can_reorder;
   case Format::FLAT:
   case Format::GLOBAL:
   case Format::SCRATCH:
      return false;
   default:
      return true;
   }
}

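/* Tries to hide the latency of the SMEM load current (at index idx): first
 * independent instructions from above current are moved below it, then
 * independent instructions from below are moved up to current's first
 * dependency. */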
void schedule_SMEM(sched_ctx& ctx, Block* block,
                   std::vector<RegisterDemand>& register_demand,
                   Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = SMEM_WINDOW_SIZE;
   int max_moves = SMEM_MAX_MOVES;
   int16_t k = 0;
   bool can_reorder_cur = can_reorder(current, false);

   /* create the initial set of values which current depends on */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   for (const Operand& op : current->operands) {
      if (op.isTemp())
         ctx.depends_on[op.tempId()] = true;
   }

   /* maintain how many registers remain free when moving instructions */
   RegisterDemand register_pressure = register_demand[idx];

   /* first, check if we have instructions before current to move down */
   int insert_idx = idx + 1;
   int moving_interaction = barrier_none;
   bool moving_spill = false;

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      /* break if we'd make the previous SMEM instruction stall */
      bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
      if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0)
         break;

      /* break when encountering another MEM instruction, logical_start or barriers */
      if (!can_reorder(candidate.get(), false) && !can_reorder_cur)
         break;
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;
      register_pressure.update(register_demand[candidate_idx]);

      /* if current depends on candidate, add additional dependencies and continue */
      bool can_move_down = true;
      bool writes_exec = false;
      for (const Definition& def : candidate->definitions) {
         if (def.isTemp() && ctx.depends_on[def.tempId()])
            can_move_down = false;
         if (def.isFixed() && def.physReg() == exec)
            writes_exec = true;
      }
      if (writes_exec)
         break;

      if (moving_spill && is_spill_reload(candidate))
         can_move_down = false;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         can_move_down = false;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (!can_move_down) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.depends_on[op.tempId()] = true;
         }
         continue;
      }

      bool register_pressure_unknown = false;
      /* check if one of candidate's operands is killed by depending instruction */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.depends_on[op.tempId()]) {
            // FIXME: account for difference in register pressure
            register_pressure_unknown = true;
         }
      }
      if (register_pressure_unknown) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.depends_on[op.tempId()] = true;
         }
         continue;
      }

      /* check if register pressure is low enough: the diff is negative if register pressure is decreased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand tempDemand = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure - candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand tempDemand2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - tempDemand2 + tempDemand;
      if (new_demand.exceeds(ctx.max_registers))
         break;
      // TODO: we might want to look further to find a sequence of instructions to move down which doesn't exceed reg pressure

      /* move the candidate below the memory load */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = candidate_idx; i < insert_idx - 1; i++) {
         register_demand[i] -= candidate_diff;
      }
      register_demand[insert_idx - 1] = new_demand;
      register_pressure -= candidate_diff;

      if (candidate_idx < ctx.last_SMEM_dep_idx)
         ctx.last_SMEM_stall++;
      insert_idx--;
      k++;
   }

   /* create the initial set of values which depend on current */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   std::fill(ctx.RAR_dependencies.begin(), ctx.RAR_dependencies.end(), false);
   for (const Definition& def : current->definitions) {
      if (def.isTemp())
         ctx.depends_on[def.tempId()] = true;
   }

   /* find the first instruction depending on current or find another MEM */
   insert_idx = idx + 1;
   moving_interaction = barrier_none;
   moving_spill = false;

   bool found_dependency = false;
   /* second, check if we have instructions after current to move up */
   for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) {
      assert(candidate_idx < (int) block->instructions.size());
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      if (candidate->opcode == aco_opcode::p_logical_end)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;

      const bool writes_exec = std::any_of(candidate->definitions.begin(), candidate->definitions.end(),
                                           [](const Definition& def) { return def.isFixed() && def.physReg() == exec; });
      if (writes_exec)
         break;

      /* check if candidate depends on current */
      bool is_dependency = std::any_of(candidate->operands.begin(), candidate->operands.end(),
                                       [&ctx](const Operand& op) { return op.isTemp() && ctx.depends_on[op.tempId()]; });
      if (moving_spill && is_spill_reload(candidate))
         is_dependency = true;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         is_dependency = true;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (is_dependency) {
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.depends_on[def.tempId()] = true;
         }
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.RAR_dependencies[op.tempId()] = true;
         }
         if (!found_dependency) {
            insert_idx = candidate_idx;
            found_dependency = true;
            /* init register pressure */
            register_pressure = register_demand[insert_idx - 1];
         }
      }

      if (!can_reorder(candidate.get(), false) && !can_reorder_cur)
         break;

      if (!found_dependency) {
         k++;
         continue;
      }

      /* update register pressure */
      register_pressure.update(register_demand[candidate_idx - 1]);

      if (is_dependency)
         continue;
      assert(insert_idx != idx);

      // TODO: correctly calculate register pressure for this case
      bool register_pressure_unknown = false;
      /* check if candidate uses/kills an operand which is used by a dependency */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.RAR_dependencies[op.tempId()])
            register_pressure_unknown = true;
      }
      if (register_pressure_unknown) {
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.RAR_dependencies[def.tempId()] = true;
         }
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.RAR_dependencies[op.tempId()] = true;
         }
         continue;
      }

      /* check if register pressure is low enough: the diff is negative if register pressure is decreased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand temp = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure + candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + candidate_diff + temp;
      if (new_demand.exceeds(ctx.max_registers))
         break;

      /* move the candidate above the insert_idx */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = insert_idx + 1; i <= candidate_idx; i++) {
         register_demand[i] += candidate_diff;
      }
      register_demand[insert_idx] = new_demand;
      register_pressure += candidate_diff;
      insert_idx++;
      k++;
   }

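   /* Remember where the first user of this SMEM load ended up, and estimate
    * the remaining stall: fewer waves and fewer instructions moved in between
    * (k) make it more likely that a later move would stall on this load. */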
   ctx.last_SMEM_dep_idx = found_dependency ? insert_idx : 0;
   ctx.last_SMEM_stall = 10 - ctx.num_waves - k;
}

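/* Same approach as schedule_SMEM, but for VMEM: the window and move budget
 * are larger since VMEM latency is higher, and a VMEM candidate that can't be
 * reordered with current is treated as a dependency instead of ending the
 * upward search. */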
void schedule_VMEM(sched_ctx& ctx, Block* block,
                   std::vector<RegisterDemand>& register_demand,
                   Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = VMEM_WINDOW_SIZE;
   int max_moves = VMEM_MAX_MOVES;
   int16_t k = 0;
   bool can_reorder_cur = can_reorder(current, false);

   /* create the initial set of values which current depends on */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   for (const Operand& op : current->operands) {
      if (op.isTemp())
         ctx.depends_on[op.tempId()] = true;
   }

   /* maintain how many registers remain free when moving instructions */
   RegisterDemand register_pressure = register_demand[idx];

   /* first, check if we have instructions before current to move down */
   int insert_idx = idx + 1;
   int moving_interaction = barrier_none;
   bool moving_spill = false;

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      /* break when encountering another VMEM instruction, logical_start or barriers */
      if (!can_reorder(candidate.get(), true) && !can_reorder_cur)
         break;
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;

      /* break if we'd make the previous SMEM instruction stall */
      bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx;
      if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0)
         break;
      register_pressure.update(register_demand[candidate_idx]);

      /* if current depends on candidate, add additional dependencies and continue */
      bool can_move_down = true;
      bool writes_exec = false;
      for (const Definition& def : candidate->definitions) {
         if (def.isTemp() && ctx.depends_on[def.tempId()])
            can_move_down = false;
         if (def.isFixed() && def.physReg() == exec)
            writes_exec = true;
      }
      if (writes_exec)
         break;

      if (moving_spill && is_spill_reload(candidate))
         can_move_down = false;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         can_move_down = false;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (!can_move_down) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.depends_on[op.tempId()] = true;
         }
         continue;
      }

      bool register_pressure_unknown = false;
      /* check if one of candidate's operands is killed by depending instruction */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.depends_on[op.tempId()]) {
            // FIXME: account for difference in register pressure
            register_pressure_unknown = true;
         }
      }
      if (register_pressure_unknown) {
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.depends_on[op.tempId()] = true;
         }
         continue;
      }

      /* check if register pressure is low enough: the diff is negative if register pressure is decreased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand temp = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure - candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + temp;
      if (new_demand.exceeds(ctx.max_registers))
         break;
      // TODO: we might want to look further to find a sequence of instructions to move down which doesn't exceed reg pressure

      /* move the candidate below the memory load */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = candidate_idx; i < insert_idx - 1; i++) {
         register_demand[i] -= candidate_diff;
      }
      register_demand[insert_idx - 1] = new_demand;
      register_pressure -= candidate_diff;
      insert_idx--;
      k++;
      if (candidate_idx < ctx.last_SMEM_dep_idx)
         ctx.last_SMEM_stall++;
   }

   /* create the initial set of values which depend on current */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   std::fill(ctx.RAR_dependencies.begin(), ctx.RAR_dependencies.end(), false);
   for (const Definition& def : current->definitions) {
      if (def.isTemp())
         ctx.depends_on[def.tempId()] = true;
   }

   /* find the first instruction depending on current or find another VMEM */
   insert_idx = idx;
   moving_interaction = barrier_none;
   moving_spill = false;

   bool found_dependency = false;
   /* second, check if we have instructions after current to move up */
   for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) {
      assert(candidate_idx < (int) block->instructions.size());
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      if (candidate->opcode == aco_opcode::p_logical_end)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;

      const bool writes_exec = std::any_of(candidate->definitions.begin(), candidate->definitions.end(),
                                           [](const Definition& def) { return def.isFixed() && def.physReg() == exec; });
      if (writes_exec)
         break;

      /* check if candidate depends on current */
      bool is_dependency = !can_reorder(candidate.get(), true) && !can_reorder_cur;
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.depends_on[op.tempId()]) {
            is_dependency = true;
            break;
         }
      }
      if (moving_spill && is_spill_reload(candidate))
         is_dependency = true;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         is_dependency = true;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (is_dependency) {
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.depends_on[def.tempId()] = true;
         }
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.RAR_dependencies[op.tempId()] = true;
         }
         if (!found_dependency) {
            insert_idx = candidate_idx;
            found_dependency = true;
            /* init register pressure */
            register_pressure = register_demand[insert_idx - 1];
            continue;
         }
      }

      /* update register pressure */
      register_pressure.update(register_demand[candidate_idx - 1]);

      if (is_dependency || !found_dependency)
         continue;
      assert(insert_idx != idx);

      bool register_pressure_unknown = false;
      /* check if candidate uses/kills an operand which is used by a dependency */
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.RAR_dependencies[op.tempId()])
            register_pressure_unknown = true;
      }
      if (register_pressure_unknown) {
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
               ctx.RAR_dependencies[def.tempId()] = true;
         }
         for (const Operand& op : candidate->operands) {
            if (op.isTemp())
               ctx.RAR_dependencies[op.tempId()] = true;
         }
         continue;
      }

      /* check if register pressure is low enough: the diff is negative if register pressure is decreased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand temp = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure + candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + candidate_diff + temp;
      if (new_demand.exceeds(ctx.max_registers))
         break;

      /* move the candidate above the insert_idx */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = insert_idx + 1; i <= candidate_idx; i++) {
         register_demand[i] += candidate_diff;
      }
      register_demand[insert_idx] = new_demand;
      register_pressure += candidate_diff;
      insert_idx++;
      k++;
   }
}

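/* Sinks independent instructions from above a position export to below it,
 * which effectively hoists the export. There is no upward pass: the goal is
 * only to issue position exports as early as possible. */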
void schedule_position_export(sched_ctx& ctx, Block* block,
                              std::vector<RegisterDemand>& register_demand,
                              Instruction* current, int idx)
{
   assert(idx != 0);
   int window_size = POS_EXP_WINDOW_SIZE;
   int max_moves = POS_EXP_MAX_MOVES;
   int16_t k = 0;

   /* create the initial set of values which current depends on */
   std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false);
   for (unsigned i = 0; i < current->operands.size(); i++) {
      if (current->operands[i].isTemp())
         ctx.depends_on[current->operands[i].tempId()] = true;
   }

   /* maintain how many registers remain free when moving instructions */
   RegisterDemand register_pressure = register_demand[idx];

   /* first, check if we have instructions before current to move down */
   int insert_idx = idx + 1;
   int moving_interaction = barrier_none;
   bool moving_spill = false;

   for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) {
      assert(candidate_idx >= 0);
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];

      /* break when encountering logical_start or barriers */
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;
      if (candidate->isVMEM() || candidate->format == Format::SMEM)
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;

      register_pressure.update(register_demand[candidate_idx]);

      /* if current depends on candidate, add additional dependencies and continue */
      bool can_move_down = true;
      bool writes_exec = false;
      for (unsigned i = 0; i < candidate->definitions.size(); i++) {
         if (candidate->definitions[i].isTemp() && ctx.depends_on[candidate->definitions[i].tempId()])
            can_move_down = false;
         if (candidate->definitions[i].isFixed() && candidate->definitions[i].physReg() == exec)
            writes_exec = true;
      }
      if (writes_exec)
         break;

      if (moving_spill && is_spill_reload(candidate))
         can_move_down = false;
      if ((moving_interaction & barrier_shared) && candidate->format == Format::DS)
         can_move_down = false;
      moving_interaction |= get_barrier_interaction(candidate.get());
      moving_spill |= is_spill_reload(candidate);
      if (!can_move_down) {
         for (unsigned i = 0; i < candidate->operands.size(); i++) {
            if (candidate->operands[i].isTemp())
               ctx.depends_on[candidate->operands[i].tempId()] = true;
         }
         continue;
      }

      bool register_pressure_unknown = false;
      /* check if one of candidate's operands is killed by depending instruction */
      for (unsigned i = 0; i < candidate->operands.size(); i++) {
         if (candidate->operands[i].isTemp() && ctx.depends_on[candidate->operands[i].tempId()]) {
            // FIXME: account for difference in register pressure
            register_pressure_unknown = true;
         }
      }
      if (register_pressure_unknown) {
         for (unsigned i = 0; i < candidate->operands.size(); i++) {
            if (candidate->operands[i].isTemp())
               ctx.depends_on[candidate->operands[i].tempId()] = true;
         }
         continue;
      }

      /* check if register pressure is low enough: the diff is negative if register pressure is decreased */
      const RegisterDemand candidate_diff = getLiveChanges(candidate);
      const RegisterDemand temp = getTempRegisters(candidate);
      if (RegisterDemand(register_pressure - candidate_diff).exceeds(ctx.max_registers))
         break;
      const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]);
      const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + temp;
      if (new_demand.exceeds(ctx.max_registers))
         break;
      // TODO: we might want to look further to find a sequence of instructions to move down which doesn't exceed reg pressure

      /* move the candidate below the export */
      move_element(block->instructions, candidate_idx, insert_idx);

      /* update register pressure */
      move_element(register_demand, candidate_idx, insert_idx);
      for (int i = candidate_idx; i < insert_idx - 1; i++) {
         register_demand[i] -= candidate_diff;
      }
      register_demand[insert_idx - 1] = new_demand;
      register_pressure -= candidate_diff;
      insert_idx--;
      k++;
   }
}

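/* Schedules a single block: runs the SMEM/VMEM passes on every memory load
 * and, for the last block of a hardware vertex shader, hoists position
 * exports. Afterwards the block's register-demand summary is recomputed. */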
void schedule_block(sched_ctx& ctx, Program *program, Block* block, live& live_vars)
{
   ctx.last_SMEM_dep_idx = 0;
   ctx.last_SMEM_stall = INT16_MIN;

   /* go through all instructions and find memory loads */
   for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
      Instruction* current = block->instructions[idx].get();

      if (current->definitions.empty())
         continue;

      if (current->isVMEM())
         schedule_VMEM(ctx, block, live_vars.register_demand[block->index], current, idx);
      if (current->format == Format::SMEM)
         schedule_SMEM(ctx, block, live_vars.register_demand[block->index], current, idx);
   }

   if ((program->stage & hw_vs) && block->index == program->blocks.size() - 1) {
      /* Try to move position exports as far up as possible, to reduce register
       * usage and because ISA reference guides say so. */
      for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
         Instruction* current = block->instructions[idx].get();

         if (current->format == Format::EXP) {
            unsigned target = static_cast<Export_instruction*>(current)->dest;
            if (target >= V_008DFC_SQ_EXP_POS && target < V_008DFC_SQ_EXP_PARAM)
               schedule_position_export(ctx, block, live_vars.register_demand[block->index], current, idx);
         }
      }
   }

   /* resummarize the block's register demand */
   block->register_demand = RegisterDemand();
   for (unsigned idx = 0; idx < block->instructions.size(); idx++) {
      block->register_demand.update(live_vars.register_demand[block->index][idx]);
   }
}


void schedule_program(Program *program, live& live_vars)
{
   sched_ctx ctx;
   ctx.depends_on.resize(program->peekAllocationId());
   ctx.RAR_dependencies.resize(program->peekAllocationId());
   /* Allowing the scheduler to reduce the number of waves to as low as 5
    * improves performance of Thrones of Britannia significantly and doesn't
    * seem to hurt anything else. */
   //TODO: maybe use some sort of heuristic instead
   //TODO: this also increases window-size/max-moves? did I realize that at the time?
   ctx.num_waves = std::min<uint16_t>(program->num_waves, 5);
   assert(ctx.num_waves);
   uint16_t total_sgpr_regs = program->chip_class >= GFX8 ? 800 : 512;
   uint16_t max_addressible_sgpr = program->sgpr_limit;
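   /* Split the register file evenly between the target number of waves and
    * keep some slack: VGPRs are allocated in granules of 4 and SGPRs in
    * granules of 8, with two registers reserved as scheduling headroom. */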
   ctx.max_registers = { int16_t(((256 / ctx.num_waves) & ~3) - 2), std::min<int16_t>(((total_sgpr_regs / ctx.num_waves) & ~7) - 2, max_addressible_sgpr)};

   for (Block& block : program->blocks)
      schedule_block(ctx, program, &block, live_vars);

   /* update max_reg_demand and num_waves */
   RegisterDemand new_demand;
   for (Block& block : program->blocks) {
      new_demand.update(block.register_demand);
   }
   update_vgpr_sgpr_demand(program, new_demand);

   /* if enabled, this code asserts that register_demand is updated correctly */
#if 0
   int prev_num_waves = program->num_waves;
   const RegisterDemand prev_max_demand = program->max_reg_demand;

   std::vector<RegisterDemand> demands(program->blocks.size());
   for (unsigned j = 0; j < program->blocks.size(); j++) {
      demands[j] = program->blocks[j].register_demand;
   }

   struct radv_nir_compiler_options options;
   options.chip_class = program->chip_class;
   live live_vars2 = aco::live_var_analysis(program, &options);

   for (unsigned j = 0; j < program->blocks.size(); j++) {
      Block &b = program->blocks[j];
      for (unsigned i = 0; i < b.instructions.size(); i++)
         assert(live_vars.register_demand[b.index][i] == live_vars2.register_demand[b.index][i]);
      assert(b.register_demand == demands[j]);
   }

   assert(program->max_reg_demand == prev_max_demand);
   assert(program->num_waves == prev_num_waves);
#endif
}

}