X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Famd%2Fcompiler%2Faco_scheduler.cpp;h=cb22491bf64a1b5a2c58b553d591e26ec89c7cda;hb=51bc11abc206ae5ea0946f5a79c68527701c24e0;hp=f1b4472a942c415fdd9d40e1d6ccf8411867b46f;hpb=703ce617ca4045a9e4d3e05b8e6ed607d89fd338;p=mesa.git diff --git a/src/amd/compiler/aco_scheduler.cpp b/src/amd/compiler/aco_scheduler.cpp index f1b4472a942..cb22491bf64 100644 --- a/src/amd/compiler/aco_scheduler.cpp +++ b/src/amd/compiler/aco_scheduler.cpp @@ -23,6 +23,7 @@ */ #include "aco_ir.h" +#include "aco_builder.h" #include #include @@ -34,17 +35,59 @@ #define POS_EXP_WINDOW_SIZE 512 #define SMEM_MAX_MOVES (64 - ctx.num_waves * 4) #define VMEM_MAX_MOVES (128 - ctx.num_waves * 8) +/* creating clauses decreases def-use distances, so make it less aggressive the lower num_waves is */ +#define VMEM_CLAUSE_MAX_GRAB_DIST ((ctx.num_waves - 1) * 8) #define POS_EXP_MAX_MOVES 512 namespace aco { -struct sched_ctx { +enum MoveResult { + move_success, + move_fail_ssa, + move_fail_rar, + move_fail_pressure, +}; + +struct MoveState { + RegisterDemand max_registers; + + Block *block; + Instruction *current; + RegisterDemand *register_demand; + bool improved_rar; + std::vector depends_on; + /* Two are needed because, for downwards VMEM scheduling, one needs to + * exclude the instructions in the clause, since new instructions in the + * clause are not moved past any other instructions in the clause. */ std::vector RAR_dependencies; - RegisterDemand max_registers; + std::vector RAR_dependencies_clause; + + int source_idx; + int insert_idx, insert_idx_clause; + RegisterDemand total_demand, total_demand_clause; + + /* for moving instructions before the current instruction to after it */ + void downwards_init(int current_idx, bool improved_rar, bool may_form_clauses); + MoveResult downwards_move(bool clause); + void downwards_skip(); + + /* for moving instructions after the first use of the current instruction upwards */ + void upwards_init(int source_idx, bool improved_rar); + bool upwards_check_deps(); + void upwards_set_insert_idx(int before); + MoveResult upwards_move(); + void upwards_skip(); + +private: + void downwards_advance_helper(); +}; + +struct sched_ctx { int16_t num_waves; int16_t last_SMEM_stall; int last_SMEM_dep_idx; + MoveState mv; }; /* This scheduler is a simple bottom-up pass based on ideas from @@ -58,118 +101,228 @@ struct sched_ctx { */ template -void move_element(T& list, size_t idx, size_t before) { +void move_element(T begin_it, size_t idx, size_t before) { if (idx < before) { - auto begin = std::next(list.begin(), idx); - auto end = std::next(list.begin(), before); + auto begin = std::next(begin_it, idx); + auto end = std::next(begin_it, before); std::rotate(begin, begin + 1, end); } else if (idx > before) { - auto begin = std::next(list.begin(), before); - auto end = std::next(list.begin(), idx + 1); + auto begin = std::next(begin_it, before); + auto end = std::next(begin_it, idx + 1); std::rotate(begin, end - 1, end); } } -static RegisterDemand getLiveChanges(aco_ptr& instr) +void MoveState::downwards_advance_helper() { - RegisterDemand changes; - for (const Definition& def : instr->definitions) { - if (!def.isTemp() || def.isKill()) - continue; - changes += def.getTemp(); + source_idx--; + total_demand.update(register_demand[source_idx]); +} + +void MoveState::downwards_init(int current_idx, bool improved_rar_, bool may_form_clauses) +{ + improved_rar = improved_rar_; + source_idx = current_idx; + + insert_idx = current_idx + 1; + insert_idx_clause = current_idx; + + total_demand = total_demand_clause = register_demand[current_idx]; + + std::fill(depends_on.begin(), depends_on.end(), false); + if (improved_rar) { + std::fill(RAR_dependencies.begin(), RAR_dependencies.end(), false); + if (may_form_clauses) + std::fill(RAR_dependencies_clause.begin(), RAR_dependencies_clause.end(), false); + } + + for (const Operand& op : current->operands) { + if (op.isTemp()) { + depends_on[op.tempId()] = true; + if (improved_rar && op.isFirstKill()) + RAR_dependencies[op.tempId()] = true; + } } + /* update total_demand/source_idx */ + downwards_advance_helper(); +} + +MoveResult MoveState::downwards_move(bool clause) +{ + aco_ptr& instr = block->instructions[source_idx]; + + for (const Definition& def : instr->definitions) + if (def.isTemp() && depends_on[def.tempId()]) + return move_fail_ssa; + + /* check if one of candidate's operands is killed by depending instruction */ + std::vector& RAR_deps = improved_rar ? (clause ? RAR_dependencies_clause : RAR_dependencies) : depends_on; for (const Operand& op : instr->operands) { - if (!op.isTemp() || !op.isFirstKill()) - continue; - changes -= op.getTemp(); + if (op.isTemp() && RAR_deps[op.tempId()]) { + // FIXME: account for difference in register pressure + return move_fail_rar; + } + } + + if (clause) { + for (const Operand& op : instr->operands) { + if (op.isTemp()) { + depends_on[op.tempId()] = true; + if (op.isFirstKill()) + RAR_dependencies[op.tempId()] = true; + } + } } - return changes; + int dest_insert_idx = clause ? insert_idx_clause : insert_idx; + RegisterDemand register_pressure = clause ? total_demand_clause : total_demand; + + const RegisterDemand candidate_diff = get_live_changes(instr); + const RegisterDemand temp = get_temp_registers(instr); + if (RegisterDemand(register_pressure - candidate_diff).exceeds(max_registers)) + return move_fail_pressure; + const RegisterDemand temp2 = get_temp_registers(block->instructions[dest_insert_idx - 1]); + const RegisterDemand new_demand = register_demand[dest_insert_idx - 1] - temp2 + temp; + if (new_demand.exceeds(max_registers)) + return move_fail_pressure; + + /* move the candidate below the memory load */ + move_element(block->instructions.begin(), source_idx, dest_insert_idx); + + /* update register pressure */ + move_element(register_demand, source_idx, dest_insert_idx); + for (int i = source_idx; i < dest_insert_idx - 1; i++) + register_demand[i] -= candidate_diff; + register_demand[dest_insert_idx - 1] = new_demand; + total_demand_clause -= candidate_diff; + insert_idx_clause--; + if (!clause) { + total_demand -= candidate_diff; + insert_idx--; + } + + downwards_advance_helper(); + return move_success; } -static RegisterDemand getTempRegisters(aco_ptr& instr) +void MoveState::downwards_skip() { - RegisterDemand temp_registers; - for (const Definition& def : instr->definitions) { - if (!def.isTemp() || !def.isKill()) - continue; - temp_registers += def.getTemp(); + aco_ptr& instr = block->instructions[source_idx]; + + for (const Operand& op : instr->operands) { + if (op.isTemp()) { + depends_on[op.tempId()] = true; + if (improved_rar && op.isFirstKill()) { + RAR_dependencies[op.tempId()] = true; + RAR_dependencies_clause[op.tempId()] = true; + } + } + } + total_demand_clause.update(register_demand[source_idx]); + + downwards_advance_helper(); +} + +void MoveState::upwards_init(int source_idx_, bool improved_rar_) +{ + source_idx = source_idx_; + improved_rar = improved_rar_; + + insert_idx = -1; + + std::fill(depends_on.begin(), depends_on.end(), false); + std::fill(RAR_dependencies.begin(), RAR_dependencies.end(), false); + + for (const Definition& def : current->definitions) { + if (def.isTemp()) + depends_on[def.tempId()] = true; } - return temp_registers; } -static bool is_spill_reload(aco_ptr& instr) +bool MoveState::upwards_check_deps() { - return instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload; + aco_ptr& instr = block->instructions[source_idx]; + for (const Operand& op : instr->operands) { + if (op.isTemp() && depends_on[op.tempId()]) + return false; + } + return true; } -bool can_move_instr(aco_ptr& instr, Instruction* current, int moving_interaction) +void MoveState::upwards_set_insert_idx(int before) { - /* don't move exports so that they stay closer together */ - if (instr->format == Format::EXP) - return false; + insert_idx = before; + total_demand = register_demand[before - 1]; +} - /* don't move s_memtime/s_memrealtime */ - if (instr->opcode == aco_opcode::s_memtime || instr->opcode == aco_opcode::s_memrealtime) - return false; - - /* handle barriers */ - - /* TODO: instead of stopping, maybe try to move the barriers and any - * instructions interacting with them instead? */ - if (instr->format != Format::PSEUDO_BARRIER) { - if (instr->opcode == aco_opcode::s_barrier) { - bool can_reorder = false; - switch (current->format) { - case Format::SMEM: - can_reorder = static_cast(current)->can_reorder; - break; - case Format::MUBUF: - can_reorder = static_cast(current)->can_reorder; - break; - case Format::MIMG: - can_reorder = static_cast(current)->can_reorder; - break; - default: - break; - } - return can_reorder && moving_interaction == barrier_none; - } else { - return true; - } +MoveResult MoveState::upwards_move() +{ + assert(insert_idx >= 0); + + aco_ptr& instr = block->instructions[source_idx]; + for (const Operand& op : instr->operands) { + if (op.isTemp() && depends_on[op.tempId()]) + return move_fail_ssa; } - int interaction = get_barrier_interaction(current); - interaction |= moving_interaction; - - switch (instr->opcode) { - case aco_opcode::p_memory_barrier_atomic: - return !(interaction & barrier_atomic); - /* For now, buffer and image barriers are treated the same. this is because of - * dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.device.payload_nonlocal.buffer.guard_nonlocal.image.comp - * which seems to use an image load to determine if the result of a buffer load is valid. So the ordering of the two loads is important. - * I /think/ we should probably eventually expand the meaning of a buffer barrier so that all buffer operations before it, must stay before it - * and that both image and buffer operations after it, must stay after it. We should also do the same for image barriers. - * Or perhaps the problem is that we don't have a combined barrier instruction for both buffers and images, but the CTS test expects us to? - * Either way, this solution should work. */ - case aco_opcode::p_memory_barrier_buffer: - case aco_opcode::p_memory_barrier_image: - return !(interaction & (barrier_image | barrier_buffer)); - case aco_opcode::p_memory_barrier_shared: - return !(interaction & barrier_shared); - case aco_opcode::p_memory_barrier_all: - return interaction == barrier_none; - default: - return false; + /* check if candidate uses/kills an operand which is used by a dependency */ + for (const Operand& op : instr->operands) { + if (op.isTemp() && (!improved_rar || op.isFirstKill()) && RAR_dependencies[op.tempId()]) + return move_fail_rar; + } + + /* check if register pressure is low enough: the diff is negative if register pressure is decreased */ + const RegisterDemand candidate_diff = get_live_changes(instr); + const RegisterDemand temp = get_temp_registers(instr); + if (RegisterDemand(total_demand + candidate_diff).exceeds(max_registers)) + return move_fail_pressure; + const RegisterDemand temp2 = get_temp_registers(block->instructions[insert_idx - 1]); + const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + candidate_diff + temp; + if (new_demand.exceeds(max_registers)) + return move_fail_pressure; + + /* move the candidate above the insert_idx */ + move_element(block->instructions.begin(), source_idx, insert_idx); + + /* update register pressure */ + move_element(register_demand, source_idx, insert_idx); + for (int i = insert_idx + 1; i <= source_idx; i++) + register_demand[i] += candidate_diff; + register_demand[insert_idx] = new_demand; + total_demand += candidate_diff; + + insert_idx++; + + total_demand.update(register_demand[source_idx]); + source_idx++; + + return move_success; +} + +void MoveState::upwards_skip() +{ + if (insert_idx >= 0) { + aco_ptr& instr = block->instructions[source_idx]; + for (const Definition& def : instr->definitions) { + if (def.isTemp()) + depends_on[def.tempId()] = true; + } + for (const Operand& op : instr->operands) { + if (op.isTemp()) + RAR_dependencies[op.tempId()] = true; + } + total_demand.update(register_demand[source_idx]); } + + source_idx++; } -bool can_reorder(Instruction* candidate, bool allow_smem) +bool can_reorder(Instruction* candidate) { switch (candidate->format) { case Format::SMEM: - return allow_smem || static_cast(candidate)->can_reorder; + return static_cast(candidate)->can_reorder; case Format::MUBUF: return static_cast(candidate)->can_reorder; case Format::MIMG: @@ -179,12 +332,182 @@ bool can_reorder(Instruction* candidate, bool allow_smem) case Format::FLAT: case Format::GLOBAL: case Format::SCRATCH: - return false; + return static_cast(candidate)->can_reorder; default: return true; } } +bool is_gs_or_done_sendmsg(const Instruction *instr) +{ + if (instr->opcode == aco_opcode::s_sendmsg) { + uint16_t imm = static_cast(instr)->imm; + return (imm & sendmsg_id_mask) == _sendmsg_gs || + (imm & sendmsg_id_mask) == _sendmsg_gs_done; + } + return false; +} + +bool is_done_sendmsg(const Instruction *instr) +{ + if (instr->opcode == aco_opcode::s_sendmsg) { + uint16_t imm = static_cast(instr)->imm; + return (imm & sendmsg_id_mask) == _sendmsg_gs_done; + } + return false; +} + +barrier_interaction get_barrier_interaction(const Instruction* instr) +{ + switch (instr->format) { + case Format::SMEM: + return static_cast(instr)->barrier; + case Format::MUBUF: + return static_cast(instr)->barrier; + case Format::MIMG: + return static_cast(instr)->barrier; + case Format::MTBUF: + return static_cast(instr)->barrier; + case Format::FLAT: + case Format::GLOBAL: + case Format::SCRATCH: + return static_cast(instr)->barrier; + case Format::DS: + return barrier_shared; + case Format::SOPP: + if (is_done_sendmsg(instr)) + return (barrier_interaction)(barrier_gs_data | barrier_gs_sendmsg); + else if (is_gs_or_done_sendmsg(instr)) + return barrier_gs_sendmsg; + else + return barrier_none; + case Format::PSEUDO_BARRIER: + return barrier_barrier; + default: + return barrier_none; + } +} + +barrier_interaction parse_barrier(Instruction *instr) +{ + if (instr->format == Format::PSEUDO_BARRIER) { + switch (instr->opcode) { + case aco_opcode::p_memory_barrier_atomic: + return barrier_atomic; + /* For now, buffer and image barriers are treated the same. this is because of + * dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.device.payload_nonlocal.buffer.guard_nonlocal.image.comp + * which seems to use an image load to determine if the result of a buffer load is valid. So the ordering of the two loads is important. + * I /think/ we should probably eventually expand the meaning of a buffer barrier so that all buffer operations before it, must stay before it + * and that both image and buffer operations after it, must stay after it. We should also do the same for image barriers. + * Or perhaps the problem is that we don't have a combined barrier instruction for both buffers and images, but the CTS test expects us to? + * Either way, this solution should work. */ + case aco_opcode::p_memory_barrier_buffer: + case aco_opcode::p_memory_barrier_image: + return (barrier_interaction)(barrier_image | barrier_buffer); + case aco_opcode::p_memory_barrier_shared: + return barrier_shared; + case aco_opcode::p_memory_barrier_common: + return (barrier_interaction)(barrier_image | barrier_buffer | barrier_shared | barrier_atomic); + case aco_opcode::p_memory_barrier_gs_data: + return barrier_gs_data; + case aco_opcode::p_memory_barrier_gs_sendmsg: + return barrier_gs_sendmsg; + default: + break; + } + } else if (instr->opcode == aco_opcode::s_barrier) { + return (barrier_interaction)(barrier_barrier | barrier_image | barrier_buffer | barrier_shared | barrier_atomic); + } + return barrier_none; +} + +struct hazard_query { + bool contains_spill; + int barriers; + int barrier_interaction; + bool can_reorder_vmem; + bool can_reorder_smem; +}; + +void init_hazard_query(hazard_query *query) { + query->contains_spill = false; + query->barriers = 0; + query->barrier_interaction = 0; + query->can_reorder_vmem = true; + query->can_reorder_smem = true; +} + +void add_to_hazard_query(hazard_query *query, Instruction *instr) +{ + query->barriers |= parse_barrier(instr); + query->barrier_interaction |= get_barrier_interaction(instr); + if (instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload) + query->contains_spill = true; + + bool can_reorder_instr = can_reorder(instr); + query->can_reorder_smem &= instr->format != Format::SMEM || can_reorder_instr; + query->can_reorder_vmem &= !(instr->isVMEM() || instr->isFlatOrGlobal()) || can_reorder_instr; +} + +enum HazardResult { + hazard_success, + hazard_fail_reorder_vmem_smem, + hazard_fail_reorder_ds, + hazard_fail_reorder_sendmsg, + hazard_fail_spill, + hazard_fail_export, + hazard_fail_barrier, + /* Must stop at these failures. The hazard query code doesn't consider them + * when added. */ + hazard_fail_exec, + hazard_fail_unreorderable, +}; + +HazardResult perform_hazard_query(hazard_query *query, Instruction *instr) +{ + bool can_reorder_candidate = can_reorder(instr); + + if (instr->opcode == aco_opcode::p_exit_early_if) + return hazard_fail_exec; + for (const Definition& def : instr->definitions) { + if (def.isFixed() && def.physReg() == exec) + return hazard_fail_exec; + } + + /* don't move exports so that they stay closer together */ + if (instr->format == Format::EXP) + return hazard_fail_export; + + /* don't move non-reorderable instructions */ + if (instr->opcode == aco_opcode::s_memtime || + instr->opcode == aco_opcode::s_memrealtime || + instr->opcode == aco_opcode::s_setprio) + return hazard_fail_unreorderable; + + barrier_interaction bar = parse_barrier(instr); + if (query->barrier_interaction && (query->barrier_interaction & bar)) + return hazard_fail_barrier; + if (bar && query->barriers && (query->barriers & ~bar)) + return hazard_fail_barrier; + if (query->barriers && (query->barriers & get_barrier_interaction(instr))) + return hazard_fail_barrier; + + if (!query->can_reorder_smem && instr->format == Format::SMEM && !can_reorder_candidate) + return hazard_fail_reorder_vmem_smem; + if (!query->can_reorder_vmem && (instr->isVMEM() || instr->isFlatOrGlobal()) && !can_reorder_candidate) + return hazard_fail_reorder_vmem_smem; + if ((query->barrier_interaction & barrier_shared) && instr->format == Format::DS) + return hazard_fail_reorder_ds; + if (is_gs_or_done_sendmsg(instr) && (query->barrier_interaction & get_barrier_interaction(instr))) + return hazard_fail_reorder_sendmsg; + + if ((instr->opcode == aco_opcode::p_spill || instr->opcode == aco_opcode::p_reload) && + query->contains_spill) + return hazard_fail_spill; + + return hazard_success; +} + void schedule_SMEM(sched_ctx& ctx, Block* block, std::vector& register_demand, Instruction* current, int idx) @@ -193,29 +516,21 @@ void schedule_SMEM(sched_ctx& ctx, Block* block, int window_size = SMEM_WINDOW_SIZE; int max_moves = SMEM_MAX_MOVES; int16_t k = 0; - bool can_reorder_cur = can_reorder(current, false); /* don't move s_memtime/s_memrealtime */ if (current->opcode == aco_opcode::s_memtime || current->opcode == aco_opcode::s_memrealtime) return; - /* create the initial set of values which current depends on */ - std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false); - for (const Operand& op : current->operands) { - if (op.isTemp()) - ctx.depends_on[op.tempId()] = true; - } - - /* maintain how many registers remain free when moving instructions */ - RegisterDemand register_pressure = register_demand[idx]; - /* first, check if we have instructions before current to move down */ - int insert_idx = idx + 1; - int moving_interaction = barrier_none; - bool moving_spill = false; + hazard_query hq; + init_hazard_query(&hq); + add_to_hazard_query(&hq, current); + + ctx.mv.downwards_init(idx, false, false); for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) { assert(candidate_idx >= 0); + assert(candidate_idx == ctx.mv.source_idx); aco_ptr& candidate = block->instructions[candidate_idx]; /* break if we'd make the previous SMEM instruction stall */ @@ -224,200 +539,102 @@ void schedule_SMEM(sched_ctx& ctx, Block* block, break; /* break when encountering another MEM instruction, logical_start or barriers */ - if (!can_reorder(candidate.get(), false) && !can_reorder_cur) - break; if (candidate->opcode == aco_opcode::p_logical_start) break; - if (candidate->opcode == aco_opcode::p_exit_early_if) - break; - if (!can_move_instr(candidate, current, moving_interaction)) + if (candidate->isVMEM()) break; - register_pressure.update(register_demand[candidate_idx]); - /* if current depends on candidate, add additional dependencies and continue */ bool can_move_down = true; - bool writes_exec = false; - for (const Definition& def : candidate->definitions) { - if (def.isTemp() && ctx.depends_on[def.tempId()]) - can_move_down = false; - if (def.isFixed() && def.physReg() == exec) - writes_exec = true; - } - if (writes_exec) - break; - if (moving_spill && is_spill_reload(candidate)) - can_move_down = false; - if ((moving_interaction & barrier_shared) && candidate->format == Format::DS) + HazardResult haz = perform_hazard_query(&hq, candidate.get()); + if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill || haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier || haz == hazard_fail_export) can_move_down = false; - moving_interaction |= get_barrier_interaction(candidate.get()); - moving_spill |= is_spill_reload(candidate); - if (!can_move_down) { - for (const Operand& op : candidate->operands) { - if (op.isTemp()) - ctx.depends_on[op.tempId()] = true; - } - continue; - } + else if (haz != hazard_success) + break; - bool register_pressure_unknown = false; - /* check if one of candidate's operands is killed by depending instruction */ - for (const Operand& op : candidate->operands) { - if (op.isTemp() && ctx.depends_on[op.tempId()]) { - // FIXME: account for difference in register pressure - register_pressure_unknown = true; - } - } - if (register_pressure_unknown) { - for (const Operand& op : candidate->operands) { - if (op.isTemp()) - ctx.depends_on[op.tempId()] = true; - } + /* don't use LDS/GDS instructions to hide latency since it can + * significanly worsen LDS scheduling */ + if (candidate->format == Format::DS || !can_move_down) { + add_to_hazard_query(&hq, candidate.get()); + ctx.mv.downwards_skip(); continue; } - /* check if register pressure is low enough: the diff is negative if register pressure is decreased */ - const RegisterDemand candidate_diff = getLiveChanges(candidate); - const RegisterDemand tempDemand = getTempRegisters(candidate); - if (RegisterDemand(register_pressure - candidate_diff).exceeds(ctx.max_registers)) - break; - const RegisterDemand tempDemand2 = getTempRegisters(block->instructions[insert_idx - 1]); - const RegisterDemand new_demand = register_demand[insert_idx - 1] - tempDemand2 + tempDemand; - if (new_demand.exceeds(ctx.max_registers)) + MoveResult res = ctx.mv.downwards_move(false); + if (res == move_fail_ssa || res == move_fail_rar) { + add_to_hazard_query(&hq, candidate.get()); + ctx.mv.downwards_skip(); + continue; + } else if (res == move_fail_pressure) { break; - // TODO: we might want to look further to find a sequence of instructions to move down which doesn't exceed reg pressure - - /* move the candidate below the memory load */ - move_element(block->instructions, candidate_idx, insert_idx); - - /* update register pressure */ - move_element(register_demand, candidate_idx, insert_idx); - for (int i = candidate_idx; i < insert_idx - 1; i++) { - register_demand[i] -= candidate_diff; } - register_demand[insert_idx - 1] = new_demand; - register_pressure -= candidate_diff; if (candidate_idx < ctx.last_SMEM_dep_idx) ctx.last_SMEM_stall++; - insert_idx--; k++; } - /* create the initial set of values which depend on current */ - std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false); - std::fill(ctx.RAR_dependencies.begin(), ctx.RAR_dependencies.end(), false); - for (const Definition& def : current->definitions) { - if (def.isTemp()) - ctx.depends_on[def.tempId()] = true; - } - /* find the first instruction depending on current or find another MEM */ - insert_idx = idx + 1; - moving_interaction = barrier_none; - moving_spill = false; + ctx.mv.upwards_init(idx + 1, false); bool found_dependency = false; /* second, check if we have instructions after current to move up */ for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) { + assert(candidate_idx == ctx.mv.source_idx); assert(candidate_idx < (int) block->instructions.size()); aco_ptr& candidate = block->instructions[candidate_idx]; if (candidate->opcode == aco_opcode::p_logical_end) break; - if (!can_move_instr(candidate, current, moving_interaction)) - break; - const bool writes_exec = std::any_of(candidate->definitions.begin(), candidate->definitions.end(), - [](const Definition& def) { return def.isFixed() && def.physReg() == exec;}); - if (writes_exec) + /* check if candidate depends on current */ + bool is_dependency = !found_dependency && !ctx.mv.upwards_check_deps(); + /* no need to steal from following VMEM instructions */ + if (is_dependency && candidate->isVMEM()) break; - /* check if candidate depends on current */ - bool is_dependency = std::any_of(candidate->operands.begin(), candidate->operands.end(), - [&ctx](const Operand& op) { return op.isTemp() && ctx.depends_on[op.tempId()];}); - if (moving_spill && is_spill_reload(candidate)) - is_dependency = true; - if ((moving_interaction & barrier_shared) && candidate->format == Format::DS) - is_dependency = true; - moving_interaction |= get_barrier_interaction(candidate.get()); - moving_spill |= is_spill_reload(candidate); + if (found_dependency) { + HazardResult haz = perform_hazard_query(&hq, candidate.get()); + if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill || + haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier || + haz == hazard_fail_export) + is_dependency = true; + else if (haz != hazard_success) + break; + } + if (is_dependency) { - for (const Definition& def : candidate->definitions) { - if (def.isTemp()) - ctx.depends_on[def.tempId()] = true; - } - for (const Operand& op : candidate->operands) { - if (op.isTemp()) - ctx.RAR_dependencies[op.tempId()] = true; - } if (!found_dependency) { - insert_idx = candidate_idx; + ctx.mv.upwards_set_insert_idx(candidate_idx); + init_hazard_query(&hq); found_dependency = true; - /* init register pressure */ - register_pressure = register_demand[insert_idx - 1]; } } - if (!can_reorder(candidate.get(), false) && !can_reorder_cur) - break; - - if (!found_dependency) { - k++; + if (is_dependency || !found_dependency) { + if (found_dependency) + add_to_hazard_query(&hq, candidate.get()); + else + k++; + ctx.mv.upwards_skip(); continue; } - /* update register pressure */ - register_pressure.update(register_demand[candidate_idx - 1]); - - if (is_dependency) - continue; - assert(insert_idx != idx); - - // TODO: correctly calculate register pressure for this case - bool register_pressure_unknown = false; - /* check if candidate uses/kills an operand which is used by a dependency */ - for (const Operand& op : candidate->operands) { - if (op.isTemp() && ctx.RAR_dependencies[op.tempId()]) - register_pressure_unknown = true; - } - if (register_pressure_unknown) { - for (const Definition& def : candidate->definitions) { - if (def.isTemp()) - ctx.RAR_dependencies[def.tempId()] = true; - } - for (const Operand& op : candidate->operands) { - if (op.isTemp()) - ctx.RAR_dependencies[op.tempId()] = true; - } + MoveResult res = ctx.mv.upwards_move(); + if (res == move_fail_ssa || res == move_fail_rar) { + /* no need to steal from following VMEM instructions */ + if (res == move_fail_ssa && candidate->isVMEM()) + break; + add_to_hazard_query(&hq, candidate.get()); + ctx.mv.upwards_skip(); continue; - } - - /* check if register pressure is low enough: the diff is negative if register pressure is decreased */ - const RegisterDemand candidate_diff = getLiveChanges(candidate); - const RegisterDemand temp = getTempRegisters(candidate); - if (RegisterDemand(register_pressure + candidate_diff).exceeds(ctx.max_registers)) + } else if (res == move_fail_pressure) { break; - const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]); - const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + candidate_diff + temp; - if (new_demand.exceeds(ctx.max_registers)) - break; - - /* move the candidate above the insert_idx */ - move_element(block->instructions, candidate_idx, insert_idx); - - /* update register pressure */ - move_element(register_demand, candidate_idx, insert_idx); - for (int i = insert_idx + 1; i <= candidate_idx; i++) { - register_demand[i] += candidate_diff; } - register_demand[insert_idx] = new_demand; - register_pressure += candidate_diff; - insert_idx++; k++; } - ctx.last_SMEM_dep_idx = found_dependency ? insert_idx : 0; + ctx.last_SMEM_dep_idx = found_dependency ? ctx.mv.insert_idx : 0; ctx.last_SMEM_stall = 10 - ctx.num_waves - k; } @@ -428,220 +645,132 @@ void schedule_VMEM(sched_ctx& ctx, Block* block, assert(idx != 0); int window_size = VMEM_WINDOW_SIZE; int max_moves = VMEM_MAX_MOVES; + int clause_max_grab_dist = VMEM_CLAUSE_MAX_GRAB_DIST; int16_t k = 0; - bool can_reorder_cur = can_reorder(current, false); - - /* create the initial set of values which current depends on */ - std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false); - for (const Operand& op : current->operands) { - if (op.isTemp()) - ctx.depends_on[op.tempId()] = true; - } - - /* maintain how many registers remain free when moving instructions */ - RegisterDemand register_pressure = register_demand[idx]; /* first, check if we have instructions before current to move down */ - int insert_idx = idx + 1; - int moving_interaction = barrier_none; - bool moving_spill = false; + hazard_query indep_hq; + hazard_query clause_hq; + init_hazard_query(&indep_hq); + init_hazard_query(&clause_hq); + add_to_hazard_query(&indep_hq, current); + + ctx.mv.downwards_init(idx, true, true); for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) { + assert(candidate_idx == ctx.mv.source_idx); assert(candidate_idx >= 0); aco_ptr& candidate = block->instructions[candidate_idx]; + bool is_vmem = candidate->isVMEM() || candidate->isFlatOrGlobal(); /* break when encountering another VMEM instruction, logical_start or barriers */ - if (!can_reorder(candidate.get(), true) && !can_reorder_cur) - break; if (candidate->opcode == aco_opcode::p_logical_start) break; - if (candidate->opcode == aco_opcode::p_exit_early_if) - break; - if (!can_move_instr(candidate, current, moving_interaction)) - break; /* break if we'd make the previous SMEM instruction stall */ bool can_stall_prev_smem = idx <= ctx.last_SMEM_dep_idx && candidate_idx < ctx.last_SMEM_dep_idx; if (can_stall_prev_smem && ctx.last_SMEM_stall >= 0) break; - register_pressure.update(register_demand[candidate_idx]); - /* if current depends on candidate, add additional dependencies and continue */ - bool can_move_down = true; - bool writes_exec = false; - for (const Definition& def : candidate->definitions) { - if (def.isTemp() && ctx.depends_on[def.tempId()]) - can_move_down = false; - if (def.isFixed() && def.physReg() == exec) - writes_exec = true; + bool part_of_clause = false; + if (current->isVMEM() == candidate->isVMEM()) { + bool same_resource = true; + if (current->isVMEM()) + same_resource = candidate->operands[0].tempId() == current->operands[0].tempId(); + int grab_dist = ctx.mv.insert_idx_clause - candidate_idx; + /* We can't easily tell how much this will decrease the def-to-use + * distances, so just use how far it will be moved as a heuristic. */ + part_of_clause = same_resource && grab_dist < clause_max_grab_dist; } - if (writes_exec) - break; - if (moving_spill && is_spill_reload(candidate)) - can_move_down = false; - if ((moving_interaction & barrier_shared) && candidate->format == Format::DS) + /* if current depends on candidate, add additional dependencies and continue */ + bool can_move_down = !is_vmem || part_of_clause; + + HazardResult haz = perform_hazard_query(part_of_clause ? &clause_hq : &indep_hq, candidate.get()); + if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill || + haz == hazard_fail_reorder_sendmsg || haz == hazard_fail_barrier || + haz == hazard_fail_export) can_move_down = false; - moving_interaction |= get_barrier_interaction(candidate.get()); - moving_spill |= is_spill_reload(candidate); + else if (haz != hazard_success) + break; + if (!can_move_down) { - for (const Operand& op : candidate->operands) { - if (op.isTemp()) - ctx.depends_on[op.tempId()] = true; - } + add_to_hazard_query(&indep_hq, candidate.get()); + add_to_hazard_query(&clause_hq, candidate.get()); + ctx.mv.downwards_skip(); continue; } - bool register_pressure_unknown = false; - /* check if one of candidate's operands is killed by depending instruction */ - for (const Operand& op : candidate->operands) { - if (op.isTemp() && ctx.depends_on[op.tempId()]) { - // FIXME: account for difference in register pressure - register_pressure_unknown = true; - } - } - if (register_pressure_unknown) { - for (const Operand& op : candidate->operands) { - if (op.isTemp()) - ctx.depends_on[op.tempId()] = true; - } + MoveResult res = ctx.mv.downwards_move(part_of_clause); + if (res == move_fail_ssa || res == move_fail_rar) { + add_to_hazard_query(&indep_hq, candidate.get()); + add_to_hazard_query(&clause_hq, candidate.get()); + ctx.mv.downwards_skip(); continue; - } - - /* check if register pressure is low enough: the diff is negative if register pressure is decreased */ - const RegisterDemand candidate_diff = getLiveChanges(candidate); - const RegisterDemand temp = getTempRegisters(candidate);; - if (RegisterDemand(register_pressure - candidate_diff).exceeds(ctx.max_registers)) - break; - const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]); - const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + temp; - if (new_demand.exceeds(ctx.max_registers)) + } else if (res == move_fail_pressure) { break; - // TODO: we might want to look further to find a sequence of instructions to move down which doesn't exceed reg pressure - - /* move the candidate below the memory load */ - move_element(block->instructions, candidate_idx, insert_idx); - - /* update register pressure */ - move_element(register_demand, candidate_idx, insert_idx); - for (int i = candidate_idx; i < insert_idx - 1; i++) { - register_demand[i] -= candidate_diff; } - register_demand[insert_idx - 1] = new_demand; - register_pressure -= candidate_diff; - insert_idx--; k++; if (candidate_idx < ctx.last_SMEM_dep_idx) ctx.last_SMEM_stall++; } - /* create the initial set of values which depend on current */ - std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false); - std::fill(ctx.RAR_dependencies.begin(), ctx.RAR_dependencies.end(), false); - for (const Definition& def : current->definitions) { - if (def.isTemp()) - ctx.depends_on[def.tempId()] = true; - } - /* find the first instruction depending on current or find another VMEM */ - insert_idx = idx; - moving_interaction = barrier_none; - moving_spill = false; + ctx.mv.upwards_init(idx + 1, true); bool found_dependency = false; /* second, check if we have instructions after current to move up */ for (int candidate_idx = idx + 1; k < max_moves && candidate_idx < (int) idx + window_size; candidate_idx++) { + assert(candidate_idx == ctx.mv.source_idx); assert(candidate_idx < (int) block->instructions.size()); aco_ptr& candidate = block->instructions[candidate_idx]; + bool is_vmem = candidate->isVMEM() || candidate->isFlatOrGlobal(); if (candidate->opcode == aco_opcode::p_logical_end) break; - if (!can_move_instr(candidate, current, moving_interaction)) - break; - - const bool writes_exec = std::any_of(candidate->definitions.begin(), candidate->definitions.end(), - [](const Definition& def) {return def.isFixed() && def.physReg() == exec; }); - if (writes_exec) - break; /* check if candidate depends on current */ - bool is_dependency = !can_reorder(candidate.get(), true) && !can_reorder_cur; - for (const Operand& op : candidate->operands) { - if (op.isTemp() && ctx.depends_on[op.tempId()]) { + bool is_dependency = false; + if (found_dependency) { + HazardResult haz = perform_hazard_query(&indep_hq, candidate.get()); + if (haz == hazard_fail_reorder_ds || haz == hazard_fail_spill || + haz == hazard_fail_reorder_vmem_smem || haz == hazard_fail_reorder_sendmsg || + haz == hazard_fail_barrier || haz == hazard_fail_export) is_dependency = true; + else if (haz != hazard_success) break; - } } - if (moving_spill && is_spill_reload(candidate)) - is_dependency = true; - if ((moving_interaction & barrier_shared) && candidate->format == Format::DS) - is_dependency = true; - moving_interaction |= get_barrier_interaction(candidate.get()); - moving_spill |= is_spill_reload(candidate); + + is_dependency |= !found_dependency && !ctx.mv.upwards_check_deps(); if (is_dependency) { - for (const Definition& def : candidate->definitions) { - if (def.isTemp()) - ctx.depends_on[def.tempId()] = true; - } - for (const Operand& op : candidate->operands) { - if (op.isTemp()) - ctx.RAR_dependencies[op.tempId()] = true; - } if (!found_dependency) { - insert_idx = candidate_idx; + ctx.mv.upwards_set_insert_idx(candidate_idx); + init_hazard_query(&indep_hq); found_dependency = true; - /* init register pressure */ - register_pressure = register_demand[insert_idx - 1]; - continue; } - } - - /* update register pressure */ - register_pressure.update(register_demand[candidate_idx - 1]); - - if (is_dependency || !found_dependency) - continue; - assert(insert_idx != idx); - - bool register_pressure_unknown = false; - /* check if candidate uses/kills an operand which is used by a dependency */ - for (const Operand& op : candidate->operands) { - if (op.isTemp() && ctx.RAR_dependencies[op.tempId()]) - register_pressure_unknown = true; - } - if (register_pressure_unknown) { + } else if (is_vmem) { + /* don't move up dependencies of other VMEM instructions */ for (const Definition& def : candidate->definitions) { if (def.isTemp()) - ctx.RAR_dependencies[def.tempId()] = true; - } - for (const Operand& op : candidate->operands) { - if (op.isTemp()) - ctx.RAR_dependencies[op.tempId()] = true; + ctx.mv.depends_on[def.tempId()] = true; } + } + + if (is_dependency || !found_dependency) { + if (found_dependency) + add_to_hazard_query(&indep_hq, candidate.get()); + ctx.mv.upwards_skip(); continue; } - /* check if register pressure is low enough: the diff is negative if register pressure is decreased */ - const RegisterDemand candidate_diff = getLiveChanges(candidate); - const RegisterDemand temp = getTempRegisters(candidate); - if (RegisterDemand(register_pressure + candidate_diff).exceeds(ctx.max_registers)) - break; - const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]); - const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + candidate_diff + temp; - if (new_demand.exceeds(ctx.max_registers)) + MoveResult res = ctx.mv.upwards_move(); + if (res == move_fail_ssa || res == move_fail_rar) { + add_to_hazard_query(&indep_hq, candidate.get()); + ctx.mv.upwards_skip(); + continue; + } else if (res == move_fail_pressure) { break; - - /* move the candidate above the insert_idx */ - move_element(block->instructions, candidate_idx, insert_idx); - - /* update register pressure */ - move_element(register_demand, candidate_idx, insert_idx); - for (int i = insert_idx + 1; i <= candidate_idx; i++) { - register_demand[i] += candidate_diff; } - register_demand[insert_idx] = new_demand; - register_pressure += candidate_diff; - insert_idx++; k++; } } @@ -655,101 +784,39 @@ void schedule_position_export(sched_ctx& ctx, Block* block, int max_moves = POS_EXP_MAX_MOVES; int16_t k = 0; - /* create the initial set of values which current depends on */ - std::fill(ctx.depends_on.begin(), ctx.depends_on.end(), false); - for (unsigned i = 0; i < current->operands.size(); i++) { - if (current->operands[i].isTemp()) - ctx.depends_on[current->operands[i].tempId()] = true; - } - - /* maintain how many registers remain free when moving instructions */ - RegisterDemand register_pressure = register_demand[idx]; + ctx.mv.downwards_init(idx, true, false); - /* first, check if we have instructions before current to move down */ - int insert_idx = idx + 1; - int moving_interaction = barrier_none; - bool moving_spill = false; + hazard_query hq; + init_hazard_query(&hq); + add_to_hazard_query(&hq, current); for (int candidate_idx = idx - 1; k < max_moves && candidate_idx > (int) idx - window_size; candidate_idx--) { assert(candidate_idx >= 0); aco_ptr& candidate = block->instructions[candidate_idx]; - /* break when encountering logical_start or barriers */ if (candidate->opcode == aco_opcode::p_logical_start) break; - if (candidate->opcode == aco_opcode::p_exit_early_if) - break; - if (candidate->isVMEM() || candidate->format == Format::SMEM) - break; - if (!can_move_instr(candidate, current, moving_interaction)) + if (candidate->isVMEM() || candidate->format == Format::SMEM || candidate->isFlatOrGlobal()) break; - register_pressure.update(register_demand[candidate_idx]); - - /* if current depends on candidate, add additional dependencies and continue */ - bool can_move_down = true; - bool writes_exec = false; - for (unsigned i = 0; i < candidate->definitions.size(); i++) { - if (candidate->definitions[i].isTemp() && ctx.depends_on[candidate->definitions[i].tempId()]) - can_move_down = false; - if (candidate->definitions[i].isFixed() && candidate->definitions[i].physReg() == exec) - writes_exec = true; - } - if (writes_exec) + HazardResult haz = perform_hazard_query(&hq, candidate.get()); + if (haz == hazard_fail_exec || haz == hazard_fail_unreorderable) break; - if (moving_spill && is_spill_reload(candidate)) - can_move_down = false; - if ((moving_interaction & barrier_shared) && candidate->format == Format::DS) - can_move_down = false; - moving_interaction |= get_barrier_interaction(candidate.get()); - moving_spill |= is_spill_reload(candidate); - if (!can_move_down) { - for (unsigned i = 0; i < candidate->operands.size(); i++) { - if (candidate->operands[i].isTemp()) - ctx.depends_on[candidate->operands[i].tempId()] = true; - } + if (haz != hazard_success) { + add_to_hazard_query(&hq, candidate.get()); + ctx.mv.downwards_skip(); continue; } - bool register_pressure_unknown = false; - /* check if one of candidate's operands is killed by depending instruction */ - for (unsigned i = 0; i < candidate->operands.size(); i++) { - if (candidate->operands[i].isTemp() && ctx.depends_on[candidate->operands[i].tempId()]) { - // FIXME: account for difference in register pressure - register_pressure_unknown = true; - } - } - if (register_pressure_unknown) { - for (unsigned i = 0; i < candidate->operands.size(); i++) { - if (candidate->operands[i].isTemp()) - ctx.depends_on[candidate->operands[i].tempId()] = true; - } + MoveResult res = ctx.mv.downwards_move(false); + if (res == move_fail_ssa || res == move_fail_rar) { + add_to_hazard_query(&hq, candidate.get()); + ctx.mv.downwards_skip(); continue; - } - - /* check if register pressure is low enough: the diff is negative if register pressure is decreased */ - const RegisterDemand candidate_diff = getLiveChanges(candidate); - const RegisterDemand temp = getTempRegisters(candidate);; - if (RegisterDemand(register_pressure - candidate_diff).exceeds(ctx.max_registers)) - break; - const RegisterDemand temp2 = getTempRegisters(block->instructions[insert_idx - 1]); - const RegisterDemand new_demand = register_demand[insert_idx - 1] - temp2 + temp; - if (new_demand.exceeds(ctx.max_registers)) + } else if (res == move_fail_pressure) { break; - // TODO: we might want to look further to find a sequence of instructions to move down which doesn't exceed reg pressure - - /* move the candidate below the export */ - move_element(block->instructions, candidate_idx, insert_idx); - - /* update register pressure */ - move_element(register_demand, candidate_idx, insert_idx); - for (int i = candidate_idx; i < insert_idx - 1; i++) { - register_demand[i] -= candidate_diff; } - register_demand[insert_idx - 1] = new_demand; - register_pressure -= candidate_diff; - insert_idx--; k++; } } @@ -758,6 +825,8 @@ void schedule_block(sched_ctx& ctx, Program *program, Block* block, live& live_v { ctx.last_SMEM_dep_idx = 0; ctx.last_SMEM_stall = INT16_MIN; + ctx.mv.block = block; + ctx.mv.register_demand = live_vars.register_demand[block->index].data(); /* go through all instructions and find memory loads */ for (unsigned idx = 0; idx < block->instructions.size(); idx++) { @@ -766,13 +835,18 @@ void schedule_block(sched_ctx& ctx, Program *program, Block* block, live& live_v if (current->definitions.empty()) continue; - if (current->isVMEM()) + if (current->isVMEM() || current->isFlatOrGlobal()) { + ctx.mv.current = current; schedule_VMEM(ctx, block, live_vars.register_demand[block->index], current, idx); - if (current->format == Format::SMEM) + } + + if (current->format == Format::SMEM) { + ctx.mv.current = current; schedule_SMEM(ctx, block, live_vars.register_demand[block->index], current, idx); + } } - if ((program->stage & hw_vs) && block->index == program->blocks.size() - 1) { + if ((program->stage & (hw_vs | hw_ngg_gs)) && (block->kind & block_kind_export_end)) { /* Try to move position exports as far up as possible, to reduce register * usage and because ISA reference guides say so. */ for (unsigned idx = 0; idx < block->instructions.size(); idx++) { @@ -780,8 +854,10 @@ void schedule_block(sched_ctx& ctx, Program *program, Block* block, live& live_v if (current->format == Format::EXP) { unsigned target = static_cast(current)->dest; - if (target >= V_008DFC_SQ_EXP_POS && target < V_008DFC_SQ_EXP_PARAM) + if (target >= V_008DFC_SQ_EXP_POS && target < V_008DFC_SQ_EXP_PARAM) { + ctx.mv.current = current; schedule_position_export(ctx, block, live_vars.register_demand[block->index], current, idx); + } } } } @@ -797,8 +873,9 @@ void schedule_block(sched_ctx& ctx, Program *program, Block* block, live& live_v void schedule_program(Program *program, live& live_vars) { sched_ctx ctx; - ctx.depends_on.resize(program->peekAllocationId()); - ctx.RAR_dependencies.resize(program->peekAllocationId()); + ctx.mv.depends_on.resize(program->peekAllocationId()); + ctx.mv.RAR_dependencies.resize(program->peekAllocationId()); + ctx.mv.RAR_dependencies_clause.resize(program->peekAllocationId()); /* Allowing the scheduler to reduce the number of waves to as low as 5 * improves performance of Thrones of Britannia significantly and doesn't * seem to hurt anything else. */ @@ -812,9 +889,11 @@ void schedule_program(Program *program, live& live_vars) ctx.num_waves = 7; else ctx.num_waves = 8; + ctx.num_waves = std::max(ctx.num_waves, program->min_waves); assert(ctx.num_waves > 0 && ctx.num_waves <= program->num_waves); - ctx.max_registers = { int16_t(((256 / ctx.num_waves) & ~3) - 2), int16_t(get_addr_sgpr_from_waves(program, ctx.num_waves))}; + ctx.mv.max_registers = { int16_t(get_addr_vgpr_from_waves(program, ctx.num_waves) - 2), + int16_t(get_addr_sgpr_from_waves(program, ctx.num_waves))}; for (Block& block : program->blocks) schedule_block(ctx, program, &block, live_vars);