From 389ee819c04f3375358d0253bdb1f6094f2423c6 Mon Sep 17 00:00:00 2001
From: Rhys Perry
Date: Wed, 27 Nov 2019 17:27:36 +0000
Subject: [PATCH] aco: improve FLAT/GLOBAL scheduling
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Signed-off-by: Rhys Perry
Reviewed-by: Daniel Schürmann
---
 .../compiler/aco_instruction_selection.cpp |  2 ++
 src/amd/compiler/aco_ir.h                  |  7 ++--
 src/amd/compiler/aco_opcodes.py            |  1 +
 src/amd/compiler/aco_print_ir.cpp          |  1 +
 src/amd/compiler/aco_scheduler.cpp         | 33 ++++++++++++-------
 5 files changed, 30 insertions(+), 14 deletions(-)

diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index 60963060dea..2bced09cf97 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -4644,6 +4644,7 @@ void visit_load_global(isel_context *ctx, nir_intrinsic_instr *instr)
       flat->operands[1] = Operand(s1);
       flat->glc = glc;
       flat->dlc = dlc;
+      flat->barrier = barrier_buffer;
 
       if (dst.type() == RegType::sgpr) {
          Temp vec = bld.tmp(RegType::vgpr, dst.size());
@@ -4765,6 +4766,7 @@ void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr)
       flat->dlc = false;
       flat->offset = offset;
       flat->disable_wqm = true;
+      flat->barrier = barrier_buffer;
       ctx->program->needs_exact = true;
       ctx->block->instructions.emplace_back(std::move(flat));
    }
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index 60f06393aa5..4073086662a 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -850,7 +850,9 @@ struct FLAT_instruction : public Instruction {
    bool dlc; /* NAVI: device level coherent */
    bool lds;
    bool nv;
-   bool disable_wqm;
+   bool disable_wqm; /* Require an exec mask without helper invocations */
+   bool can_reorder;
+   barrier_interaction barrier;
 };
 
 struct Export_instruction : public Instruction {
@@ -972,7 +974,8 @@ constexpr barrier_interaction get_barrier_interaction(Instruction* instr)
      return static_cast<MIMG_instruction*>(instr)->barrier;
   case Format::FLAT:
   case Format::GLOBAL:
-     return barrier_buffer;
+  case Format::SCRATCH:
+     return static_cast<FLAT_instruction*>(instr)->barrier;
   case Format::DS:
      return barrier_shared;
   default:
diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py
index 5f74998a421..a4b02507eda 100644
--- a/src/amd/compiler/aco_opcodes.py
+++ b/src/amd/compiler/aco_opcodes.py
@@ -133,6 +133,7 @@ class Format(Enum):
                  ('bool', 'bound_ctrl', 'false')]
       elif self in [Format.FLAT, Format.GLOBAL, Format.SCRATCH]:
          return [('uint16_t', 'offset', 0),
+                 ('bool', 'can_reorder', 'true'),
                  ('bool', 'glc', 'false'),
                  ('bool', 'slc', 'false'),
                  ('bool', 'lds', 'false'),
diff --git a/src/amd/compiler/aco_print_ir.cpp b/src/amd/compiler/aco_print_ir.cpp
index 5ced1d2d7bb..780980a8c69 100644
--- a/src/amd/compiler/aco_print_ir.cpp
+++ b/src/amd/compiler/aco_print_ir.cpp
@@ -373,6 +373,7 @@ static void print_instr_format_specific(struct Instruction *instr, FILE *output)
         fprintf(output, " nv");
      if (flat->disable_wqm)
         fprintf(output, " disable_wqm");
+     print_barrier_reorder(flat->can_reorder, flat->barrier, output);
      break;
   }
   case Format::MTBUF: {
diff --git a/src/amd/compiler/aco_scheduler.cpp b/src/amd/compiler/aco_scheduler.cpp
index eb0bb0d93e9..5c164703ebf 100644
--- a/src/amd/compiler/aco_scheduler.cpp
+++ b/src/amd/compiler/aco_scheduler.cpp
@@ -138,6 +138,11 @@ bool can_move_instr(aco_ptr<Instruction>& instr, Instruction* current, int movin
    case Format::MIMG:
       can_reorder = static_cast<MIMG_instruction*>(current)->can_reorder;
       break;
+   case Format::FLAT:
+   case Format::GLOBAL:
+   case Format::SCRATCH:
+      can_reorder = static_cast<FLAT_instruction*>(current)->can_reorder;
+      break;
    default:
       break;
    }
@@ -186,7 +191,7 @@ bool can_reorder(Instruction* candidate)
    case Format::FLAT:
    case Format::GLOBAL:
    case Format::SCRATCH:
-      return false;
+      return static_cast<FLAT_instruction*>(candidate)->can_reorder;
    default:
       return true;
    }
@@ -483,6 +488,7 @@ void schedule_VMEM(sched_ctx& ctx, Block* block,
       assert(candidate_idx >= 0);
       aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
       bool can_reorder_candidate = can_reorder(candidate.get());
+      bool is_vmem = candidate->isVMEM() || candidate->isFlatOrGlobal();
 
       /* break when encountering another VMEM instruction, logical_start or barriers */
       if (!can_reorder_smem && candidate->format == Format::SMEM && !can_reorder_candidate)
@@ -501,8 +507,10 @@
       register_pressure_indep.update(register_demand[candidate_idx]);
 
       bool part_of_clause = false;
-      if (candidate->isVMEM()) {
-         bool same_resource = candidate->operands[1].tempId() == current->operands[1].tempId();
+      if (current->isVMEM() == candidate->isVMEM()) {
+         bool same_resource = true;
+         if (current->isVMEM())
+            same_resource = candidate->operands[1].tempId() == current->operands[1].tempId();
          bool can_reorder = can_reorder_vmem || can_reorder_candidate;
          int grab_dist = clause_insert_idx - candidate_idx;
          /* We can't easily tell how much this will decrease the def-to-use
@@ -511,7 +519,7 @@
       }
 
       /* if current depends on candidate, add additional dependencies and continue */
-      bool can_move_down = !candidate->isVMEM() || part_of_clause;
+      bool can_move_down = !is_vmem || part_of_clause;
       bool writes_exec = false;
       for (const Definition& def : candidate->definitions) {
          if (def.isTemp() && ctx.depends_on[def.tempId()])
@@ -540,7 +548,7 @@
         }
         register_pressure_clause.update(register_demand[candidate_idx]);
         can_reorder_smem &= candidate->format != Format::SMEM || can_reorder_candidate;
-        can_reorder_vmem &= !candidate->isVMEM() || can_reorder_candidate;
+        can_reorder_vmem &= !is_vmem || can_reorder_candidate;
         continue;
      }
 
@@ -575,7 +583,7 @@
         }
         register_pressure_clause.update(register_demand[candidate_idx]);
         can_reorder_smem &= candidate->format != Format::SMEM || can_reorder_candidate;
-        can_reorder_vmem &= !candidate->isVMEM() || can_reorder_candidate;
+        can_reorder_vmem &= !is_vmem || can_reorder_candidate;
         continue;
      }
 
@@ -636,6 +644,7 @@
      assert(candidate_idx < (int) block->instructions.size());
      aco_ptr<Instruction>& candidate = block->instructions[candidate_idx];
      bool can_reorder_candidate = can_reorder(candidate.get());
+     bool is_vmem = candidate->isVMEM() || candidate->isFlatOrGlobal();
 
      if (candidate->opcode == aco_opcode::p_logical_end)
         break;
@@ -651,7 +660,7 @@
      bool is_dependency = false;
      if (candidate->format == Format::SMEM)
         is_dependency = !can_reorder_smem && !can_reorder_candidate;
-     if (candidate->isVMEM())
+     if (is_vmem)
         is_dependency = !can_reorder_vmem && !can_reorder_candidate;
      for (const Operand& op : candidate->operands) {
         if (op.isTemp() && ctx.depends_on[op.tempId()]) {
@@ -676,7 +685,7 @@
         }
         /* update flag whether we can reorder other memory instructions */
         can_reorder_smem &= candidate->format != Format::SMEM || can_reorder_candidate;
-        can_reorder_vmem &= !candidate->isVMEM() || can_reorder_candidate;
+        can_reorder_vmem &= !is_vmem || can_reorder_candidate;
 
         if (!found_dependency) {
            insert_idx = candidate_idx;
@@ -686,7 +695,7 @@
            continue;
         }
 
-      } else if (candidate->isVMEM()) {
+      } else if (is_vmem) {
         /* don't move up dependencies of other VMEM instructions */
         for (const Definition& def : candidate->definitions) {
            if (def.isTemp())
@@ -717,7 +726,7 @@
              ctx.RAR_dependencies[op.tempId()] = true;
           }
        }
        can_reorder_smem &= candidate->format != Format::SMEM || can_reorder_candidate;
-       can_reorder_vmem &= !candidate->isVMEM() || can_reorder_candidate;
+       can_reorder_vmem &= !is_vmem || can_reorder_candidate;
        continue;
     }
 
@@ -783,7 +792,7 @@ void schedule_position_export(sched_ctx& ctx, Block* block,
         break;
      if (candidate->opcode == aco_opcode::p_exit_early_if)
         break;
-     if (candidate->isVMEM() || candidate->format == Format::SMEM)
+     if (candidate->isVMEM() || candidate->format == Format::SMEM || candidate->isFlatOrGlobal())
         break;
      if (!can_move_instr(candidate, current, moving_interaction))
         break;
@@ -876,7 +885,7 @@ void schedule_block(sched_ctx& ctx, Program *program, Block* block, live& live_v
      if (current->definitions.empty())
         continue;
 
-     if (current->isVMEM())
+     if (current->isVMEM() || current->isFlatOrGlobal())
        schedule_VMEM(ctx, block, live_vars.register_demand[block->index], current, idx);
      if (current->format == Format::SMEM)
        schedule_SMEM(ctx, block, live_vars.register_demand[block->index], current, idx);
-- 
2.30.2
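
Note (not part of the commit): below is a minimal, self-contained C++ sketch of
the rule this patch introduces, namely that FLAT/GLOBAL/SCRATCH accesses now
carry a per-instruction can_reorder flag (defaulting to true in aco_opcodes.py)
instead of always blocking the scheduler. The names MemCandidate, Fmt and
is_reorderable_vmem are invented for illustration and are not ACO's real types;
in the actual patch the flag lives on FLAT_instruction and is consulted by
can_reorder() and can_move_instr() in aco_scheduler.cpp.

#include <cstdio>

// Simplified stand-in for ACO's Format enum (illustrative only).
enum class Fmt { SOP2, SMEM, MUBUF, MIMG, FLAT, GLOBAL, SCRATCH, DS };

// Hypothetical candidate record: FLAT/GLOBAL/SCRATCH carry their own
// can_reorder flag, mirroring the new default added in aco_opcodes.py.
struct MemCandidate {
   Fmt format;
   bool can_reorder = true;
};

// Before this patch the FLAT/GLOBAL/SCRATCH cases always returned false;
// afterwards the per-instruction flag decides.
static bool is_reorderable_vmem(const MemCandidate &c)
{
   switch (c.format) {
   case Fmt::FLAT:
   case Fmt::GLOBAL:
   case Fmt::SCRATCH:
      return c.can_reorder;
   case Fmt::DS:
      return false; // stand-in choice for LDS, not taken from the patch
   default:
      return true;
   }
}

int main()
{
   MemCandidate global_load{Fmt::GLOBAL};      // reorderable by default
   MemCandidate fenced_flat{Fmt::FLAT, false}; // pinned by its flag
   std::printf("global: %d, fenced flat: %d\n",
               is_reorderable_vmem(global_load),
               is_reorderable_vmem(fenced_flat));
   return 0;
}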