From b1544352c022953febcc2c2c448ba21551e6b215 Mon Sep 17 00:00:00 2001
From: Rhys Perry
Date: Wed, 4 Dec 2019 15:19:56 +0000
Subject: [PATCH] aco: add various compiler statistics
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Adds these statistics:
- hash of code and constant data
- number of instructions
- number of copies from pseudo-instructions
- number of branches
- estimate of cycles spent not waiting in s_waitcnt
- number of vmem/smem "clauses"
- sgpr/vgpr usage before scheduling

Signed-off-by: Rhys Perry
Acked-by: Bas Nieuwenhuizen
Reviewed-by: Daniel Schürmann
Part-of:
---
 src/amd/compiler/aco_interface.cpp         | 44 ++++++++++-
 src/amd/compiler/aco_ir.h                  | 20 +++++
 src/amd/compiler/aco_lower_to_hw_instr.cpp |  5 ++
 src/amd/compiler/aco_statistics.cpp        | 88 ++++++++++++++++++++++
 src/amd/compiler/meson.build               |  3 +-
 5 files changed, 155 insertions(+), 5 deletions(-)
 create mode 100644 src/amd/compiler/aco_statistics.cpp

diff --git a/src/amd/compiler/aco_interface.cpp b/src/amd/compiler/aco_interface.cpp
index 378a138d245..104436d33c6 100644
--- a/src/amd/compiler/aco_interface.cpp
+++ b/src/amd/compiler/aco_interface.cpp
@@ -54,6 +54,18 @@ static void init()
    }
 }
 
+static radv_compiler_statistic_info statistic_infos[] = { /* one entry per aco::statistic value */
+   [aco::statistic_hash] = {"Hash", "CRC32 hash of code and constant data"},
+   [aco::statistic_instructions] = {"Instructions", "Instruction count"},
+   [aco::statistic_copies] = {"Copies", "Copy instructions created for pseudo-instructions"},
+   [aco::statistic_branches] = {"Branches", "Branch instructions"},
+   [aco::statistic_cycles] = {"Busy Cycles", "Estimate of busy cycles"},
+   [aco::statistic_vmem_clauses] = {"VMEM Clause", "Number of VMEM clauses (includes 1-sized clauses)"},
+   [aco::statistic_smem_clauses] = {"SMEM Clause", "Number of SMEM clauses (includes 1-sized clauses)"},
+   [aco::statistic_sgpr_presched] = {"Pre-Sched SGPRs", "SGPR usage before scheduling"},
+   [aco::statistic_vgpr_presched] = {"Pre-Sched VGPRs", "VGPR usage before scheduling"},
+};
+
 void aco_compile_shader(unsigned shader_count,
                         struct nir_shader *const *shaders,
                         struct radv_shader_binary **binary,
@@ -64,6 +76,10 @@ void aco_compile_shader(unsigned shader_count,
    ac_shader_config config = {0};
    std::unique_ptr<aco::Program> program{new aco::Program};
 
+   program->collect_statistics = args->options->record_ir;
+   if (program->collect_statistics)
+      memset(program->statistics, 0, sizeof(program->statistics));
+
    /* Instruction Selection */
    if (args->is_gs_copy_shader)
       aco::select_gs_copy_shader(program.get(), shaders[0], &config, args);
@@ -94,6 +110,9 @@ void aco_compile_shader(unsigned shader_count,
    aco::live live_vars = aco::live_var_analysis(program.get(), args->options);
    aco::spill(program.get(), live_vars, args->options);
 
+   if (program->collect_statistics)
+      aco::collect_presched_stats(program.get());
+
    //std::cerr << "Before Schedule:\n";
    //aco_print_program(program.get(), stderr);
    aco::schedule_program(program.get(), live_vars);
@@ -139,10 +158,16 @@ void aco_compile_shader(unsigned shader_count,
    //std::cerr << "After Insert-Waitcnt:\n";
    //aco_print_program(program.get(), stderr);
 
+   if (program->collect_statistics)
+      aco::collect_preasm_stats(program.get());
+
    /* Assembly */
    std::vector<uint32_t> code;
    unsigned exec_size = aco::emit_program(program.get(), code);
 
+   if (program->collect_statistics)
+      aco::collect_postasm_stats(program.get(), code);
+
    bool get_disasm = args->options->dump_shader || args->options->record_ir;
 
    size_t size = llvm_ir.size();
@@ -156,6 +181,11 @@ void aco_compile_shader(unsigned shader_count,
       size += disasm.size();
    }
 
+   size_t stats_size = 0;
+   if (program->collect_statistics)
+      stats_size = sizeof(radv_compiler_statistics) + aco::num_statistics * sizeof(uint32_t);
+   size += stats_size;
+
    size += code.size() * sizeof(uint32_t) + sizeof(radv_shader_binary_legacy);
    /* We need to calloc to prevent unintialized data because this will be used
     * directly for the disk cache. Uninitialized data can appear because of
@@ -168,9 +198,15 @@ void aco_compile_shader(unsigned shader_count,
    legacy_binary->base.is_gs_copy_shader = args->is_gs_copy_shader;
    legacy_binary->base.total_size = size;
 
-   legacy_binary->stats_size = 0;
+   if (program->collect_statistics) {
+      radv_compiler_statistics *statistics = (radv_compiler_statistics *)legacy_binary->data;
+      statistics->count = aco::num_statistics;
+      statistics->infos = statistic_infos;
+      memcpy(statistics->values, program->statistics, aco::num_statistics * sizeof(uint32_t));
+   }
+   legacy_binary->stats_size = stats_size;
 
-   memcpy(legacy_binary->data, code.data(), code.size() * sizeof(uint32_t));
+   memcpy(legacy_binary->data + legacy_binary->stats_size, code.data(), code.size() * sizeof(uint32_t));
    legacy_binary->exec_size = exec_size;
    legacy_binary->code_size = code.size() * sizeof(uint32_t);
 
@@ -178,10 +214,10 @@ void aco_compile_shader(unsigned shader_count,
    legacy_binary->disasm_size = 0;
    legacy_binary->ir_size = llvm_ir.size();
 
-   llvm_ir.copy((char*) legacy_binary->data + legacy_binary->code_size, llvm_ir.size());
+   llvm_ir.copy((char*) legacy_binary->data + legacy_binary->stats_size + legacy_binary->code_size, llvm_ir.size());
 
    if (get_disasm) {
-      disasm.copy((char*) legacy_binary->data + legacy_binary->code_size + llvm_ir.size(), disasm.size());
+      disasm.copy((char*) legacy_binary->data + legacy_binary->stats_size + legacy_binary->code_size + llvm_ir.size(), disasm.size());
       legacy_binary->disasm_size = disasm.size();
    }
 
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index ace84db1018..c6213e0c04e 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -1219,6 +1219,19 @@ static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
 static constexpr Stage tess_eval_es = sw_tes | hw_es; /* tesselation evaluation before geometry */
 static constexpr Stage geometry_gs = sw_gs | hw_gs;
 
+enum statistic { /* indices into Program::statistics[] */
+   statistic_hash,
+   statistic_instructions,
+   statistic_copies,
+   statistic_branches,
+   statistic_cycles,
+   statistic_vmem_clauses,
+   statistic_smem_clauses,
+   statistic_sgpr_presched,
+   statistic_vgpr_presched,
+   num_statistics /* number of entries above; keep last */
+};
+
 class Program final {
 public:
    float_mode next_fp_mode;
@@ -1257,6 +1270,9 @@ public:
    bool needs_vcc = false;
    bool needs_flat_scr = false;
 
+   bool collect_statistics = false;
+   uint32_t statistics[num_statistics] = {}; /* zeroed: lowering increments these even when collect_statistics is false */
+
    uint32_t allocateId()
    {
       assert(allocationID <= 16777215);
@@ -1337,6 +1353,10 @@ void perfwarn(bool cond, const char *msg, Instruction *instr=NULL);
 #define perfwarn(program, cond, msg, ...) do {} while(0)
 #endif
 
+void collect_presched_stats(Program *program);
+void collect_preasm_stats(Program *program);
+void collect_postasm_stats(Program *program, const std::vector<uint32_t>& code);
+
 void aco_print_instr(Instruction *instr, FILE *output);
 void aco_print_program(Program *program, FILE *output);
 
diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp
index 6f2f54f6992..606f2fde65c 100644
--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp
+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp
@@ -784,6 +784,7 @@ void handle_operands(std::map<PhysReg, copy_operation>& copy_map, lower_context*
 
          copy_map.erase(it);
          it = copy_map.begin();
+         ctx->program->statistics[statistic_copies]++;
          continue;
       } else {
          /* the target reg is used as operand, check the next entry */
@@ -813,6 +814,7 @@ void handle_operands(std::map<PhysReg, copy_operation>& copy_map, lower_context*
       Definition op_as_def = Definition(swap.op.physReg(), swap.op.regClass());
       if (chip_class >= GFX9 && swap.def.getTemp().type() == RegType::vgpr) {
          bld.vop1(aco_opcode::v_swap_b32, swap.def, op_as_def, swap.op, def_as_op);
+         ctx->program->statistics[statistic_copies]++;
       } else if (swap.op.physReg() == scc || swap.def.physReg() == scc) {
          /* we need to swap scc and another sgpr */
          assert(!preserve_scc);
@@ -822,6 +824,7 @@ void handle_operands(std::map<PhysReg, copy_operation>& copy_map, lower_context*
          bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), Operand(scc, s1));
          bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(other, s1), Operand(0u));
          bld.sop1(aco_opcode::s_mov_b32, Definition(other, s1), Operand(pi->scratch_sgpr, s1));
+         ctx->program->statistics[statistic_copies] += 3;
       } else if (swap.def.getTemp().type() == RegType::sgpr) {
          if (preserve_scc) {
             bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), swap.op);
@@ -832,10 +835,12 @@ void handle_operands(std::map<PhysReg, copy_operation>& copy_map, lower_context*
             bld.sop2(aco_opcode::s_xor_b32, swap.def, Definition(scc, s1), swap.op, def_as_op);
             bld.sop2(aco_opcode::s_xor_b32, op_as_def, Definition(scc, s1), swap.op, def_as_op);
          }
+         ctx->program->statistics[statistic_copies] += 3;
       } else {
          bld.vop2(aco_opcode::v_xor_b32, op_as_def, swap.op, def_as_op);
          bld.vop2(aco_opcode::v_xor_b32, swap.def, swap.op, def_as_op);
          bld.vop2(aco_opcode::v_xor_b32, op_as_def, swap.op, def_as_op);
+         ctx->program->statistics[statistic_copies] += 3;
       }
 
       /* change the operand reg of the target's use */
diff --git a/src/amd/compiler/aco_statistics.cpp b/src/amd/compiler/aco_statistics.cpp
new file mode 100644
index 00000000000..2e78ab63cb1
--- /dev/null
+++ b/src/amd/compiler/aco_statistics.cpp
@@ -0,0 +1,88 @@
+/*
+ * Copyright © 2020 Valve Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+#include "aco_ir.h"
+#include "util/crc32.h"
+
+namespace aco {
+
+/* sgpr_presched/vgpr_presched: folds every block's register_demand into one RegisterDemand via update() */
+void collect_presched_stats(Program *program)
+{
+   RegisterDemand presched_demand;
+   for (Block& block : program->blocks)
+      presched_demand.update(block.register_demand);
+   program->statistics[statistic_sgpr_presched] = presched_demand.sgpr;
+   program->statistics[statistic_vgpr_presched] = presched_demand.vgpr;
+}
+
+/* instructions/branches/vmem_clauses/smem_clauses/cycles; clause sets are per block and flushed at block end */
+void collect_preasm_stats(Program *program)
+{
+   for (Block& block : program->blocks) {
+      std::set<Temp> vmem_clause_res;
+      std::set<Temp> smem_clause_res;
+
+      program->statistics[statistic_instructions] += block.instructions.size();
+
+      for (aco_ptr<Instruction>& instr : block.instructions) {
+         if (instr->format == Format::SOPP && static_cast<SOPP_instruction*>(instr.get())->block != -1)
+            program->statistics[statistic_branches]++;
+
+         if (instr->opcode == aco_opcode::p_constaddr)
+            program->statistics[statistic_instructions] += 2;
+
+         if (instr->isVMEM() && !instr->operands.empty()) {
+            vmem_clause_res.insert(instr->operands[0].getTemp());
+         } else {
+            program->statistics[statistic_vmem_clauses] += vmem_clause_res.size();
+            vmem_clause_res.clear();
+         }
+
+         if (instr->format == Format::SMEM && !instr->operands.empty()) {
+            if (instr->operands[0].size() == 2)
+               smem_clause_res.insert(Temp(0, s2));
+            else
+               smem_clause_res.insert(instr->operands[0].getTemp());
+         } else {
+            program->statistics[statistic_smem_clauses] += smem_clause_res.size();
+            smem_clause_res.clear();
+         }
+
+         /* TODO: this incorrectly assumes instructions always take 4 cycles */
+         /* assume loops execute 4 times (TODO: it would be nice to be able to consider loop unrolling) */
+         unsigned iter = 1 << (block.loop_nest_depth * 2);
+         program->statistics[statistic_cycles] += 4 * iter;
+      }
+
+      program->statistics[statistic_vmem_clauses] += vmem_clause_res.size();
+      program->statistics[statistic_smem_clauses] += smem_clause_res.size();
+   }
+}
+
+/* hash: util_hash_crc32() over the whole contents of code */
+void collect_postasm_stats(Program *program, const std::vector<uint32_t>& code)
+{
+   program->statistics[statistic_hash] = util_hash_crc32(code.data(), code.size() * sizeof(uint32_t));
+}
+} /* namespace aco */
diff --git a/src/amd/compiler/meson.build b/src/amd/compiler/meson.build
index 8a0ebb0befc..44b56baab4c 100644
--- a/src/amd/compiler/meson.build
+++ b/src/amd/compiler/meson.build
@@ -76,8 +76,9 @@ libaco_files = files(
   'aco_print_asm.cpp',
   'aco_print_ir.cpp',
   'aco_scheduler.cpp',
-  'aco_ssa_elimination.cpp',
   'aco_spill.cpp',
+  'aco_ssa_elimination.cpp',
+  'aco_statistics.cpp',
   'aco_util.h',
   'aco_validate.cpp',
 )
-- 
2.30.2