From e0dff5fd86179b4d265060d5fc6138bb6a50b54d Mon Sep 17 00:00:00 2001 From: =?utf8?q?Timur=20Krist=C3=B3f?= Date: Thu, 12 Mar 2020 17:20:16 +0100 Subject: [PATCH] aco: Create null exports in instruction selection instead of assembler. MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit This allows the passes after isel to assume that the exports are always correct, and also allows to schedule these null exports later. Additionally, it ensures that the correct exec mask is used for these exports. Totals from affected shaders (GFX10): SGPRS: 84224 -> 84344 (0.14 %) VGPRS: 23088 -> 23076 (-0.05 %) Code Size: 882892 -> 894368 (1.30 %) bytes Signed-off-by: Timur Kristóf Reviewed-by: Rhys Perry Part-of: --- src/amd/compiler/aco_assembler.cpp | 25 +++------ .../compiler/aco_instruction_selection.cpp | 55 ++++++++++++++----- src/amd/compiler/aco_lower_to_hw_instr.cpp | 28 ++++++++-- 3 files changed, 72 insertions(+), 36 deletions(-) diff --git a/src/amd/compiler/aco_assembler.cpp b/src/amd/compiler/aco_assembler.cpp index 7683c8f45af..9544fc35bd4 100644 --- a/src/amd/compiler/aco_assembler.cpp +++ b/src/amd/compiler/aco_assembler.cpp @@ -595,11 +595,11 @@ void emit_block(asm_context& ctx, std::vector& out, Block& block) void fix_exports(asm_context& ctx, std::vector& out, Program* program) { + bool exported = false; for (Block& block : program->blocks) { if (!(block.kind & block_kind_export_end)) continue; std::vector>::reverse_iterator it = block.instructions.rbegin(); - bool exported = false; while ( it != block.instructions.rend()) { if ((*it)->format == Format::EXP) { @@ -620,22 +620,13 @@ void fix_exports(asm_context& ctx, std::vector& out, Program* program) break; ++it; } - if (exported) - continue; - /* we didn't find an Export instruction and have to insert a null export */ - aco_ptr exp{create_instruction(aco_opcode::exp, Format::EXP, 4, 0)}; - for (unsigned i = 0; i < 4; i++) - exp->operands[i] = Operand(v1); - exp->enabled_mask = 0; - exp->compressed = false; - exp->done = true; - exp->valid_mask = (program->stage & hw_fs) || program->chip_class >= GFX10; - if (program->stage & hw_fs) - exp->dest = 9; /* NULL */ - else - exp->dest = V_008DFC_SQ_EXP_POS; - /* insert the null export 1 instruction before branch/endpgm */ - block.instructions.insert(block.instructions.end() - 1, std::move(exp)); + } + + if (!exported) { + /* Abort in order to avoid a GPU hang. */ + fprintf(stderr, "Missing export in %s shader:\n", (program->stage & hw_vs) ? "vertex" : "fragment"); + aco_print_program(program, stderr); + abort(); } } diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index a8b318c63c5..b9d203a6391 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -8981,7 +8981,20 @@ static bool visit_cf_list(isel_context *ctx, return false; } -static void export_vs_varying(isel_context *ctx, int slot, bool is_pos, int *next_pos) +static void create_null_export(isel_context *ctx) +{ + /* Some shader stages always need to have exports. + * So when there is none, we need to add a null export. + */ + + unsigned dest = (ctx->program->stage & hw_fs) ? 9 /* NULL */ : V_008DFC_SQ_EXP_POS; + bool vm = (ctx->program->stage & hw_fs) || ctx->program->chip_class >= GFX10; + Builder bld(ctx->program, ctx->block); + bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), + /* enabled_mask */ 0, dest, /* compr */ false, /* done */ true, vm); +} + +static bool export_vs_varying(isel_context *ctx, int slot, bool is_pos, int *next_pos) { assert(ctx->stage == vertex_vs || ctx->stage == tess_eval_vs || @@ -8992,9 +9005,9 @@ static void export_vs_varying(isel_context *ctx, int slot, bool is_pos, int *nex : ctx->program->info->vs.outinfo.vs_output_param_offset[slot]; uint64_t mask = ctx->outputs.mask[slot]; if (!is_pos && !mask) - return; + return false; if (!is_pos && offset == AC_EXP_PARAM_UNDEFINED) - return; + return false; aco_ptr exp{create_instruction(aco_opcode::exp, Format::EXP, 4, 0)}; exp->enabled_mask = mask; for (unsigned i = 0; i < 4; ++i) { @@ -9014,6 +9027,8 @@ static void export_vs_varying(isel_context *ctx, int slot, bool is_pos, int *nex else exp->dest = V_008DFC_SQ_EXP_PARAM + offset; ctx->block->instructions.emplace_back(std::move(exp)); + + return true; } static void export_vs_psiz_layer_viewport(isel_context *ctx, int *next_pos) @@ -9075,14 +9090,15 @@ static void create_vs_exports(isel_context *ctx) /* the order these position exports are created is important */ int next_pos = 0; - export_vs_varying(ctx, VARYING_SLOT_POS, true, &next_pos); + bool exported_pos = export_vs_varying(ctx, VARYING_SLOT_POS, true, &next_pos); if (outinfo->writes_pointsize || outinfo->writes_layer || outinfo->writes_viewport_index) { export_vs_psiz_layer_viewport(ctx, &next_pos); + exported_pos = true; } if (ctx->num_clip_distances + ctx->num_cull_distances > 0) - export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, true, &next_pos); + exported_pos |= export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, true, &next_pos); if (ctx->num_clip_distances + ctx->num_cull_distances > 4) - export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, true, &next_pos); + exported_pos |= export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, true, &next_pos); if (ctx->export_clip_dists) { if (ctx->num_clip_distances + ctx->num_cull_distances > 0) @@ -9098,9 +9114,12 @@ static void create_vs_exports(isel_context *ctx) export_vs_varying(ctx, i, false, NULL); } + + if (!exported_pos) + create_null_export(ctx); } -static void export_fs_mrt_z(isel_context *ctx) +static bool export_fs_mrt_z(isel_context *ctx) { Builder bld(ctx->program, ctx->block); unsigned enabled_channels = 0; @@ -9157,9 +9176,11 @@ static void export_fs_mrt_z(isel_context *ctx) bld.exp(aco_opcode::exp, values[0], values[1], values[2], values[3], enabled_channels, V_008DFC_SQ_EXP_MRTZ, compr); + + return true; } -static void export_fs_mrt_color(isel_context *ctx, int slot) +static bool export_fs_mrt_color(isel_context *ctx, int slot) { Builder bld(ctx->program, ctx->block); unsigned write_mask = ctx->outputs.mask[slot]; @@ -9276,7 +9297,7 @@ static void export_fs_mrt_color(isel_context *ctx, int slot) } if (target == V_008DFC_SQ_EXP_NULL) - return; + return false; if ((bool) compr_op) { for (int i = 0; i < 2; i++) { @@ -9300,22 +9321,26 @@ static void export_fs_mrt_color(isel_context *ctx, int slot) bld.exp(aco_opcode::exp, values[0], values[1], values[2], values[3], enabled_channels, target, (bool) compr_op); + return true; } static void create_fs_exports(isel_context *ctx) { + bool exported = false; + /* Export depth, stencil and sample mask. */ if (ctx->outputs.mask[FRAG_RESULT_DEPTH] || ctx->outputs.mask[FRAG_RESULT_STENCIL] || - ctx->outputs.mask[FRAG_RESULT_SAMPLE_MASK]) { - export_fs_mrt_z(ctx); - } + ctx->outputs.mask[FRAG_RESULT_SAMPLE_MASK]) + exported |= export_fs_mrt_z(ctx); /* Export all color render targets. */ - for (unsigned i = FRAG_RESULT_DATA0; i < FRAG_RESULT_DATA7 + 1; ++i) { + for (unsigned i = FRAG_RESULT_DATA0; i < FRAG_RESULT_DATA7 + 1; ++i) if (ctx->outputs.mask[i]) - export_fs_mrt_color(ctx, i); - } + exported |= export_fs_mrt_color(ctx, i); + + if (!exported) + create_null_export(ctx); } static void write_tcs_tess_factors(isel_context *ctx) diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp index f42484c502f..6f2f54f6992 100644 --- a/src/amd/compiler/aco_lower_to_hw_instr.cpp +++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -969,10 +969,30 @@ void lower_to_hw_instr(Program* program) } case aco_opcode::p_exit_early_if: { - /* don't bother with an early exit at the end of the program */ - if (block->instructions[j + 1]->opcode == aco_opcode::p_logical_end && - block->instructions[j + 2]->opcode == aco_opcode::s_endpgm) { - break; + /* don't bother with an early exit near the end of the program */ + if ((block->instructions.size() - 1 - j) <= 4 && + block->instructions.back()->opcode == aco_opcode::s_endpgm) { + unsigned null_exp_dest = (ctx.program->stage & hw_fs) ? 9 /* NULL */ : V_008DFC_SQ_EXP_POS; + bool ignore_early_exit = true; + + for (unsigned k = j + 1; k < block->instructions.size(); ++k) { + const aco_ptr &instr = block->instructions[k]; + if (instr->opcode == aco_opcode::s_endpgm || + instr->opcode == aco_opcode::p_logical_end) + continue; + else if (instr->opcode == aco_opcode::exp && + static_cast(instr.get())->dest == null_exp_dest) + continue; + else if (instr->opcode == aco_opcode::p_parallelcopy && + instr->definitions[0].isFixed() && + instr->definitions[0].physReg() == exec) + continue; + + ignore_early_exit = false; + } + + if (ignore_early_exit) + break; } if (!discard_block) { -- 2.30.2