From 798dd98d6e530afc5dab2f973785fbbd4e598dee Mon Sep 17 00:00:00 2001 From: =?utf8?q?Timur=20Krist=C3=B3f?= Date: Thu, 26 Mar 2020 17:45:55 +0100 Subject: [PATCH] aco: When LS and HS invocations are the same, pass LS outputs in temps. MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit We know that in this case, the LS and HS invocations are working on the exact same vertex, so it's safe to skip the LDS. Totals: VGPRS: 3960744 -> 3961844 (0.03 %) Code Size: 254824300 -> 254764624 (-0.02 %) bytes Max Waves: 1053748 -> 1053574 (-0.02 %) Totals from affected shaders: VGPRS: 26152 -> 27252 (4.21 %) Code Size: 1496600 -> 1436924 (-3.99 %) bytes Max Waves: 4860 -> 4686 (-3.58 %) Signed-off-by: Timur Kristóf Reviewed-by: Rhys Perry Part-of: --- .../compiler/aco_instruction_selection.cpp | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index b8816f51cde..716853d23ce 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -3329,6 +3329,34 @@ bool store_output_to_temps(isel_context *ctx, nir_intrinsic_instr *instr) return true; } +bool load_input_from_temps(isel_context *ctx, nir_intrinsic_instr *instr, Temp dst) +{ + /* Only TCS per-vertex inputs are supported by this function. + * Per-vertex inputs only match between the VS/TCS invocation id when the number of invocations is the same. 
+ */ + if (ctx->shader->info.stage != MESA_SHADER_TESS_CTRL || !ctx->tcs_in_out_eq) + return false; + + nir_src *off_src = nir_get_io_offset_src(instr); + nir_src *vertex_index_src = nir_get_io_vertex_index_src(instr); + nir_instr *vertex_index_instr = vertex_index_src->ssa->parent_instr; + bool can_use_temps = nir_src_is_const(*off_src) && + vertex_index_instr->type == nir_instr_type_intrinsic && + nir_instr_as_intrinsic(vertex_index_instr)->intrinsic == nir_intrinsic_load_invocation_id; + + if (!can_use_temps) + return false; + + unsigned idx = nir_intrinsic_base(instr) + nir_intrinsic_component(instr) + 4 * nir_src_as_uint(*off_src); + Temp *src = &ctx->inputs.temps[idx]; + Temp vec = create_vec_from_array(ctx, src, dst.size(), dst.regClass().type(), 4u); + assert(vec.size() == dst.size()); + + Builder bld(ctx->program, ctx->block); + bld.copy(Definition(dst), vec); + return true; +} + void visit_store_ls_or_es_output(isel_context *ctx, nir_intrinsic_instr *instr) { Builder bld(ctx->program, ctx->block); @@ -3338,6 +3366,9 @@ void visit_store_ls_or_es_output(isel_context *ctx, nir_intrinsic_instr *instr) unsigned write_mask = nir_intrinsic_write_mask(instr); unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8u; + if (ctx->tcs_in_out_eq) + store_output_to_temps(ctx, instr); + if (ctx->stage == vertex_es || ctx->stage == tess_eval_es) { /* GFX6-8: ES stage is not merged into GS, data is passed from ES to GS in VMEM. 
*/ Temp esgs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_ESGS_VS * 16u)); @@ -3974,6 +4005,10 @@ void visit_load_tcs_per_vertex_input(isel_context *ctx, nir_intrinsic_instr *ins Builder bld(ctx->program, ctx->block); Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); + + if (load_input_from_temps(ctx, instr, dst)) + return; + std::pair<Temp, unsigned> offs = get_tcs_per_vertex_input_lds_offset(ctx, instr); unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8; unsigned lds_align = calculate_lds_alignment(ctx, offs.second); -- 2.30.2