From: Timur Kristóf <timur.kristof@gmail.com>
Date: Thu, 26 Mar 2020 16:45:55 +0000 (+0100)
Subject: aco: When LS and HS invocations are the same, pass LS outputs in temps.
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=798dd98d6e530afc5dab2f973785fbbd4e598dee;p=mesa.git

aco: When LS and HS invocations are the same, pass LS outputs in temps.

We know that in this case, the LS and HS invocations are working
on the exact same vertex, so it's safe to skip the LDS.

Totals:
VGPRS: 3960744 -> 3961844 (0.03 %)
Code Size: 254824300 -> 254764624 (-0.02 %) bytes
Max Waves: 1053748 -> 1053574 (-0.02 %)

Totals from affected shaders:
VGPRS: 26152 -> 27252 (4.21 %)
Code Size: 1496600 -> 1436924 (-3.99 %) bytes
Max Waves: 4860 -> 4686 (-3.58 %)

Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4165>
---

diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index b8816f51cde..716853d23ce 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -3329,6 +3329,34 @@ bool store_output_to_temps(isel_context *ctx, nir_intrinsic_instr *instr)
    return true;
 }
 
+bool load_input_from_temps(isel_context *ctx, nir_intrinsic_instr *instr, Temp dst)
+{
+   /* Tries to serve a TCS per-vertex input load from ctx->inputs.temps. Returns true after copying the value into
+    * dst, or false without emitting anything so that the caller can fall back to another path. Per-vertex inputs
+    * only match between the VS/TCS invocation id when the number of invocations is the same (tcs_in_out_eq). */
+   if (ctx->shader->info.stage != MESA_SHADER_TESS_CTRL || !ctx->tcs_in_out_eq)
+      return false;
+   /* Usable only with a constant offset and a vertex index produced directly by load_invocation_id. */
+   nir_src *off_src = nir_get_io_offset_src(instr);
+   nir_src *vertex_index_src = nir_get_io_vertex_index_src(instr);
+   nir_instr *vertex_index_instr = vertex_index_src->ssa->parent_instr;
+   bool can_use_temps = nir_src_is_const(*off_src) &&
+                        vertex_index_instr->type == nir_instr_type_intrinsic &&
+                        nir_instr_as_intrinsic(vertex_index_instr)->intrinsic == nir_intrinsic_load_invocation_id;
+
+   if (!can_use_temps)
+      return false;
+   /* First temp to read: base + component + 4 per unit of constant offset (presumably 4 components per slot - confirm). */
+   unsigned idx = nir_intrinsic_base(instr) + nir_intrinsic_component(instr) + 4 * nir_src_as_uint(*off_src);
+   Temp *src = &ctx->inputs.temps[idx];
+   Temp vec = create_vec_from_array(ctx, src, dst.size(), dst.regClass().type(), 4u);
+   assert(vec.size() == dst.size());
+
+   Builder bld(ctx->program, ctx->block);
+   bld.copy(Definition(dst), vec);
+   return true;
+}
+
 void visit_store_ls_or_es_output(isel_context *ctx, nir_intrinsic_instr *instr)
 {
    Builder bld(ctx->program, ctx->block);
@@ -3338,6 +3366,9 @@ void visit_store_ls_or_es_output(isel_context *ctx, nir_intrinsic_instr *instr)
    unsigned write_mask = nir_intrinsic_write_mask(instr);
    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8u;
 
+   if (ctx->tcs_in_out_eq)
+      store_output_to_temps(ctx, instr);
+
    if (ctx->stage == vertex_es || ctx->stage == tess_eval_es) {
       /* GFX6-8: ES stage is not merged into GS, data is passed from ES to GS in VMEM. */
       Temp esgs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_ESGS_VS * 16u));
@@ -3974,6 +4005,10 @@ void visit_load_tcs_per_vertex_input(isel_context *ctx, nir_intrinsic_instr *ins
 
    Builder bld(ctx->program, ctx->block);
    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+
+   if (load_input_from_temps(ctx, instr, dst))
+      return;
+
    std::pair<Temp, unsigned> offs = get_tcs_per_vertex_input_lds_offset(ctx, instr);
    unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8;
    unsigned lds_align = calculate_lds_alignment(ctx, offs.second);