aco: Store tess factors in VMEM only at the end of the shader.
author Timur Kristóf <timur.kristof@gmail.com>
Fri, 6 Mar 2020 19:24:55 +0000 (21:24 +0200)
committer Marge Bot <eric+marge@anholt.net>
Wed, 11 Mar 2020 08:34:11 +0000 (08:34 +0000)
This optimizes out several superfluous stores of the tess factors,
especially if the shader wrote those outputs multiple times.

Pipeline DB changes on GFX10:
Totals from affected shaders:
SGPRS: 30384 -> 29536 (-2.79 %)
Code Size: 2260720 -> 2214484 (-2.05 %) bytes

Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3964>

src/amd/compiler/aco_instruction_selection.cpp

index 62d14e6f824823c345dd4fcb6d8229af8fd86a4b..c363edb602dc9ddb489d77003f4d4cc30afee2ed 100644 (file)
@@ -3316,6 +3316,24 @@ void visit_store_ls_or_es_output(isel_context *ctx, nir_intrinsic_instr *instr)
    }
 }
 
+bool should_write_tcs_patch_output_to_vmem(isel_context *ctx, nir_intrinsic_instr *instr)
+{  /* Returns true if this TCS patch output store must be written to VMEM, false if it targets a tess level slot. */
+   unsigned off = nir_intrinsic_base(instr) * 4u;
+   nir_src *off_src = nir_get_io_offset_src(instr);
+
+   /* Indirect offset, we can't be sure if this is a tess factor, so always write to VMEM. */
+   if (!nir_src_is_const(*off_src))
+      return true;
+   /* Constant offset: fold it into `off`, scaled by 16 like the slot indices compared against below. */
+   off += nir_src_as_uint(*off_src) * 16u;
+   /* Unique I/O indices of the inner and outer tess level slots (presumably index * 16 is the slot's byte offset). */
+   const unsigned tess_index_inner = shader_io_get_unique_index(VARYING_SLOT_TESS_LEVEL_INNER);
+   const unsigned tess_index_outer = shader_io_get_unique_index(VARYING_SLOT_TESS_LEVEL_OUTER);
+   /* Only when `off` equals one of those two slot offsets does this return false. */
+   return (off != (tess_index_inner * 16u)) &&
+          (off != (tess_index_outer * 16u));
+}
+
 void visit_store_tcs_output(isel_context *ctx, nir_intrinsic_instr *instr, bool per_vertex)
 {
    assert(ctx->stage == tess_control_hs || ctx->stage == vertex_tess_control_hs);
@@ -3327,8 +3345,8 @@ void visit_store_tcs_output(isel_context *ctx, nir_intrinsic_instr *instr, bool
    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
    unsigned write_mask = nir_intrinsic_write_mask(instr);
 
-   /* TODO: Only write to VMEM if the output is per-vertex or it's per-patch non tess factor */
-   bool write_to_vmem = true;
+   /* Only write to VMEM if the output is per-vertex or it's per-patch non tess factor */
+   bool write_to_vmem = per_vertex || should_write_tcs_patch_output_to_vmem(ctx, instr);
    /* TODO: Only write to LDS if the output is read by the shader, or it's per-patch tess factor */
    bool write_to_lds = true;
 
@@ -9273,6 +9291,22 @@ static void write_tcs_tess_factors(isel_context *ctx)
    Temp tf_vec = create_vec_from_array(ctx, out, stride, RegType::vgpr);
    store_vmem_mubuf(ctx, tf_vec, hs_ring_tess_factor, byte_offset, tf_base, tf_const_offset, 4, (1 << stride) - 1, true, false);
 
+   /* Store to offchip for TES to read - only if TES reads them */
+   if (ctx->args->options->key.tcs.tes_reads_tess_factors) {
+      Temp hs_ring_tess_offchip = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_HS_TESS_OFFCHIP * 16u));
+      Temp oc_lds = get_arg(ctx, ctx->args->oc_lds);
+
+      std::pair<Temp, unsigned> vmem_offs_outer = get_tcs_per_patch_output_vmem_offset(ctx, nullptr, tess_index_outer * 16);
+      Temp outer_vec = create_vec_from_array(ctx, outer, outer_comps, RegType::vgpr);
+      store_vmem_mubuf(ctx, outer_vec, hs_ring_tess_offchip, vmem_offs_outer.first, oc_lds, vmem_offs_outer.second, 4, (1 << outer_comps) - 1, true, false);
+
+      if (likely(inner_comps)) {
+         std::pair<Temp, unsigned> vmem_offs_inner = get_tcs_per_patch_output_vmem_offset(ctx, nullptr, tess_index_inner * 16);
+         Temp inner_vec = create_vec_from_array(ctx, inner, inner_comps, RegType::vgpr);
+         store_vmem_mubuf(ctx, inner_vec, hs_ring_tess_offchip, vmem_offs_inner.first, oc_lds, vmem_offs_inner.second, 4, (1 << inner_comps) - 1, true, false);
+      }
+   }
+
    begin_divergent_if_else(ctx, &ic_invocation_id_is_zero);
    end_divergent_if(ctx, &ic_invocation_id_is_zero);
 }