From 4dcca269455adb1029334cefb035fd19d9d99d50 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Timur=20Krist=C3=B3f?= Date: Fri, 6 Mar 2020 21:24:55 +0200 Subject: [PATCH] aco: Store tess factors in VMEM only at the end of the shader. MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit This optimizes out several superfluous stores of the tess factors, especially if the shader wrote those outputs multiple times. Pipeline DB changes on GFX10: Totals from affected shaders: SGPRS: 30384 -> 29536 (-2.79 %) Code Size: 2260720 -> 2214484 (-2.05 %) bytes Signed-off-by: Timur Kristóf Reviewed-by: Rhys Perry Part-of: --- .../compiler/aco_instruction_selection.cpp | 38 ++++++++++++++++++- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 62d14e6f824..c363edb602d 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -3316,6 +3316,24 @@ void visit_store_ls_or_es_output(isel_context *ctx, nir_intrinsic_instr *instr) } } +bool should_write_tcs_patch_output_to_vmem(isel_context *ctx, nir_intrinsic_instr *instr) +{ + unsigned off = nir_intrinsic_base(instr) * 4u; + nir_src *off_src = nir_get_io_offset_src(instr); + + /* Indirect offset, we can't be sure if this is a tess factor, always write to VMEM */ + if (!nir_src_is_const(*off_src)) + return true; + + off += nir_src_as_uint(*off_src) * 16u; + + const unsigned tess_index_inner = shader_io_get_unique_index(VARYING_SLOT_TESS_LEVEL_INNER); + const unsigned tess_index_outer = shader_io_get_unique_index(VARYING_SLOT_TESS_LEVEL_OUTER); + + return (off != (tess_index_inner * 16u)) && + (off != (tess_index_outer * 16u)); +} + void visit_store_tcs_output(isel_context *ctx, nir_intrinsic_instr *instr, bool per_vertex) { assert(ctx->stage == tess_control_hs || ctx->stage == vertex_tess_control_hs); @@ -3327,8 +3345,8 @@ void 
visit_store_tcs_output(isel_context *ctx, nir_intrinsic_instr *instr, bool unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8; unsigned write_mask = nir_intrinsic_write_mask(instr); - /* TODO: Only write to VMEM if the output is per-vertex or it's per-patch non tess factor */ - bool write_to_vmem = true; + /* Only write to VMEM if the output is per-vertex or it's per-patch non tess factor */ + bool write_to_vmem = per_vertex || should_write_tcs_patch_output_to_vmem(ctx, instr); /* TODO: Only write to LDS if the output is read by the shader, or it's per-patch tess factor */ bool write_to_lds = true; @@ -9273,6 +9291,22 @@ static void write_tcs_tess_factors(isel_context *ctx) Temp tf_vec = create_vec_from_array(ctx, out, stride, RegType::vgpr); store_vmem_mubuf(ctx, tf_vec, hs_ring_tess_factor, byte_offset, tf_base, tf_const_offset, 4, (1 << stride) - 1, true, false); + /* Store to offchip for TES to read - only if TES reads them */ + if (ctx->args->options->key.tcs.tes_reads_tess_factors) { + Temp hs_ring_tess_offchip = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_HS_TESS_OFFCHIP * 16u)); + Temp oc_lds = get_arg(ctx, ctx->args->oc_lds); + + std::pair<Temp, unsigned> vmem_offs_outer = get_tcs_per_patch_output_vmem_offset(ctx, nullptr, tess_index_outer * 16); + Temp outer_vec = create_vec_from_array(ctx, outer, outer_comps, RegType::vgpr); + store_vmem_mubuf(ctx, outer_vec, hs_ring_tess_offchip, vmem_offs_outer.first, oc_lds, vmem_offs_outer.second, 4, (1 << outer_comps) - 1, true, false); + + if (likely(inner_comps)) { + std::pair<Temp, unsigned> vmem_offs_inner = get_tcs_per_patch_output_vmem_offset(ctx, nullptr, tess_index_inner * 16); + Temp inner_vec = create_vec_from_array(ctx, inner, inner_comps, RegType::vgpr); + store_vmem_mubuf(ctx, inner_vec, hs_ring_tess_offchip, vmem_offs_inner.first, oc_lds, vmem_offs_inner.second, 4, (1 << inner_comps) - 1, true, false); + } + } + begin_divergent_if_else(ctx, &ic_invocation_id_is_zero); 
end_divergent_if(ctx, &ic_invocation_id_is_zero); } -- 2.30.2