From ef6c84b301ce15022d4907dfb0db5764e31e68f5 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Mon, 14 Nov 2016 09:09:51 +0100 Subject: [PATCH] radeonsi: eliminate VS outputs that aren't used by PS at runtime MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit A past commit added the ability to compile "optimized" shader variants asynchronously (not stalling the app). This commit builds upon that and adds what is basically a runtime shader linker. If a VS output isn't used by the currently-bound PS, a new VS compilation is started without that output. The new shader variant is used when it's ready. All apps using separate shader objects I've seen had unused VS outputs. Eliminating unused/useless VS outputs also eliminates the corresponding vertex attribute loads. Tested-by: Edmondo Tommasina Reviewed-by: Nicolai Hähnle --- src/gallium/drivers/radeonsi/si_shader.c | 26 ++++++++++++- src/gallium/drivers/radeonsi/si_shader.h | 7 ++-- .../drivers/radeonsi/si_state_shaders.c | 37 +++++++++++++++++-- 3 files changed, 61 insertions(+), 9 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 2b432244331..abe30e54a58 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -2281,6 +2281,26 @@ static void si_llvm_export_vs(struct lp_build_tgsi_context *bld_base, for (i = 0; i < noutput; i++) { semantic_name = outputs[i].name; semantic_index = outputs[i].sid; + bool export_param = true; + + switch (semantic_name) { + case TGSI_SEMANTIC_POSITION: /* ignore these */ + case TGSI_SEMANTIC_PSIZE: + case TGSI_SEMANTIC_CLIPVERTEX: + case TGSI_SEMANTIC_EDGEFLAG: + break; + case TGSI_SEMANTIC_GENERIC: + case TGSI_SEMANTIC_CLIPDIST: + if (shader->key.opt.hw_vs.kill_outputs & + (1ull << si_shader_io_get_unique_index(semantic_name, semantic_index))) + export_param = false; + break; + default: + if (shader->key.opt.hw_vs.kill_outputs2 & + (1u << si_shader_io_get_unique_index2(semantic_name, semantic_index))) + export_param = false; + break; + } handle_semantic: /* Select the correct target */ @@ -2304,6 +2324,8 @@ handle_semantic: break; case TGSI_SEMANTIC_COLOR: case TGSI_SEMANTIC_BCOLOR: + if (!export_param) + continue; target = V_008DFC_SQ_EXP_PARAM + param_count; assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset)); shader->info.vs_output_param_offset[i] = param_count; @@ -2325,6 +2347,8 @@ handle_semantic: case TGSI_SEMANTIC_FOG: case TGSI_SEMANTIC_TEXCOORD: case TGSI_SEMANTIC_GENERIC: + if (!export_param) + continue; target = V_008DFC_SQ_EXP_PARAM + param_count; assert(i < ARRAY_SIZE(shader->info.vs_output_param_offset)); shader->info.vs_output_param_offset[i] = param_count; @@ -7083,7 +7107,7 @@ int si_compile_tgsi_shader(struct si_screen *sscreen, si_init_shader_ctx(&ctx, sscreen, shader, tm); ctx.separate_prolog = !is_monolithic; - memset(shader->info.vs_output_param_offset, 0xff, + memset(shader->info.vs_output_param_offset, EXP_PARAM_UNDEFINED, sizeof(shader->info.vs_output_param_offset)); shader->info.uses_instanceid = sel->info.uses_instanceid; diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index fc9c9131be8..aa37676f887 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -322,10 +322,6 @@ struct si_vs_prolog_bits { /* Common VS bits between the shader key and the epilog key. */ struct si_vs_epilog_bits { unsigned export_prim_id:1; /* when PS needs it and GS is disabled */ - /* TODO: - * - skip layer, viewport, clipdist, and culldist parameter exports - * if PS doesn't read them - */ }; /* Common TCS bits between the shader key and the epilog key. */ @@ -440,6 +436,8 @@ struct si_shader_key { /* Optimization flags for asynchronous compilation only. */ union { struct { + uint64_t kill_outputs; /* "get_unique_index" bits */ + uint32_t kill_outputs2; /* "get_unique_index2" bits */ unsigned clip_disable:1; } hw_vs; /* HW VS (it can be VS, TES, GS) */ } opt; @@ -468,6 +466,7 @@ enum { EXP_PARAM_DEFAULT_VAL_0001, EXP_PARAM_DEFAULT_VAL_1110, EXP_PARAM_DEFAULT_VAL_1111, + EXP_PARAM_UNDEFINED = 255, }; /* GCN-specific shader info. */ diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index e4d8747c6fe..7834f8711b6 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -858,11 +858,35 @@ static void si_shader_selector_key_hw_vs(struct si_context *sctx, struct si_shader_selector *vs, struct si_shader_key *key) { + struct si_shader_selector *ps = sctx->ps_shader.cso; + key->opt.hw_vs.clip_disable = sctx->queued.named.rasterizer->clip_plane_enable == 0 && (vs->info.clipdist_writemask || vs->info.writes_clipvertex) && !vs->info.culldist_writemask; + + /* Find out if PS is disabled. */ + bool ps_disabled = ps == NULL; + + /* Find out which VS outputs aren't used by the PS. */ + uint64_t outputs_written = vs->outputs_written; + uint32_t outputs_written2 = vs->outputs_written2; + uint64_t inputs_read = 0; + uint32_t inputs_read2 = 0; + + outputs_written &= ~0x3; /* ignore POSITION, PSIZE */ + + if (!ps_disabled) { + inputs_read = ps->inputs_read; + inputs_read2 = ps->inputs_read2; + } + + uint64_t linked = outputs_written & inputs_read; + uint32_t linked2 = outputs_written2 & inputs_read2; + + key->opt.hw_vs.kill_outputs = ~linked & outputs_written; + key->opt.hw_vs.kill_outputs2 = ~linked2 & outputs_written2; } /* Compute the key for the hw shader variant */ @@ -1785,11 +1809,16 @@ static unsigned si_get_ps_input_cntl(struct si_context *sctx, /* The input is loaded from parameter memory. */ ps_input_cntl |= S_028644_OFFSET(offset); } else if (!G_028644_PT_SPRITE_TEX(ps_input_cntl)) { - /* The input is a DEFAULT_VAL constant. */ - assert(offset >= EXP_PARAM_DEFAULT_VAL_0000 && - offset <= EXP_PARAM_DEFAULT_VAL_1111); + if (offset == EXP_PARAM_UNDEFINED) { + /* This can happen with depth-only rendering. */ + offset = 0; + } else { + /* The input is a DEFAULT_VAL constant. */ + assert(offset >= EXP_PARAM_DEFAULT_VAL_0000 && + offset <= EXP_PARAM_DEFAULT_VAL_1111); + offset -= EXP_PARAM_DEFAULT_VAL_0000; + } - offset -= EXP_PARAM_DEFAULT_VAL_0000; ps_input_cntl = S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(offset); } -- 2.30.2