From 6eade342eb223313242c1c2a7615b6bd75036087 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Tue, 5 Sep 2017 13:40:59 +0200 Subject: [PATCH] radeonsi: optimize TCS epilog when invocation 0 writes tess factors MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit This removes the barrier and LDS stores and loads for tess factors when it's possible. The removal of the barrier seems more important to me though. In one shader, it removes 17 * 4 bytes from the shader binary. Reviewed-by: Nicolai Hähnle --- src/gallium/auxiliary/tgsi/tgsi_scan.c | 2 - src/gallium/drivers/radeonsi/si_shader.c | 111 +++++++++++++----- src/gallium/drivers/radeonsi/si_shader.h | 2 + .../drivers/radeonsi/si_shader_internal.h | 1 + .../drivers/radeonsi/si_state_shaders.c | 3 + 5 files changed, 89 insertions(+), 30 deletions(-) diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.c b/src/gallium/auxiliary/tgsi/tgsi_scan.c index b8932891e4c..212d1bb95a8 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_scan.c +++ b/src/gallium/auxiliary/tgsi/tgsi_scan.c @@ -1139,7 +1139,6 @@ tgsi_scan_tess_ctrl(const struct tgsi_token *tokens, if (main_block_tf_writemask || cond_block_tf_writemask) { /* Accumulate the result: */ out->tessfactors_are_def_in_all_invocs &= - main_block_tf_writemask && !(cond_block_tf_writemask & ~main_block_tf_writemask); /* Analyze the next code segment from scratch. */ @@ -1155,7 +1154,6 @@ tgsi_scan_tess_ctrl(const struct tgsi_token *tokens, /* Accumulate the result for the last code segment separated by a barrier. */ if (main_block_tf_writemask || cond_block_tf_writemask) { out->tessfactors_are_def_in_all_invocs &= - main_block_tf_writemask && !(cond_block_tf_writemask & ~main_block_tf_writemask); } diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index e7888e6012b..43619dd329d 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -1150,7 +1150,7 @@ static void store_output_tcs(struct lp_build_tgsi_context *bld_base, LLVMValueRef buffer, base, buf_addr; LLVMValueRef values[4]; bool skip_lds_store; - bool is_tess_factor = false; + bool is_tess_factor = false, is_tess_inner = false; /* Only handle per-patch and per-vertex outputs here. * Vectors will be lowered to scalars and this function will be called again. @@ -1177,8 +1177,11 @@ static void store_output_tcs(struct lp_build_tgsi_context *bld_base, /* Always write tess factors into LDS for the TCS epilog. */ if (name == TGSI_SEMANTIC_TESSINNER || name == TGSI_SEMANTIC_TESSOUTER) { - skip_lds_store = false; + /* The epilog doesn't read LDS if invocation 0 defines tess factors. */ + skip_lds_store = !sh_info->reads_tessfactor_outputs && + ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs; is_tess_factor = true; + is_tess_inner = name == TGSI_SEMANTIC_TESSINNER; } } } @@ -1207,6 +1210,18 @@ static void store_output_tcs(struct lp_build_tgsi_context *bld_base, buf_addr, base, 4 * chan_index, 1, 0, true, false); } + + /* Write tess factors into VGPRs for the epilog. */ + if (is_tess_factor && + ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs) { + if (!is_tess_inner) { + LLVMBuildStore(gallivm->builder, value, /* outer */ + ctx->invoc0_tess_factors[chan_index]); + } else if (chan_index < 2) { + LLVMBuildStore(gallivm->builder, value, /* inner */ + ctx->invoc0_tess_factors[4 + chan_index]); + } + } } if (inst->Dst[0].Register.WriteMask == 0xF && !is_tess_factor) { @@ -2671,7 +2686,9 @@ static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base) static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base, LLVMValueRef rel_patch_id, LLVMValueRef invocation_id, - LLVMValueRef tcs_out_current_patch_data_offset) + LLVMValueRef tcs_out_current_patch_data_offset, + LLVMValueRef invoc0_tf_outer[4], + LLVMValueRef invoc0_tf_inner[2]) { struct si_shader_context *ctx = si_shader_context(bld_base); struct gallivm_state *gallivm = &ctx->gallivm; @@ -2682,7 +2699,9 @@ static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base, unsigned stride, outer_comps, inner_comps, i, offset; struct lp_build_if_state if_ctx, inner_if_ctx; - si_llvm_emit_barrier(NULL, bld_base, NULL); + /* Add a barrier before loading tess factors from LDS. */ + if (!shader->key.part.tcs.epilog.invoc0_tess_factors_are_def) + si_llvm_emit_barrier(NULL, bld_base, NULL); /* Do this only for invocation 0, because the tess levels are per-patch, * not per-vertex. @@ -2716,32 +2735,32 @@ static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base, return; } - /* Load tess_inner and tess_outer from LDS. - * Any invocation can write them, so we can't get them from a temporary. - */ - tess_inner_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0); - tess_outer_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0); - - lds_base = tcs_out_current_patch_data_offset; - lds_inner = LLVMBuildAdd(gallivm->builder, lds_base, - LLVMConstInt(ctx->i32, - tess_inner_index * 4, 0), ""); - lds_outer = LLVMBuildAdd(gallivm->builder, lds_base, - LLVMConstInt(ctx->i32, - tess_outer_index * 4, 0), ""); - for (i = 0; i < 4; i++) { inner[i] = LLVMGetUndef(ctx->i32); outer[i] = LLVMGetUndef(ctx->i32); } - if (shader->key.part.tcs.epilog.prim_mode == PIPE_PRIM_LINES) { - /* For isolines, the hardware expects tess factors in the - * reverse order from what GLSL / TGSI specify. - */ - outer[0] = out[1] = lds_load(bld_base, TGSI_TYPE_SIGNED, 0, lds_outer); - outer[1] = out[0] = lds_load(bld_base, TGSI_TYPE_SIGNED, 1, lds_outer); + if (shader->key.part.tcs.epilog.invoc0_tess_factors_are_def) { + /* Tess factors are in VGPRs. */ + for (i = 0; i < outer_comps; i++) + outer[i] = out[i] = invoc0_tf_outer[i]; + for (i = 0; i < inner_comps; i++) + inner[i] = out[outer_comps+i] = invoc0_tf_inner[i]; } else { + /* Load tess_inner and tess_outer from LDS. + * Any invocation can write them, so we can't get them from a temporary. + */ + tess_inner_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0); + tess_outer_index = si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0); + + lds_base = tcs_out_current_patch_data_offset; + lds_inner = LLVMBuildAdd(gallivm->builder, lds_base, + LLVMConstInt(ctx->i32, + tess_inner_index * 4, 0), ""); + lds_outer = LLVMBuildAdd(gallivm->builder, lds_base, + LLVMConstInt(ctx->i32, + tess_outer_index * 4, 0), ""); + for (i = 0; i < outer_comps; i++) { outer[i] = out[i] = lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_outer); @@ -2752,6 +2771,15 @@ static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base, } } + if (shader->key.part.tcs.epilog.prim_mode == PIPE_PRIM_LINES) { + /* For isolines, the hardware expects tess factors in the + * reverse order from what GLSL / TGSI specify. + */ + LLVMValueRef tmp = out[0]; + out[0] = out[1]; + out[1] = tmp; + } + /* Convert the outputs to vectors for stores. */ vec0 = lp_build_gather_values(gallivm, out, MIN2(stride, 4)); vec1 = NULL; @@ -2946,7 +2974,18 @@ static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base) ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, ""); ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, ""); - ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, ""); + + if (ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs) { + vgpr++; /* skip the tess factor LDS offset */ + for (unsigned i = 0; i < 6; i++) { + LLVMValueRef value = + LLVMBuildLoad(builder, ctx->invoc0_tess_factors[i], ""); + value = bitcast(bld_base, TGSI_TYPE_FLOAT, value); + ret = LLVMBuildInsertValue(builder, ret, value, vgpr++, ""); + } + } else { + ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, ""); + } ctx->return_value = ret; } @@ -4330,7 +4369,7 @@ static void create_function(struct si_shader_context *ctx) */ for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++) returns[num_returns++] = ctx->i32; /* SGPRs */ - for (i = 0; i < 5; i++) + for (i = 0; i < 11; i++) returns[num_returns++] = ctx->f32; /* VGPRs */ break; @@ -4387,7 +4426,7 @@ static void create_function(struct si_shader_context *ctx) */ for (i = 0; i <= 8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K; i++) returns[num_returns++] = ctx->i32; /* SGPRs */ - for (i = 0; i < 5; i++) + for (i = 0; i < 11; i++) returns[num_returns++] = ctx->f32; /* VGPRs */ } break; @@ -5692,6 +5731,14 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx, } } + if (ctx->type == PIPE_SHADER_TESS_CTRL && + sel->tcs_info.tessfactors_are_def_in_all_invocs) { + for (unsigned i = 0; i < 6; i++) { + ctx->invoc0_tess_factors[i] = + lp_build_alloca_undef(&ctx->gallivm, ctx->i32, ""); + } + } + if (ctx->type == PIPE_SHADER_GEOMETRY) { int i; for (i = 0; i < 4; i++) { @@ -6926,16 +6973,24 @@ static void si_build_tcs_epilog_function(struct si_shader_context *ctx, add_arg(&fninfo, ARG_VGPR, ctx->i32); /* invocation ID within the patch */ add_arg(&fninfo, ARG_VGPR, ctx->i32); /* LDS offset where tess factors should be loaded from */ + for (unsigned i = 0; i < 6; i++) + add_arg(&fninfo, ARG_VGPR, ctx->i32); /* tess factors */ + /* Create the function. */ si_create_function(ctx, "tcs_epilog", NULL, 0, &fninfo, ctx->screen->b.chip_class >= CIK ? 128 : 64); declare_lds_as_pointer(ctx); func = ctx->main_fn; + LLVMValueRef invoc0_tess_factors[6]; + for (unsigned i = 0; i < 6; i++) + invoc0_tess_factors[i] = LLVMGetParam(func, tess_factors_idx + 3 + i); + si_write_tess_factors(bld_base, LLVMGetParam(func, tess_factors_idx), LLVMGetParam(func, tess_factors_idx + 1), - LLVMGetParam(func, tess_factors_idx + 2)); + LLVMGetParam(func, tess_factors_idx + 2), + invoc0_tess_factors, invoc0_tess_factors + 4); LLVMBuildRetVoid(gallivm->builder); } diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index ee6b0c167f9..4592ac551ce 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -327,6 +327,7 @@ struct si_shader_selector { struct nir_shader *nir; struct pipe_stream_output_info so; struct tgsi_shader_info info; + struct tgsi_tessctrl_info tcs_info; /* PIPE_SHADER_[VERTEX|FRAGMENT|...] */ unsigned type; @@ -404,6 +405,7 @@ struct si_vs_prolog_bits { /* Common TCS bits between the shader key and the epilog key. */ struct si_tcs_epilog_bits { unsigned prim_mode:3; + unsigned invoc0_tess_factors_are_def:1; unsigned tes_reads_tess_factors:1; }; diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h index ad29ab7e845..023f9a6a093 100644 --- a/src/gallium/drivers/radeonsi/si_shader_internal.h +++ b/src/gallium/drivers/radeonsi/si_shader_internal.h @@ -209,6 +209,7 @@ struct si_shader_context { LLVMValueRef gsvs_ring[4]; LLVMValueRef lds; + LLVMValueRef invoc0_tess_factors[6]; /* outer[4], inner[2] */ LLVMValueRef gs_next_vertex[4]; LLVMValueRef postponed_kill; LLVMValueRef return_value; diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 9f76551cfbb..6398111e5a6 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -1301,6 +1301,8 @@ static inline void si_shader_selector_key(struct pipe_context *ctx, key->part.tcs.epilog.prim_mode = sctx->tes_shader.cso->info.properties[TGSI_PROPERTY_TES_PRIM_MODE]; + key->part.tcs.epilog.invoc0_tess_factors_are_def = + sel->tcs_info.tessfactors_are_def_in_all_invocs; key->part.tcs.epilog.tes_reads_tess_factors = sctx->tes_shader.cso->info.reads_tess_factors; @@ -2004,6 +2006,7 @@ static void *si_create_shader_selector(struct pipe_context *ctx, } tgsi_scan_shader(state->tokens, &sel->info); + tgsi_scan_tess_ctrl(state->tokens, &sel->info, &sel->tcs_info); } else { assert(state->type == PIPE_SHADER_IR_NIR); -- 2.30.2