From: Marek Olšák Date: Tue, 13 Sep 2016 15:33:23 +0000 (+0200) Subject: radeonsi: reload PS inputs with direct indexing at each use (v2) X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=ab29788250a705eb0dd517cb3d38f37f944eb8ad;p=mesa.git radeonsi: reload PS inputs with direct indexing at each use (v2) The LLVM compiler can CSE interp intrinsics thanks to LLVMReadNoneAttribute. 26011 shaders in 14651 tests Totals: SGPRS: 1146340 -> 1132676 (-1.19 %) VGPRS: 727371 -> 711730 (-2.15 %) Spilled SGPRs: 2218 -> 2078 (-6.31 %) Spilled VGPRs: 369 -> 369 (0.00 %) Scratch VGPRs: 1344 -> 1344 (0.00 %) dwords per thread Code Size: 35841268 -> 36009732 (0.47 %) bytes LDS: 767 -> 767 (0.00 %) blocks Max Waves: 222559 -> 224779 (1.00 %) Wait states: 0 -> 0 (0.00 %) v2: don't call load_input for fragment shaders in emit_declaration Reviewed-by: Nicolai Hähnle --- diff --git a/src/gallium/drivers/radeon/radeon_llvm.h b/src/gallium/drivers/radeon/radeon_llvm.h index da5b7f58e15..f508d3230cc 100644 --- a/src/gallium/drivers/radeon/radeon_llvm.h +++ b/src/gallium/drivers/radeon/radeon_llvm.h @@ -30,7 +30,9 @@ #include <llvm-c/Core.h> #include "gallivm/lp_bld_init.h" #include "gallivm/lp_bld_tgsi.h" +#include "tgsi/tgsi_parse.h" +#define RADEON_LLVM_MAX_INPUT_SLOTS 32 #define RADEON_LLVM_MAX_INPUTS 32 * 4 #define RADEON_LLVM_MAX_OUTPUTS 32 * 4 @@ -62,7 +64,8 @@ struct radeon_llvm_context { */ void (*load_input)(struct radeon_llvm_context *, unsigned input_index, - const struct tgsi_full_declaration *decl); + const struct tgsi_full_declaration *decl, + LLVMValueRef out[4]); void (*load_system_value)(struct radeon_llvm_context *, unsigned index, @@ -75,6 +78,7 @@ struct radeon_llvm_context { * values will be in the form of a target intrinsic that will inform the * backend how to load the actual inputs to the shader. 
*/ + struct tgsi_full_declaration input_decls[RADEON_LLVM_MAX_INPUT_SLOTS]; LLVMValueRef inputs[RADEON_LLVM_MAX_INPUTS]; LLVMValueRef outputs[RADEON_LLVM_MAX_OUTPUTS][TGSI_NUM_CHANNELS]; diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c index 4643e6d0ce7..4fa43cd2342 100644 --- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c +++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c @@ -446,14 +446,29 @@ LLVMValueRef radeon_llvm_emit_fetch(struct lp_build_tgsi_context *bld_base, } } - case TGSI_FILE_INPUT: - result = ctx->inputs[radeon_llvm_reg_index_soa(reg->Register.Index, swizzle)]; + case TGSI_FILE_INPUT: { + unsigned index = reg->Register.Index; + LLVMValueRef input[4]; + + /* I don't think doing this for vertex shaders is beneficial. + * For those, we want to make sure the VMEM loads are executed + * only once. Fragment shaders don't care much, because + * v_interp instructions are much cheaper than VMEM loads. 
+ */ + if (ctx->soa.bld_base.info->processor == PIPE_SHADER_FRAGMENT) + ctx->load_input(ctx, index, &ctx->input_decls[index], input); + else + memcpy(input, &ctx->inputs[index * 4], sizeof(input)); + + result = input[swizzle]; + if (tgsi_type_is_64bit(type)) { ptr = result; - ptr2 = ctx->inputs[radeon_llvm_reg_index_soa(reg->Register.Index, swizzle + 1)]; + ptr2 = input[swizzle + 1]; return radeon_llvm_emit_fetch_64bit(bld_base, type, ptr, ptr2); } break; + } case TGSI_FILE_TEMPORARY: if (reg->Register.Index >= ctx->temps_count) @@ -626,8 +641,13 @@ static void emit_declaration(struct lp_build_tgsi_context *bld_base, { unsigned idx; for (idx = decl->Range.First; idx <= decl->Range.Last; idx++) { - if (ctx->load_input) - ctx->load_input(ctx, idx, decl); + if (ctx->load_input) { + ctx->input_decls[idx] = *decl; + + if (bld_base->info->processor != PIPE_SHADER_FRAGMENT) + ctx->load_input(ctx, idx, decl, + &ctx->inputs[idx * 4]); + } } } break; diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index b034837d252..3ccff7ae9e0 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -440,7 +440,8 @@ static LLVMValueRef get_instance_index_for_fetch( static void declare_input_vs( struct radeon_llvm_context *radeon_bld, unsigned input_index, - const struct tgsi_full_declaration *decl) + const struct tgsi_full_declaration *decl, + LLVMValueRef out[4]) { struct lp_build_context *base = &radeon_bld->soa.bld_base.base; struct gallivm_state *gallivm = base->gallivm; @@ -498,11 +499,8 @@ static void declare_input_vs( /* Break up the vec4 into individual components */ for (chan = 0; chan < 4; chan++) { LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan); - /* XXX: Use a helper function for this. There is one in - * tgsi_llvm.c. 
*/ - ctx->radeon_bld.inputs[radeon_llvm_reg_index_soa(input_index, chan)] = - LLVMBuildExtractElement(gallivm->builder, - input, llvm_chan, ""); + out[chan] = LLVMBuildExtractElement(gallivm->builder, + input, llvm_chan, ""); } } @@ -1463,7 +1461,8 @@ static LLVMValueRef get_interp_param(struct si_shader_context *ctx, static void declare_input_fs( struct radeon_llvm_context *radeon_bld, unsigned input_index, - const struct tgsi_full_declaration *decl) + const struct tgsi_full_declaration *decl, + LLVMValueRef out[4]) { struct lp_build_context *base = &radeon_bld->soa.bld_base.base; struct si_shader_context *ctx = @@ -1482,14 +1481,10 @@ static void declare_input_fs( unsigned offset = SI_PARAM_POS_FIXED_PT + 1 + (i ? util_bitcount(colors_read & 0xf) : 0); - radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)] = - mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef; - radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 1)] = - mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef; - radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 2)] = - mask & 0x4 ? LLVMGetParam(main_fn, offset++) : base->undef; - radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 3)] = - mask & 0x8 ? LLVMGetParam(main_fn, offset++) : base->undef; + out[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef; + out[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef; + out[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : base->undef; + out[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : base->undef; return; } @@ -1513,7 +1508,7 @@ static void declare_input_fs( shader->selector->info.colors_read, interp_param, LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK), LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE), - &radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)]); + &out[0]); } static LLVMValueRef get_sample_id(struct radeon_llvm_context *radeon_bld)