From bd1186572f6924a15ea10cd72a95c6d451016bae Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Tue, 5 Feb 2019 20:22:01 +0100 Subject: [PATCH] radv: add support for push constants inlining when possible This removes some scalar loads from shaders, but it increases the number of SET_SH_REG packets. This is currently basic but it could be improved if needed. Inlining dynamic offsets might also help. Original idea from Dave Airlie. 29077 shaders in 15096 tests Totals: SGPRS: 1321325 -> 1357101 (2.71 %) VGPRS: 936000 -> 932576 (-0.37 %) Spilled SGPRs: 24804 -> 24791 (-0.05 %) Code Size: 49827960 -> 49642232 (-0.37 %) bytes Max Waves: 242007 -> 242700 (0.29 %) Totals from affected shaders: SGPRS: 290989 -> 326765 (12.29 %) VGPRS: 244680 -> 241256 (-1.40 %) Spilled SGPRs: 1442 -> 1429 (-0.90 %) Code Size: 8126688 -> 7940960 (-2.29 %) bytes Max Waves: 80952 -> 81645 (0.86 %) Signed-off-by: Samuel Pitoiset Reviewed-by: Bas Nieuwenhuizen --- src/amd/common/ac_nir_to_llvm.c | 27 +++++++++-- src/amd/common/ac_shader_abi.h | 5 ++ src/amd/vulkan/radv_cmd_buffer.c | 79 +++++++++++++++++++++++-------- src/amd/vulkan/radv_nir_to_llvm.c | 58 +++++++++++++++++++++++ src/amd/vulkan/radv_shader.h | 11 +++-- 5 files changed, 152 insertions(+), 28 deletions(-) diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index 54559b19f02..4f44e32d9f9 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -1392,10 +1392,31 @@ static LLVMValueRef visit_load_push_constant(struct ac_nir_context *ctx, nir_intrinsic_instr *instr) { LLVMValueRef ptr, addr; + LLVMValueRef src0 = get_src(ctx, instr->src[0]); + unsigned index = nir_intrinsic_base(instr); - addr = LLVMConstInt(ctx->ac.i32, nir_intrinsic_base(instr), 0); - addr = LLVMBuildAdd(ctx->ac.builder, addr, - get_src(ctx, instr->src[0]), ""); + addr = LLVMConstInt(ctx->ac.i32, index, 0); + addr = LLVMBuildAdd(ctx->ac.builder, addr, src0, ""); + + /* Load constant values from user SGPRS when possible, otherwise + * fallback to the default path that loads directly from memory. 
+ */ + if (LLVMIsConstant(src0) && + instr->dest.ssa.bit_size == 32) { + unsigned count = instr->dest.ssa.num_components; + unsigned offset = index; + + offset += LLVMConstIntGetZExtValue(src0); + offset /= 4; + + offset -= ctx->abi->base_inline_push_consts; + + if (offset + count <= ctx->abi->num_inline_push_consts) { + return ac_build_gather_values(&ctx->ac, + ctx->abi->inline_push_consts + offset, + count); + } + } ptr = ac_build_gep0(&ctx->ac, ctx->abi->push_constants, addr); diff --git a/src/amd/common/ac_shader_abi.h b/src/amd/common/ac_shader_abi.h index ee18e6c1923..c9b2c2eb4b8 100644 --- a/src/amd/common/ac_shader_abi.h +++ b/src/amd/common/ac_shader_abi.h @@ -32,6 +32,8 @@ struct nir_variable; #define AC_LLVM_MAX_OUTPUTS (VARYING_SLOT_VAR31 + 1) +#define AC_MAX_INLINE_PUSH_CONSTS 8 + enum ac_descriptor_type { AC_DESC_IMAGE, AC_DESC_FMASK, @@ -66,6 +68,9 @@ struct ac_shader_abi { /* Vulkan only */ LLVMValueRef push_constants; + LLVMValueRef inline_push_consts[AC_MAX_INLINE_PUSH_CONSTS]; + unsigned num_inline_push_consts; + unsigned base_inline_push_consts; LLVMValueRef view_index; LLVMValueRef outputs[AC_LLVM_MAX_OUTPUTS * 4]; diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index 3b215b4b103..989372e48b7 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -628,6 +628,23 @@ radv_emit_descriptor_pointers(struct radv_cmd_buffer *cmd_buffer, } } +static void +radv_emit_inline_push_consts(struct radv_cmd_buffer *cmd_buffer, + struct radv_pipeline *pipeline, + gl_shader_stage stage, + int idx, int count, uint32_t *values) +{ + struct radv_userdata_info *loc = radv_lookup_user_sgpr(pipeline, stage, idx); + uint32_t base_reg = pipeline->user_data_0[stage]; + if (loc->sgpr_idx == -1) + return; + + assert(loc->num_sgprs == count); + + radeon_set_sh_reg_seq(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, count); + radeon_emit_array(cmd_buffer->cs, values, count); +} + static void radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline) @@ -1901,6 +1918,7 @@ radv_flush_constants(struct radv_cmd_buffer *cmd_buffer, radv_get_descriptors_state(cmd_buffer, bind_point); struct radv_pipeline_layout *layout = pipeline->layout; struct radv_shader_variant *shader, *prev_shader; + bool need_push_constants = false; unsigned offset; void *ptr; uint64_t va; @@ -1910,37 +1928,56 @@ radv_flush_constants(struct radv_cmd_buffer *cmd_buffer, (!layout->push_constant_size && !layout->dynamic_offset_count)) return; - if (!radv_cmd_buffer_upload_alloc(cmd_buffer, layout->push_constant_size + - 16 * layout->dynamic_offset_count, - 256, &offset, &ptr)) - return; + radv_foreach_stage(stage, stages) { + if (!pipeline->shaders[stage]) + continue; - memcpy(ptr, cmd_buffer->push_constants, layout->push_constant_size); - memcpy((char*)ptr + layout->push_constant_size, - descriptors_state->dynamic_buffers, - 16 * layout->dynamic_offset_count); + need_push_constants |= pipeline->shaders[stage]->info.info.loads_push_constants; + need_push_constants |= pipeline->shaders[stage]->info.info.loads_dynamic_offsets; - va = radv_buffer_get_va(cmd_buffer->upload.upload_bo); - va += offset; + uint8_t base = pipeline->shaders[stage]->info.info.base_inline_push_consts; + uint8_t count = pipeline->shaders[stage]->info.info.num_inline_push_consts; - MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, - cmd_buffer->cs, MESA_SHADER_STAGES * 4); + radv_emit_inline_push_consts(cmd_buffer, pipeline, stage, + 
AC_UD_INLINE_PUSH_CONSTANTS, + count, + (uint32_t *)&cmd_buffer->push_constants[base * 4]); + } - prev_shader = NULL; - radv_foreach_stage(stage, stages) { - shader = radv_get_shader(pipeline, stage); + if (need_push_constants) { + if (!radv_cmd_buffer_upload_alloc(cmd_buffer, layout->push_constant_size + + 16 * layout->dynamic_offset_count, + 256, &offset, &ptr)) + return; + + memcpy(ptr, cmd_buffer->push_constants, layout->push_constant_size); + memcpy((char*)ptr + layout->push_constant_size, + descriptors_state->dynamic_buffers, + 16 * layout->dynamic_offset_count); + + va = radv_buffer_get_va(cmd_buffer->upload.upload_bo); + va += offset; + + MAYBE_UNUSED unsigned cdw_max = + radeon_check_space(cmd_buffer->device->ws, + cmd_buffer->cs, MESA_SHADER_STAGES * 4); + + prev_shader = NULL; + radv_foreach_stage(stage, stages) { + shader = radv_get_shader(pipeline, stage); - /* Avoid redundantly emitting the address for merged stages. */ - if (shader && shader != prev_shader) { - radv_emit_userdata_address(cmd_buffer, pipeline, stage, - AC_UD_PUSH_CONSTANTS, va); + /* Avoid redundantly emitting the address for merged stages. */ + if (shader && shader != prev_shader) { + radv_emit_userdata_address(cmd_buffer, pipeline, stage, + AC_UD_PUSH_CONSTANTS, va); - prev_shader = shader; + prev_shader = shader; + } } + assert(cmd_buffer->cs->cdw <= cdw_max); } cmd_buffer->push_constant_stages &= ~stages; - assert(cmd_buffer->cs->cdw <= cdw_max); } static void diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c index 29300aeab9f..a0ce569d409 100644 --- a/src/amd/vulkan/radv_nir_to_llvm.c +++ b/src/amd/vulkan/radv_nir_to_llvm.c @@ -627,6 +627,50 @@ count_vs_user_sgprs(struct radv_shader_context *ctx) return count; } +static void allocate_inline_push_consts(struct radv_shader_context *ctx, + struct user_sgpr_info *user_sgpr_info) +{ + uint8_t remaining_sgprs = user_sgpr_info->remaining_sgprs; + + /* Only supported if shaders use push constants. */ + if (ctx->shader_info->info.min_push_constant_used == UINT8_MAX) + return; + + /* Only supported if shaders don't have indirect push constants. */ + if (ctx->shader_info->info.has_indirect_push_constants) + return; + + /* Only supported for 32-bit push constants. */ + if (!ctx->shader_info->info.has_only_32bit_push_constants) + return; + + uint8_t num_push_consts = + (ctx->shader_info->info.max_push_constant_used - + ctx->shader_info->info.min_push_constant_used) / 4; + + /* Check if the number of user SGPRs is large enough. */ + if (num_push_consts < remaining_sgprs) { + ctx->shader_info->info.num_inline_push_consts = num_push_consts; + } else { + ctx->shader_info->info.num_inline_push_consts = remaining_sgprs; + } + + /* Clamp to the maximum number of allowed inlined push constants. */ + if (ctx->shader_info->info.num_inline_push_consts > AC_MAX_INLINE_PUSH_CONSTS) + ctx->shader_info->info.num_inline_push_consts = AC_MAX_INLINE_PUSH_CONSTS; + + if (ctx->shader_info->info.num_inline_push_consts == num_push_consts && + !ctx->shader_info->info.loads_dynamic_offsets) { + /* Disable the default push constants path if all constants are + * inlined and if shaders don't use dynamic descriptors. 
+ */ + ctx->shader_info->info.loads_push_constants = false; + } + + ctx->shader_info->info.base_inline_push_consts = + ctx->shader_info->info.min_push_constant_used / 4; +} + static void allocate_user_sgprs(struct radv_shader_context *ctx, gl_shader_stage stage, bool has_previous_stage, @@ -706,6 +750,8 @@ static void allocate_user_sgprs(struct radv_shader_context *ctx, } else { user_sgpr_info->remaining_sgprs = remaining_sgprs - num_desc_set; } + + allocate_inline_push_consts(ctx, user_sgpr_info); } static void @@ -735,6 +781,13 @@ declare_global_input_sgprs(struct radv_shader_context *ctx, add_arg(args, ARG_SGPR, type, &ctx->abi.push_constants); } + for (unsigned i = 0; i < ctx->shader_info->info.num_inline_push_consts; i++) { + add_arg(args, ARG_SGPR, ctx->ac.i32, + &ctx->abi.inline_push_consts[i]); + } + ctx->abi.num_inline_push_consts = ctx->shader_info->info.num_inline_push_consts; + ctx->abi.base_inline_push_consts = ctx->shader_info->info.base_inline_push_consts; + if (ctx->shader_info->info.so.num_outputs) { add_arg(args, ARG_SGPR, ac_array_in_const32_addr_space(ctx->ac.v4i32), @@ -853,6 +906,11 @@ set_global_input_locs(struct radv_shader_context *ctx, set_loc_shader_ptr(ctx, AC_UD_PUSH_CONSTANTS, user_sgpr_idx); } + if (ctx->shader_info->info.num_inline_push_consts) { + set_loc_shader(ctx, AC_UD_INLINE_PUSH_CONSTANTS, user_sgpr_idx, + ctx->shader_info->info.num_inline_push_consts); + } + if (ctx->streamout_buffers) { set_loc_shader_ptr(ctx, AC_UD_STREAMOUT_BUFFERS, user_sgpr_idx); diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h index c194401c02d..e0d27378724 100644 --- a/src/amd/vulkan/radv_shader.h +++ b/src/amd/vulkan/radv_shader.h @@ -129,10 +129,11 @@ struct radv_nir_compiler_options { enum radv_ud_index { AC_UD_SCRATCH_RING_OFFSETS = 0, AC_UD_PUSH_CONSTANTS = 1, - AC_UD_INDIRECT_DESCRIPTOR_SETS = 2, - AC_UD_VIEW_INDEX = 3, - AC_UD_STREAMOUT_BUFFERS = 4, - AC_UD_SHADER_START = 5, + AC_UD_INLINE_PUSH_CONSTANTS = 2, + AC_UD_INDIRECT_DESCRIPTOR_SETS = 3, + AC_UD_VIEW_INDEX = 4, + AC_UD_STREAMOUT_BUFFERS = 5, + AC_UD_SHADER_START = 6, AC_UD_VS_VERTEX_BUFFERS = AC_UD_SHADER_START, AC_UD_VS_BASE_VERTEX_START_INSTANCE, AC_UD_VS_MAX_UD, @@ -167,6 +168,8 @@ struct radv_shader_info { uint8_t max_push_constant_used; bool has_only_32bit_push_constants; bool has_indirect_push_constants; + uint8_t num_inline_push_consts; + uint8_t base_inline_push_consts; uint32_t desc_set_used_mask; bool needs_multiview_view_index; bool uses_invocation_id; -- 2.30.2
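For readers less familiar with this path, below is a small standalone C sketch (not RADV code; the struct, helper names and values are made up for illustration) of the arithmetic the patch uses on the compiler side. It assumes min_push_constant_used and max_push_constant_used are byte offsets with the max exclusive, as the shader info pass presumably gathers them (that pass is not part of this patch); per the checks in allocate_inline_push_consts() and visit_load_push_constant(), only 32-bit push constant loads with constant offsets qualify.

/* Standalone sketch of the inlining arithmetic from this patch.
 * The struct only mimics the radv_shader_info fields used here; the
 * example values (a shader reading 16 bytes at byte offset 16) are
 * hypothetical.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define AC_MAX_INLINE_PUSH_CONSTS 8

struct example_info {
	uint8_t min_push_constant_used;  /* byte offset of first dword used */
	uint8_t max_push_constant_used;  /* exclusive byte end of last dword */
	uint8_t num_inline_push_consts;  /* dwords promoted to user SGPRs */
	uint8_t base_inline_push_consts; /* dword index of the first one */
};

/* Mirrors allocate_inline_push_consts(): clamp the statically used dword
 * range to the user SGPRs still free and to AC_MAX_INLINE_PUSH_CONSTS.
 * The real function bails out first for indirect or non-32-bit push
 * constant use; those checks are omitted here. */
static void
allocate(struct example_info *info, uint8_t remaining_sgprs)
{
	uint8_t num = (info->max_push_constant_used -
		       info->min_push_constant_used) / 4;

	if (num > remaining_sgprs)
		num = remaining_sgprs;
	if (num > AC_MAX_INLINE_PUSH_CONSTS)
		num = AC_MAX_INLINE_PUSH_CONSTS;

	info->num_inline_push_consts = num;
	info->base_inline_push_consts = info->min_push_constant_used / 4;
}

/* Mirrors the check added to visit_load_push_constant(): a constant-offset
 * 32-bit load is served from the inlined SGPRs if its dword range is
 * covered. base + const_offset never falls below the inlined base because
 * min_push_constant_used tracks the smallest constant offset seen. */
static bool
load_is_inlined(const struct example_info *info,
		unsigned base, unsigned const_offset, unsigned components)
{
	unsigned dword = (base + const_offset) / 4 -
			 info->base_inline_push_consts;

	return dword + components <= info->num_inline_push_consts;
}

int
main(void)
{
	/* Hypothetical shader: only reads 16 bytes of push constants
	 * starting at byte offset 16, always with constant offsets. */
	struct example_info info = {
		.min_push_constant_used = 16,
		.max_push_constant_used = 32,
	};

	allocate(&info, 6 /* user SGPRs assumed to be left over */);

	/* 4 dwords get inlined, starting at dword index 4 (byte 16). */
	assert(info.num_inline_push_consts == 4);
	assert(info.base_inline_push_consts == 4);

	/* vec2 load at byte 24: dwords 6..7 are covered, so it is served
	 * from user SGPRs instead of a scalar buffer load. */
	printf("vec2 @24 inlined: %d\n", load_is_inlined(&info, 24, 0, 2));
	/* A vec4 load at byte 24 would spill past the inlined range and
	 * take the regular push constant pointer path. */
	printf("vec4 @24 inlined: %d\n", load_is_inlined(&info, 24, 0, 4));
	return 0;
}

On the command buffer side, radv_flush_constants() then copies those dwords straight from cmd_buffer->push_constants into each stage's AC_UD_INLINE_PUSH_CONSTANTS user SGPRs (the extra SET_SH_REG traffic mentioned in the log), and only takes the upload-buffer path when some stage still needs the full push constant pointer or dynamic offsets.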