From 85a6bcca615f9aae1ffd2a1e790ee5d980e7cc43 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Thu, 21 May 2020 05:13:25 -0400 Subject: [PATCH] radeonsi: pass at most 3 images and/or shader buffers via user SGPRs for compute This should slightly decrease shader lifetime. Acked-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/si_compute.c | 38 +++++++++++-- src/gallium/drivers/radeonsi/si_descriptors.c | 54 +++++++++++++++++++ src/gallium/drivers/radeonsi/si_pipe.h | 2 + src/gallium/drivers/radeonsi/si_shader.c | 18 +++++++ src/gallium/drivers/radeonsi/si_shader.h | 6 +++ .../drivers/radeonsi/si_shader_internal.h | 2 + .../radeonsi/si_shader_llvm_resources.c | 12 +++++ src/gallium/drivers/radeonsi/si_shader_nir.c | 1 + 8 files changed, 130 insertions(+), 3 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c index 73b2a69923e..b3dea264982 100644 --- a/src/gallium/drivers/radeonsi/si_compute.c +++ b/src/gallium/drivers/radeonsi/si_compute.c @@ -141,6 +141,38 @@ static void si_create_compute_state_async(void *job, int thread_index) program->num_cs_user_data_dwords = sel->info.properties[TGSI_PROPERTY_CS_USER_DATA_COMPONENTS_AMD]; + unsigned user_sgprs = SI_NUM_RESOURCE_SGPRS + (sel->info.uses_grid_size ? 3 : 0) + + (program->reads_variable_block_size ? 3 : 0) + + program->num_cs_user_data_dwords; + + /* Fast path for compute shaders - some descriptors passed via user SGPRs. */ + /* Shader buffers in user SGPRs. */ + for (unsigned i = 0; i < 3 && user_sgprs <= 12 && sel->info.shader_buffers_declared & (1 << i); i++) { + user_sgprs = align(user_sgprs, 4); + if (i == 0) + sel->cs_shaderbufs_sgpr_index = user_sgprs; + user_sgprs += 4; + sel->cs_num_shaderbufs_in_user_sgprs++; + } + + /* Images in user SGPRs. */ + unsigned non_msaa_images = sel->info.images_declared & ~sel->info.msaa_images_declared; + + for (unsigned i = 0; i < 3 && non_msaa_images & (1 << i); i++) { + unsigned num_sgprs = sel->info.image_buffers & (1 << i) ? 4 : 8; + + if (align(user_sgprs, num_sgprs) + num_sgprs > 16) + break; + + user_sgprs = align(user_sgprs, num_sgprs); + if (i == 0) + sel->cs_images_sgpr_index = user_sgprs; + user_sgprs += num_sgprs; + sel->cs_num_images_in_user_sgprs++; + } + sel->cs_images_num_sgprs = user_sgprs - sel->cs_images_sgpr_index; + assert(user_sgprs <= 16); + unsigned char ir_sha1_cache_key[20]; si_get_ir_cache_key(sel, false, false, ir_sha1_cache_key); @@ -164,9 +196,6 @@ static void si_create_compute_state_async(void *job, int thread_index) } bool scratch_enabled = shader->config.scratch_bytes_per_wave > 0; - unsigned user_sgprs = SI_NUM_RESOURCE_SGPRS + (sel->info.uses_grid_size ? 3 : 0) + - (program->reads_variable_block_size ? 3 : 0) + - program->num_cs_user_data_dwords; shader->config.rsrc1 = S_00B848_VGPRS((shader->config.num_vgprs - 1) / (sscreen->compute_wave_size == 32 ? 8 : 4)) | @@ -275,6 +304,9 @@ static void si_bind_compute_state(struct pipe_context *ctx, void *state) sel->active_const_and_shader_buffers); si_set_active_descriptors(sctx, SI_DESCS_FIRST_COMPUTE + SI_SHADER_DESCS_SAMPLERS_AND_IMAGES, sel->active_samplers_and_images); + + sctx->compute_shaderbuf_sgprs_dirty = true; + sctx->compute_image_sgprs_dirty = true; } static void si_set_global_binding(struct pipe_context *ctx, unsigned first, unsigned n, diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c index 3e354d106b0..865f0b1b771 100644 --- a/src/gallium/drivers/radeonsi/si_descriptors.c +++ b/src/gallium/drivers/radeonsi/si_descriptors.c @@ -812,6 +812,11 @@ static void si_set_shader_images(struct pipe_context *pipe, enum pipe_shader_typ si_set_shader_image(ctx, shader, slot, NULL, false); } + if (shader == PIPE_SHADER_COMPUTE && + ctx->cs_shader_state.program && + start_slot < ctx->cs_shader_state.program->sel.cs_num_images_in_user_sgprs) + ctx->compute_image_sgprs_dirty = true; + si_update_shader_needs_decompress_mask(ctx, shader); } @@ -1338,6 +1343,11 @@ static void si_set_shader_buffers(struct pipe_context *ctx, enum pipe_shader_typ assert(start_slot + count <= SI_NUM_SHADER_BUFFERS); + if (shader == PIPE_SHADER_COMPUTE && + sctx->cs_shader_state.program && + start_slot < sctx->cs_shader_state.program->sel.cs_num_shaderbufs_in_user_sgprs) + sctx->compute_shaderbuf_sgprs_dirty = true; + for (i = 0; i < count; ++i) { const struct pipe_shader_buffer *sbuffer = sbuffers ? &sbuffers[i] : NULL; unsigned slot = si_get_shaderbuf_slot(start_slot + i); @@ -1939,6 +1949,8 @@ void si_shader_pointers_mark_dirty(struct si_context *sctx) si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers); sctx->graphics_bindless_pointer_dirty = sctx->bindless_descriptors.buffer != NULL; sctx->compute_bindless_pointer_dirty = sctx->bindless_descriptors.buffer != NULL; + sctx->compute_shaderbuf_sgprs_dirty = true; + sctx->compute_image_sgprs_dirty = true; } /* Set a base register address for user data constants in the given shader. @@ -2137,6 +2149,8 @@ void si_emit_graphics_shader_pointers(struct si_context *sctx) void si_emit_compute_shader_pointers(struct si_context *sctx) { + struct radeon_cmdbuf *cs = sctx->gfx_cs; + struct si_shader_selector *shader = &sctx->cs_shader_state.program->sel; unsigned base = R_00B900_COMPUTE_USER_DATA_0; si_emit_consecutive_shader_pointers(sctx, SI_DESCS_SHADER_MASK(COMPUTE), @@ -2147,6 +2161,46 @@ void si_emit_compute_shader_pointers(struct si_context *sctx) si_emit_shader_pointer(sctx, &sctx->bindless_descriptors, base); sctx->compute_bindless_pointer_dirty = false; } + + /* Set shader buffer descriptors in user SGPRs. */ + unsigned num_shaderbufs = shader->cs_num_shaderbufs_in_user_sgprs; + if (num_shaderbufs && sctx->compute_shaderbuf_sgprs_dirty) { + struct si_descriptors *desc = si_const_and_shader_buffer_descriptors(sctx, PIPE_SHADER_COMPUTE); + + si_emit_shader_pointer_head(cs, R_00B900_COMPUTE_USER_DATA_0 + + shader->cs_shaderbufs_sgpr_index * 4, + num_shaderbufs * 4); + + for (unsigned i = 0; i < num_shaderbufs; i++) + radeon_emit_array(cs, &desc->list[si_get_shaderbuf_slot(i) * 4], 4); + + sctx->compute_shaderbuf_sgprs_dirty = false; + } + + /* Set image descriptors in user SGPRs. */ + unsigned num_images = shader->cs_num_images_in_user_sgprs; + if (num_images && sctx->compute_image_sgprs_dirty) { + struct si_descriptors *desc = si_sampler_and_image_descriptors(sctx, PIPE_SHADER_COMPUTE); + + si_emit_shader_pointer_head(cs, R_00B900_COMPUTE_USER_DATA_0 + + shader->cs_images_sgpr_index * 4, + shader->cs_images_num_sgprs); + + for (unsigned i = 0; i < num_images; i++) { + unsigned desc_offset = si_get_image_slot(i) * 8; + unsigned num_sgprs = 8; + + /* Image buffers are in desc[4..7]. */ + if (shader->info.image_buffers & (1 << i)) { + desc_offset += 4; + num_sgprs = 4; + } + + radeon_emit_array(cs, &desc->list[desc_offset], num_sgprs); + } + + sctx->compute_image_sgprs_dirty = false; + } } /* BINDLESS */ diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 9e5a8e87df4..df03eb75963 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -1030,6 +1030,8 @@ struct si_context { unsigned cs_max_waves_per_sh; bool flatshade; bool do_update_shaders; + bool compute_shaderbuf_sgprs_dirty; + bool compute_image_sgprs_dirty; /* shader descriptors */ struct si_descriptors descriptors[SI_NUM_DESCS]; diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 7ae29880adb..011abed49f3 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -698,6 +698,24 @@ void si_create_function(struct si_shader_context *ctx, bool ngg_cull_shader) ac_add_arg(&ctx->args, AC_ARG_SGPR, cs_user_data_dwords, AC_ARG_INT, &ctx->cs_user_data); } + /* Some descriptors can be in user SGPRs. */ + /* Shader buffers in user SGPRs. */ + for (unsigned i = 0; i < shader->selector->cs_num_shaderbufs_in_user_sgprs; i++) { + while (ctx->args.num_sgprs_used % 4 != 0) + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + + ac_add_arg(&ctx->args, AC_ARG_SGPR, 4, AC_ARG_INT, &ctx->cs_shaderbuf[i]); + } + /* Images in user SGPRs. */ + for (unsigned i = 0; i < shader->selector->cs_num_images_in_user_sgprs; i++) { + unsigned num_sgprs = shader->selector->info.image_buffers & (1 << i) ? 4 : 8; + + while (ctx->args.num_sgprs_used % num_sgprs != 0) + ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, NULL); + + ac_add_arg(&ctx->args, AC_ARG_SGPR, num_sgprs, AC_ARG_INT, &ctx->cs_image[i]); + } + /* Hardware SGPRs. */ for (i = 0; i < 3; i++) { if (shader->selector->info.uses_block_id[i]) { diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index 4b3bdf4a30e..756bcd5a90e 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -394,6 +394,7 @@ struct si_shader_info { unsigned num_written_clipdistance; unsigned images_declared; /**< bitmask of declared images */ + unsigned image_buffers; /**< bitmask of images that are buffers */ unsigned msaa_images_declared; /**< bitmask of declared MSAA images */ unsigned shader_buffers_declared; /**< bitmask of declared shader buffers */ @@ -439,6 +440,11 @@ struct si_shader_selector { bool vs_needs_prolog; bool prim_discard_cs_allowed; bool ngg_culling_allowed; + ubyte cs_shaderbufs_sgpr_index; + ubyte cs_num_shaderbufs_in_user_sgprs; + ubyte cs_images_sgpr_index; + ubyte cs_images_num_sgprs; + ubyte cs_num_images_in_user_sgprs; unsigned num_vs_inputs; unsigned num_vbos_in_user_sgprs; unsigned pa_cl_vs_out_cntl; diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h index 2191604b706..2aba488b35b 100644 --- a/src/gallium/drivers/radeonsi/si_shader_internal.h +++ b/src/gallium/drivers/radeonsi/si_shader_internal.h @@ -171,6 +171,8 @@ struct si_shader_context { /* CS */ struct ac_arg block_size; struct ac_arg cs_user_data; + struct ac_arg cs_shaderbuf[3]; + struct ac_arg cs_image[3]; struct ac_llvm_compiler *compiler; diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_resources.c b/src/gallium/drivers/radeonsi/si_shader_llvm_resources.c index 122e6976261..ebde256c9b0 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_resources.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_resources.c @@ -107,6 +107,12 @@ static LLVMValueRef load_ubo(struct ac_shader_abi *abi, LLVMValueRef index) static LLVMValueRef load_ssbo(struct ac_shader_abi *abi, LLVMValueRef index, bool write) { struct si_shader_context *ctx = si_shader_context_from_abi(abi); + + /* Fast path if the shader buffer is in user SGPRs. */ + if (LLVMIsConstant(index) && + LLVMConstIntGetZExtValue(index) < ctx->shader->selector->cs_num_shaderbufs_in_user_sgprs) + return ac_get_arg(&ctx->ac, ctx->cs_shaderbuf[LLVMConstIntGetZExtValue(index)]); + LLVMValueRef rsrc_ptr = ac_get_arg(&ctx->ac, ctx->const_and_shader_buffers); index = si_llvm_bound_index(ctx, index, ctx->num_shader_buffers); @@ -270,6 +276,12 @@ static LLVMValueRef si_nir_load_sampler_desc(struct ac_shader_abi *abi, unsigned } if (image) { + /* Fast path if the image is in user SGPRs. */ + if (!dynamic_index && + const_index < ctx->shader->selector->cs_num_images_in_user_sgprs && + (desc_type == AC_DESC_IMAGE || desc_type == AC_DESC_BUFFER)) + return ac_get_arg(&ctx->ac, ctx->cs_image[const_index]); + /* FMASKs are separate from images. */ if (desc_type == AC_DESC_FMASK) { index = diff --git a/src/gallium/drivers/radeonsi/si_shader_nir.c b/src/gallium/drivers/radeonsi/si_shader_nir.c index 74aa9f475f3..6b8bea1c11c 100644 --- a/src/gallium/drivers/radeonsi/si_shader_nir.c +++ b/src/gallium/drivers/radeonsi/si_shader_nir.c @@ -717,6 +717,7 @@ void si_nir_scan_shader(const struct nir_shader *nir, struct si_shader_info *inf info->const_buffers_declared = u_bit_consecutive(0, nir->info.num_ubos); info->images_declared = u_bit_consecutive(0, nir->info.num_images); info->msaa_images_declared = nir->info.msaa_images; + info->image_buffers = nir->info.image_buffers; info->samplers_declared = nir->info.textures_used; info->num_written_clipdistance = nir->info.clip_distance_array_size; -- 2.30.2