From: Marek Olšák Date: Mon, 15 May 2017 21:03:01 +0000 (+0200) Subject: radeonsi: only upload (dump to L2) those descriptors that are used by shaders X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=a7f098fb769bdfdac692a04eab6bdd84e061e5cd;p=mesa.git radeonsi: only upload (dump to L2) those descriptors that are used by shaders This decreases the size of CE RAM dumps to L2, or the size of descriptor uploads without CE. Reviewed-by: Nicolai Hähnle --- diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c index 22ef1116afe..4c980668d35 100644 --- a/src/gallium/drivers/radeonsi/si_compute.c +++ b/src/gallium/drivers/radeonsi/si_compute.c @@ -208,7 +208,24 @@ static void *si_create_compute_state( static void si_bind_compute_state(struct pipe_context *ctx, void *state) { struct si_context *sctx = (struct si_context*)ctx; - sctx->cs_shader_state.program = (struct si_compute*)state; + struct si_compute *program = (struct si_compute*)state; + + sctx->cs_shader_state.program = program; + if (!program) + return; + + /* Wait because we need active slot usage masks. */ + if (program->ir_type == PIPE_SHADER_IR_TGSI) + util_queue_fence_wait(&program->ready); + + si_set_active_descriptors(sctx, + SI_DESCS_FIRST_COMPUTE + + SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS, + program->active_const_and_shader_buffers); + si_set_active_descriptors(sctx, + SI_DESCS_FIRST_COMPUTE + + SI_SHADER_DESCS_SAMPLERS_AND_IMAGES, + program->active_samplers_and_images); } static void si_set_global_binding( @@ -756,12 +773,9 @@ static void si_launch_grid( sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH; - if (program->ir_type == PIPE_SHADER_IR_TGSI) { - util_queue_fence_wait(&program->ready); - - if (program->shader.compilation_failed) - return; - } + if (program->ir_type == PIPE_SHADER_IR_TGSI && + program->shader.compilation_failed) + return; si_decompress_compute_textures(sctx); diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c index b38b6b5fa9c..b514961925f 100644 --- a/src/gallium/drivers/radeonsi/si_descriptors.c +++ b/src/gallium/drivers/radeonsi/si_descriptors.c @@ -125,12 +125,14 @@ static void si_release_descriptors(struct si_descriptors *desc) } static bool si_ce_upload(struct si_context *sctx, unsigned ce_offset, unsigned size, - unsigned *out_offset, struct r600_resource **out_buf) { + unsigned *out_offset, struct r600_resource **out_buf) +{ uint64_t va; u_suballocator_alloc(sctx->ce_suballocator, size, - sctx->screen->b.info.tcc_cache_line_size, - out_offset, (struct pipe_resource**)out_buf); + si_optimal_tcc_alignment(sctx, size), + out_offset, + (struct pipe_resource**)out_buf); if (!out_buf) return false; @@ -193,7 +195,16 @@ static bool si_upload_descriptors(struct si_context *sctx, struct si_descriptors *desc, struct r600_atom * atom) { - unsigned list_size = desc->num_elements * desc->element_dw_size * 4; + unsigned slot_size = desc->element_dw_size * 4; + unsigned first_slot_offset = desc->first_active_slot * slot_size; + unsigned upload_size = desc->num_active_slots * slot_size; + + /* Skip the upload if no shader is using the descriptors. dirty_mask + * will stay dirty and the descriptors will be uploaded when there is + * a shader using them. + */ + if (!upload_size) + return true; if (sctx->ce_ib && desc->uses_ce) { uint32_t const* list = (uint32_t const*)desc->list; @@ -212,25 +223,32 @@ static bool si_upload_descriptors(struct si_context *sctx, radeon_emit_array(sctx->ce_ib, list + begin, count); } - if (!si_ce_upload(sctx, desc->ce_offset, list_size, - &desc->buffer_offset, &desc->buffer)) + if (!si_ce_upload(sctx, desc->ce_offset + first_slot_offset, + upload_size, (unsigned*)&desc->buffer_offset, + &desc->buffer)) return false; } else { - void *ptr; + uint32_t *ptr; - u_upload_alloc(sctx->b.b.const_uploader, 0, list_size, - sctx->screen->b.info.tcc_cache_line_size, - &desc->buffer_offset, - (struct pipe_resource**)&desc->buffer, &ptr); + u_upload_alloc(sctx->b.b.const_uploader, 0, upload_size, + si_optimal_tcc_alignment(sctx, upload_size), + (unsigned*)&desc->buffer_offset, + (struct pipe_resource**)&desc->buffer, + (void**)&ptr); if (!desc->buffer) return false; /* skip the draw call */ - util_memcpy_cpu_to_le32(ptr, desc->list, list_size); - desc->gpu_list = ptr; + util_memcpy_cpu_to_le32(ptr, (char*)desc->list + first_slot_offset, + upload_size); + desc->gpu_list = ptr - first_slot_offset / 4; radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer, RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS); } + + /* The shader pointer should point to slot 0. */ + desc->buffer_offset -= first_slot_offset; + desc->dirty_mask = 0; if (atom) @@ -1030,7 +1048,7 @@ bool si_upload_vertex_buffer_descriptors(struct si_context *sctx) u_upload_alloc(sctx->b.b.const_uploader, 0, desc_list_byte_size, si_optimal_tcc_alignment(sctx, desc_list_byte_size), - &desc->buffer_offset, + (unsigned*)&desc->buffer_offset, (struct pipe_resource**)&desc->buffer, (void**)&ptr); if (!desc->buffer) return false; @@ -1891,7 +1909,8 @@ static void si_emit_shader_pointer(struct si_context *sctx, struct radeon_winsys_cs *cs = sctx->b.gfx.cs; uint64_t va; - assert(desc->buffer); + if (!desc->buffer) + return; /* the pointer is not used by current shaders */ va = desc->buffer->gpu_address + desc->buffer_offset; @@ -2034,6 +2053,8 @@ void si_init_all_descriptors(struct si_context *sctx) RADEON_USAGE_READWRITE, RADEON_USAGE_READ, RADEON_PRIO_SHADER_RINGS, RADEON_PRIO_CONST_BUFFER, &ce_offset); + sctx->descriptors[SI_DESCS_RW_BUFFERS].num_active_slots = SI_NUM_RW_BUFFERS; + si_init_descriptors(&sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFERS, 4, SI_NUM_VERTEX_BUFFERS, NULL); @@ -2156,3 +2177,41 @@ void si_all_descriptors_begin_new_cs(struct si_context *sctx) si_shader_userdata_begin_new_cs(sctx); } + +void si_set_active_descriptors(struct si_context *sctx, unsigned desc_idx, + uint64_t new_active_mask) +{ + struct si_descriptors *desc = &sctx->descriptors[desc_idx]; + + /* Ignore no-op updates and updates that disable all slots. */ + if (!new_active_mask || + new_active_mask == u_bit_consecutive64(desc->first_active_slot, + desc->num_active_slots)) + return; + + int first, count; + u_bit_scan_consecutive_range64(&new_active_mask, &first, &count); + assert(new_active_mask == 0); + + /* Upload/dump descriptors if slots are being enabled. */ + if (first < desc->first_active_slot || + first + count > desc->first_active_slot + desc->num_active_slots) + sctx->descriptors_dirty |= 1u << desc_idx; + + desc->first_active_slot = first; + desc->num_active_slots = count; +} + +void si_set_active_descriptors_for_shader(struct si_context *sctx, + struct si_shader_selector *sel) +{ + if (!sel) + return; + + si_set_active_descriptors(sctx, + si_const_and_shader_buffer_descriptors_idx(sel->type), + sel->active_const_and_shader_buffers); + si_set_active_descriptors(sctx, + si_sampler_and_image_descriptors_idx(sel->type), + sel->active_samplers_and_images); +} diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index f2003a5072e..dfabaa35566 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -42,6 +42,7 @@ struct si_screen; struct si_shader; +struct si_shader_selector; struct si_state_blend { struct si_pm4_state pm4; @@ -222,12 +223,20 @@ struct si_descriptors { /* The buffer where the descriptors have been uploaded. */ struct r600_resource *buffer; - unsigned buffer_offset; + int buffer_offset; /* can be negative if not using lower slots */ /* Offset in CE RAM */ unsigned ce_offset; - /* elements of the list that are changed and need to be uploaded */ + /* Slots that are used by currently-bound shaders. + * With CE: It determines which slots are dumped to L2. + * It doesn't skip uploads to CE RAM. + * Without CE: It determines which slots are uploaded. + */ + unsigned first_active_slot; + unsigned num_active_slots; + + /* Slots that have been changed and need to be uploaded. */ uint64_t dirty_mask; /* Whether CE is used to upload this descriptor array. */ @@ -315,6 +324,11 @@ void si_emit_graphics_shader_userdata(struct si_context *sctx, void si_emit_compute_shader_userdata(struct si_context *sctx); void si_set_rw_buffer(struct si_context *sctx, uint slot, const struct pipe_constant_buffer *input); +void si_set_active_descriptors(struct si_context *sctx, unsigned desc_idx, + uint64_t new_active_mask); +void si_set_active_descriptors_for_shader(struct si_context *sctx, + struct si_shader_selector *sel); + /* si_state.c */ struct si_shader_selector; diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 45d996b6b6c..8ac430975d7 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -2151,6 +2151,7 @@ static void si_bind_vs_shader(struct pipe_context *ctx, void *state) sctx->do_update_shaders = true; si_mark_atom_dirty(sctx, &sctx->clip_regs); r600_update_vs_writes_viewport_index(&sctx->b, si_get_vs_info(sctx)); + si_set_active_descriptors_for_shader(sctx, sel); } static void si_update_tess_uses_prim_id(struct si_context *sctx) @@ -2188,6 +2189,7 @@ static void si_bind_gs_shader(struct pipe_context *ctx, void *state) si_update_tess_uses_prim_id(sctx); } r600_update_vs_writes_viewport_index(&sctx->b, si_get_vs_info(sctx)); + si_set_active_descriptors_for_shader(sctx, sel); } static void si_bind_tcs_shader(struct pipe_context *ctx, void *state) @@ -2206,6 +2208,8 @@ static void si_bind_tcs_shader(struct pipe_context *ctx, void *state) if (enable_changed) sctx->last_tcs = NULL; /* invalidate derived tess state */ + + si_set_active_descriptors_for_shader(sctx, sel); } static void si_bind_tes_shader(struct pipe_context *ctx, void *state) @@ -2230,6 +2234,7 @@ static void si_bind_tes_shader(struct pipe_context *ctx, void *state) sctx->last_tes_sh_base = -1; /* invalidate derived tess state */ } r600_update_vs_writes_viewport_index(&sctx->b, si_get_vs_info(sctx)); + si_set_active_descriptors_for_shader(sctx, sel); } static void si_bind_ps_shader(struct pipe_context *ctx, void *state) @@ -2247,6 +2252,7 @@ static void si_bind_ps_shader(struct pipe_context *ctx, void *state) if (sel && sctx->ia_multi_vgt_param_key.u.uses_tess) si_update_tess_uses_prim_id(sctx); si_mark_atom_dirty(sctx, &sctx->cb_render_state); + si_set_active_descriptors_for_shader(sctx, sel); } static void si_delete_shader(struct si_context *sctx, struct si_shader *shader)