radeonsi: only upload (dump to L2) those descriptors that are used by shaders
author	Marek Olšák <marek.olsak@amd.com>
Mon, 15 May 2017 21:03:01 +0000 (23:03 +0200)
committer	Marek Olšák <marek.olsak@amd.com>
Thu, 18 May 2017 20:15:02 +0000 (22:15 +0200)
This decreases the size of CE RAM dumps to L2, or the size of descriptor
uploads without CE.

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
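A minimal standalone sketch of the upload-size computation this patch introduces in si_upload_descriptors; the struct, field names, and numbers below are simplified stand-ins for illustration, not the real si_descriptors layout. Only the bytes covering [first_active_slot, first_active_slot + num_active_slots) are dumped/uploaded, and the buffer offset is afterwards rebased so the shader pointer keeps addressing slot 0.

#include <stdio.h>

/* Simplified stand-in for the fields used by the patch. */
struct demo_descriptors {
	unsigned element_dw_size;   /* dwords per slot */
	unsigned num_elements;      /* total slots in the list */
	unsigned first_active_slot; /* derived from the shader's usage mask */
	unsigned num_active_slots;
};

static void demo_upload_size(const struct demo_descriptors *desc)
{
	unsigned slot_size = desc->element_dw_size * 4;
	unsigned full_size = desc->num_elements * slot_size;
	unsigned first_slot_offset = desc->first_active_slot * slot_size;
	unsigned upload_size = desc->num_active_slots * slot_size;

	printf("full list: %u bytes, active range: %u bytes at offset %u\n",
	       full_size, upload_size, first_slot_offset);

	/* After uploading only the active range, the patch rebases the
	 * buffer offset so the shader pointer still addresses slot 0:
	 *     desc->buffer_offset -= first_slot_offset;
	 */
}

int main(void)
{
	/* Hypothetical example: 16-dword slots, 40 slots total, but the
	 * bound shader only uses slots 8..11. */
	struct demo_descriptors d = { 16, 40, 8, 4 };
	demo_upload_size(&d);
	return 0;
}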
src/gallium/drivers/radeonsi/si_compute.c
src/gallium/drivers/radeonsi/si_descriptors.c
src/gallium/drivers/radeonsi/si_state.h
src/gallium/drivers/radeonsi/si_state_shaders.c

diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index 22ef1116afee8b6e5de5c11461917d15a2e1b274..4c980668d350531631d1a24819b58f85037a2e98 100644
@@ -208,7 +208,24 @@ static void *si_create_compute_state(
 static void si_bind_compute_state(struct pipe_context *ctx, void *state)
 {
        struct si_context *sctx = (struct si_context*)ctx;
-       sctx->cs_shader_state.program = (struct si_compute*)state;
+       struct si_compute *program = (struct si_compute*)state;
+
+       sctx->cs_shader_state.program = program;
+       if (!program)
+               return;
+
+       /* Wait because we need active slot usage masks. */
+       if (program->ir_type == PIPE_SHADER_IR_TGSI)
+               util_queue_fence_wait(&program->ready);
+
+       si_set_active_descriptors(sctx,
+                                 SI_DESCS_FIRST_COMPUTE +
+                                 SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS,
+                                 program->active_const_and_shader_buffers);
+       si_set_active_descriptors(sctx,
+                                 SI_DESCS_FIRST_COMPUTE +
+                                 SI_SHADER_DESCS_SAMPLERS_AND_IMAGES,
+                                 program->active_samplers_and_images);
 }
 
 static void si_set_global_binding(
@@ -756,12 +773,9 @@ static void si_launch_grid(
                sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
                                 SI_CONTEXT_CS_PARTIAL_FLUSH;
 
-       if (program->ir_type == PIPE_SHADER_IR_TGSI) {
-               util_queue_fence_wait(&program->ready);
-
-               if (program->shader.compilation_failed)
-                       return;
-       }
+       if (program->ir_type == PIPE_SHADER_IR_TGSI &&
+           program->shader.compilation_failed)
+               return;
 
        si_decompress_compute_textures(sctx);
 
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index b38b6b5fa9c9916c6c0ba19292269dbfd0a27062..b514961925f4aae5058a9b27925ce387d82ee172 100644
@@ -125,12 +125,14 @@ static void si_release_descriptors(struct si_descriptors *desc)
 }
 
 static bool si_ce_upload(struct si_context *sctx, unsigned ce_offset, unsigned size,
-                        unsigned *out_offset, struct r600_resource **out_buf) {
+                        unsigned *out_offset, struct r600_resource **out_buf)
+{
        uint64_t va;
 
        u_suballocator_alloc(sctx->ce_suballocator, size,
-                            sctx->screen->b.info.tcc_cache_line_size,
-                            out_offset, (struct pipe_resource**)out_buf);
+                            si_optimal_tcc_alignment(sctx, size),
+                            out_offset,
+                            (struct pipe_resource**)out_buf);
        if (!out_buf)
                        return false;
 
@@ -193,7 +195,16 @@ static bool si_upload_descriptors(struct si_context *sctx,
                                  struct si_descriptors *desc,
                                  struct r600_atom * atom)
 {
-       unsigned list_size = desc->num_elements * desc->element_dw_size * 4;
+       unsigned slot_size = desc->element_dw_size * 4;
+       unsigned first_slot_offset = desc->first_active_slot * slot_size;
+       unsigned upload_size = desc->num_active_slots * slot_size;
+
+       /* Skip the upload if no shader is using the descriptors. dirty_mask
+        * will stay dirty and the descriptors will be uploaded when there is
+        * a shader using them.
+        */
+       if (!upload_size)
+               return true;
 
        if (sctx->ce_ib && desc->uses_ce) {
                uint32_t const* list = (uint32_t const*)desc->list;
@@ -212,25 +223,32 @@ static bool si_upload_descriptors(struct si_context *sctx,
                        radeon_emit_array(sctx->ce_ib, list + begin, count);
                }
 
-               if (!si_ce_upload(sctx, desc->ce_offset, list_size,
-                                          &desc->buffer_offset, &desc->buffer))
+               if (!si_ce_upload(sctx, desc->ce_offset + first_slot_offset,
+                                 upload_size, (unsigned*)&desc->buffer_offset,
+                                 &desc->buffer))
                        return false;
        } else {
-               void *ptr;
+               uint32_t *ptr;
 
-               u_upload_alloc(sctx->b.b.const_uploader, 0, list_size,
-                              sctx->screen->b.info.tcc_cache_line_size,
-                              &desc->buffer_offset,
-                              (struct pipe_resource**)&desc->buffer, &ptr);
+               u_upload_alloc(sctx->b.b.const_uploader, 0, upload_size,
+                              si_optimal_tcc_alignment(sctx, upload_size),
+                              (unsigned*)&desc->buffer_offset,
+                              (struct pipe_resource**)&desc->buffer,
+                              (void**)&ptr);
                if (!desc->buffer)
                        return false; /* skip the draw call */
 
-               util_memcpy_cpu_to_le32(ptr, desc->list, list_size);
-               desc->gpu_list = ptr;
+               util_memcpy_cpu_to_le32(ptr, (char*)desc->list + first_slot_offset,
+                                       upload_size);
+               desc->gpu_list = ptr - first_slot_offset / 4;
 
                radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer,
                                    RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
        }
+
+       /* The shader pointer should point to slot 0. */
+       desc->buffer_offset -= first_slot_offset;
+
        desc->dirty_mask = 0;
 
        if (atom)
@@ -1030,7 +1048,7 @@ bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
        u_upload_alloc(sctx->b.b.const_uploader, 0,
                       desc_list_byte_size,
                       si_optimal_tcc_alignment(sctx, desc_list_byte_size),
-                      &desc->buffer_offset,
+                      (unsigned*)&desc->buffer_offset,
                       (struct pipe_resource**)&desc->buffer, (void**)&ptr);
        if (!desc->buffer)
                return false;
@@ -1891,7 +1909,8 @@ static void si_emit_shader_pointer(struct si_context *sctx,
        struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
        uint64_t va;
 
-       assert(desc->buffer);
+       if (!desc->buffer)
+               return; /* the pointer is not used by current shaders */
 
        va = desc->buffer->gpu_address +
             desc->buffer_offset;
@@ -2034,6 +2053,8 @@ void si_init_all_descriptors(struct si_context *sctx)
                                 RADEON_USAGE_READWRITE, RADEON_USAGE_READ,
                                 RADEON_PRIO_SHADER_RINGS, RADEON_PRIO_CONST_BUFFER,
                                 &ce_offset);
+       sctx->descriptors[SI_DESCS_RW_BUFFERS].num_active_slots = SI_NUM_RW_BUFFERS;
+
        si_init_descriptors(&sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFERS,
                            4, SI_NUM_VERTEX_BUFFERS, NULL);
 
@@ -2156,3 +2177,41 @@ void si_all_descriptors_begin_new_cs(struct si_context *sctx)
 
        si_shader_userdata_begin_new_cs(sctx);
 }
+
+void si_set_active_descriptors(struct si_context *sctx, unsigned desc_idx,
+                              uint64_t new_active_mask)
+{
+       struct si_descriptors *desc = &sctx->descriptors[desc_idx];
+
+       /* Ignore no-op updates and updates that disable all slots. */
+       if (!new_active_mask ||
+           new_active_mask == u_bit_consecutive64(desc->first_active_slot,
+                                                  desc->num_active_slots))
+               return;
+
+       int first, count;
+       u_bit_scan_consecutive_range64(&new_active_mask, &first, &count);
+       assert(new_active_mask == 0);
+
+       /* Upload/dump descriptors if slots are being enabled. */
+       if (first < desc->first_active_slot ||
+           first + count > desc->first_active_slot + desc->num_active_slots)
+               sctx->descriptors_dirty |= 1u << desc_idx;
+
+       desc->first_active_slot = first;
+       desc->num_active_slots = count;
+}
+
+void si_set_active_descriptors_for_shader(struct si_context *sctx,
+                                         struct si_shader_selector *sel)
+{
+       if (!sel)
+               return;
+
+       si_set_active_descriptors(sctx,
+               si_const_and_shader_buffer_descriptors_idx(sel->type),
+               sel->active_const_and_shader_buffers);
+       si_set_active_descriptors(sctx,
+               si_sampler_and_image_descriptors_idx(sel->type),
+               sel->active_samplers_and_images);
+}
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index f2003a5072e7315ee64f1cf7cfdae76b6429ed8f..dfabaa35566b99ca785ed3493901af203d3ac3c8 100644
@@ -42,6 +42,7 @@
 
 struct si_screen;
 struct si_shader;
+struct si_shader_selector;
 
 struct si_state_blend {
        struct si_pm4_state     pm4;
@@ -222,12 +223,20 @@ struct si_descriptors {
 
        /* The buffer where the descriptors have been uploaded. */
        struct r600_resource *buffer;
-       unsigned buffer_offset;
+       int buffer_offset; /* can be negative if not using lower slots */
 
        /* Offset in CE RAM */
        unsigned ce_offset;
 
-       /* elements of the list that are changed and need to be uploaded */
+       /* Slots that are used by currently-bound shaders.
+        * With CE: It determines which slots are dumped to L2.
+        *          It doesn't skip uploads to CE RAM.
+        * Without CE: It determines which slots are uploaded.
+        */
+       unsigned first_active_slot;
+       unsigned num_active_slots;
+
+       /* Slots that have been changed and need to be uploaded. */
        uint64_t dirty_mask;
 
        /* Whether CE is used to upload this descriptor array. */
@@ -315,6 +324,11 @@ void si_emit_graphics_shader_userdata(struct si_context *sctx,
 void si_emit_compute_shader_userdata(struct si_context *sctx);
 void si_set_rw_buffer(struct si_context *sctx,
                      uint slot, const struct pipe_constant_buffer *input);
+void si_set_active_descriptors(struct si_context *sctx, unsigned desc_idx,
+                              uint64_t new_active_mask);
+void si_set_active_descriptors_for_shader(struct si_context *sctx,
+                                         struct si_shader_selector *sel);
+
 /* si_state.c */
 struct si_shader_selector;
 
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 45d996b6b6ca1bcbbeecd08dc7f1fa6608c965dc..8ac430975d7f736ccf4e2ac166fd255b2f9e211c 100644
@@ -2151,6 +2151,7 @@ static void si_bind_vs_shader(struct pipe_context *ctx, void *state)
        sctx->do_update_shaders = true;
        si_mark_atom_dirty(sctx, &sctx->clip_regs);
        r600_update_vs_writes_viewport_index(&sctx->b, si_get_vs_info(sctx));
+       si_set_active_descriptors_for_shader(sctx, sel);
 }
 
 static void si_update_tess_uses_prim_id(struct si_context *sctx)
@@ -2188,6 +2189,7 @@ static void si_bind_gs_shader(struct pipe_context *ctx, void *state)
                        si_update_tess_uses_prim_id(sctx);
        }
        r600_update_vs_writes_viewport_index(&sctx->b, si_get_vs_info(sctx));
+       si_set_active_descriptors_for_shader(sctx, sel);
 }
 
 static void si_bind_tcs_shader(struct pipe_context *ctx, void *state)
@@ -2206,6 +2208,8 @@ static void si_bind_tcs_shader(struct pipe_context *ctx, void *state)
 
        if (enable_changed)
                sctx->last_tcs = NULL; /* invalidate derived tess state */
+
+       si_set_active_descriptors_for_shader(sctx, sel);
 }
 
 static void si_bind_tes_shader(struct pipe_context *ctx, void *state)
@@ -2230,6 +2234,7 @@ static void si_bind_tes_shader(struct pipe_context *ctx, void *state)
                sctx->last_tes_sh_base = -1; /* invalidate derived tess state */
        }
        r600_update_vs_writes_viewport_index(&sctx->b, si_get_vs_info(sctx));
+       si_set_active_descriptors_for_shader(sctx, sel);
 }
 
 static void si_bind_ps_shader(struct pipe_context *ctx, void *state)
@@ -2247,6 +2252,7 @@ static void si_bind_ps_shader(struct pipe_context *ctx, void *state)
        if (sel && sctx->ia_multi_vgt_param_key.u.uses_tess)
                si_update_tess_uses_prim_id(sctx);
        si_mark_atom_dirty(sctx, &sctx->cb_render_state);
+       si_set_active_descriptors_for_shader(sctx, sel);
 }
 
 static void si_delete_shader(struct si_context *sctx, struct si_shader *shader)