radeonsi: do only 1 big CE dump at end of IBs and one reload in the preamble
author: Marek Olšák <marek.olsak@amd.com>
Mon, 15 May 2017 21:45:57 +0000 (23:45 +0200)
committer: Marek Olšák <marek.olsak@amd.com>
Thu, 18 May 2017 20:15:02 +0000 (22:15 +0200)
A later commit will upload only the descriptors used by shaders, so full
dumps will no longer be done. The only way to keep a complete mirror of
CE RAM in memory is then to do a separate dump after the last draw call.

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
src/gallium/drivers/radeonsi/si_descriptors.c
src/gallium/drivers/radeonsi/si_hw_context.c
src/gallium/drivers/radeonsi/si_pipe.c
src/gallium/drivers/radeonsi/si_pipe.h
src/gallium/drivers/radeonsi/si_state.h

index bc27a124d95bc1883b9693dd420c8eecedaa9631..b38b6b5fa9c9916c6c0ba19292269dbfd0a27062 100644 (file)
@@ -149,38 +149,36 @@ static bool si_ce_upload(struct si_context *sctx, unsigned ce_offset, unsigned s
        return true;
 }
 
-static void si_ce_reinitialize_descriptors(struct si_context *sctx,
-                                           struct si_descriptors *desc)
+void si_ce_save_all_descriptors_at_ib_end(struct si_context* sctx)
 {
-       if (desc->buffer) {
-               struct r600_resource *buffer = (struct r600_resource*)desc->buffer;
-               unsigned list_size = desc->num_elements * desc->element_dw_size * 4;
-               uint64_t va = buffer->gpu_address + desc->buffer_offset;
-               struct radeon_winsys_cs *ib = sctx->ce_preamble_ib;
-
-               if (!ib)
-                       ib = sctx->ce_ib;
+       bool success = si_ce_upload(sctx, 0, sctx->total_ce_ram_allocated,
+                                   &sctx->ce_ram_saved_offset,
+                                   &sctx->ce_ram_saved_buffer);
+       (void)success;
+       assert(success);
+}
 
-               list_size = align(list_size, 32);
+void si_ce_restore_all_descriptors_at_ib_start(struct si_context *sctx)
+{
+       if (!sctx->ce_ram_saved_buffer)
+               return;
 
-               radeon_emit(ib, PKT3(PKT3_LOAD_CONST_RAM, 3, 0));
-               radeon_emit(ib, va);
-               radeon_emit(ib, va >> 32);
-               radeon_emit(ib, list_size / 4);
-               radeon_emit(ib, desc->ce_offset);
+       struct radeon_winsys_cs *ib = sctx->ce_preamble_ib;
+       if (!ib)
+               ib = sctx->ce_ib;
 
-               radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer,
-                                   RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
-       }
-       desc->ce_ram_dirty = false;
-}
+       uint64_t va = sctx->ce_ram_saved_buffer->gpu_address +
+                     sctx->ce_ram_saved_offset;
 
-void si_ce_reinitialize_all_descriptors(struct si_context *sctx)
-{
-       int i;
+       radeon_emit(ib, PKT3(PKT3_LOAD_CONST_RAM, 3, 0));
+       radeon_emit(ib, va);
+       radeon_emit(ib, va >> 32);
+       radeon_emit(ib, sctx->total_ce_ram_allocated / 4);
+       radeon_emit(ib, 0);
 
-       for (i = 0; i < SI_NUM_DESCS; ++i)
-               si_ce_reinitialize_descriptors(sctx, &sctx->descriptors[i]);
+       radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
+                                 sctx->ce_ram_saved_buffer,
+                                 RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
 }
 
 void si_ce_enable_loads(struct radeon_winsys_cs *ib)
@@ -200,9 +198,6 @@ static bool si_upload_descriptors(struct si_context *sctx,
        if (sctx->ce_ib && desc->uses_ce) {
                uint32_t const* list = (uint32_t const*)desc->list;
 
-               if (desc->ce_ram_dirty)
-                       si_ce_reinitialize_descriptors(sctx, desc);
-
                while(desc->dirty_mask) {
                        int begin, count;
                        u_bit_scan_consecutive_range64(&desc->dirty_mask, &begin,
@@ -247,8 +242,6 @@ static bool si_upload_descriptors(struct si_context *sctx,
 static void
 si_descriptors_begin_new_cs(struct si_context *sctx, struct si_descriptors *desc)
 {
-       desc->ce_ram_dirty = true;
-
        if (!desc->buffer)
                return;
 
@@ -2045,6 +2038,7 @@ void si_init_all_descriptors(struct si_context *sctx)
                            4, SI_NUM_VERTEX_BUFFERS, NULL);
 
        sctx->descriptors_dirty = u_bit_consecutive(0, SI_NUM_DESCS);
+       sctx->total_ce_ram_allocated = ce_offset;
 
        if (sctx->b.chip_class >= GFX9)
                assert(ce_offset <= 4096);
index e15f6a9cc69bcb663219515cd5ff96f343cf9877..5e97d56481d9c94ffd78c63affa6e6b57b3cab6e 100644 (file)
@@ -123,6 +123,10 @@ void si_context_gfx_flush(void *context, unsigned flags,
 
        ctx->gfx_flush_in_progress = true;
 
+       /* This CE dump should be done in parallel with the last draw. */
+       if (ctx->ce_ib)
+               si_ce_save_all_descriptors_at_ib_end(ctx);
+
        r600_preflush_suspend_features(&ctx->b);
 
        ctx->b.flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
@@ -207,8 +211,8 @@ void si_begin_new_cs(struct si_context *ctx)
        else if (ctx->ce_ib)
                si_ce_enable_loads(ctx->ce_ib);
 
-       if (ctx->ce_preamble_ib)
-               si_ce_reinitialize_all_descriptors(ctx);
+       if (ctx->ce_ib)
+               si_ce_restore_all_descriptors_at_ib_start(ctx);
 
        if (ctx->b.chip_class >= CIK)
                si_mark_atom_dirty(ctx, &ctx->prefetch_L2);
index dd962e08e7ad9f777114f5532fe3374e39981883..eaa3348bcb7571030bceb041216a2e12a682f1ef 100644 (file)
@@ -55,6 +55,7 @@ static void si_destroy_context(struct pipe_context *context)
        if (sctx->ce_suballocator)
                u_suballocator_destroy(sctx->ce_suballocator);
 
+       r600_resource_reference(&sctx->ce_ram_saved_buffer, NULL);
        pipe_resource_reference(&sctx->esgs_ring, NULL);
        pipe_resource_reference(&sctx->gsvs_ring, NULL);
        pipe_resource_reference(&sctx->tf_ring, NULL);
index 449a802f76b90723ec52e2c1f2412e50bb459417..13ec0729b192e62825d71dafd95b314e8cb8adc7 100644 (file)
@@ -236,6 +236,9 @@ struct si_context {
 
        struct radeon_winsys_cs         *ce_ib;
        struct radeon_winsys_cs         *ce_preamble_ib;
+       struct r600_resource            *ce_ram_saved_buffer;
+       unsigned                        ce_ram_saved_offset;
+       unsigned                        total_ce_ram_allocated;
        bool                            ce_need_synchronization;
        struct u_suballocator           *ce_suballocator;
 
index c4ef90372fb9697be22b5a40ed243cb90137084e..9b506a8e3f730a1987dce8b69c9807590ad2ebee 100644 (file)
@@ -232,9 +232,6 @@ struct si_descriptors {
 
        /* Whether CE is used to upload this descriptor array. */
        bool uses_ce;
-       /* Whether the CE ram is dirty and needs to be reinitialized entirely
-        * before we can do partial updates. */
-       bool ce_ram_dirty;
 
        /* The shader userdata offset within a shader where the 64-bit pointer to the descriptor
         * array will be stored. */
@@ -282,7 +279,8 @@ struct si_buffer_resources {
        } while(0)
 
 /* si_descriptors.c */
-void si_ce_reinitialize_all_descriptors(struct si_context *sctx);
+void si_ce_save_all_descriptors_at_ib_end(struct si_context* sctx);
+void si_ce_restore_all_descriptors_at_ib_start(struct si_context *sctx);
 void si_ce_enable_loads(struct radeon_winsys_cs *ib);
 void si_set_mutable_tex_desc_fields(struct si_screen *sscreen,
                                    struct r600_texture *tex,