radeonsi: add a separate dirty mask for prefetches
authorMarek Olšák <marek.olsak@amd.com>
Fri, 4 Aug 2017 15:10:58 +0000 (17:10 +0200)
committerMarek Olšák <marek.olsak@amd.com>
Mon, 7 Aug 2017 19:12:24 +0000 (21:12 +0200)
so that we don't rely on si_pm4_state_enabled_and_changed, allowing us
to move prefetches after draw calls.

v2: clear the dirty mask after unbinding shaders

Tested-by: Dieter Nützel <Dieter@nuetzel-hh.de> (v1)
Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com> (v1)
src/gallium/drivers/radeonsi/si_cp_dma.c
src/gallium/drivers/radeonsi/si_descriptors.c
src/gallium/drivers/radeonsi/si_hw_context.c
src/gallium/drivers/radeonsi/si_pipe.h
src/gallium/drivers/radeonsi/si_state_draw.c
src/gallium/drivers/radeonsi/si_state_shaders.c

index 24fa6fd08706592084dbebe37c0c55ed77191bc9..21202b3d5b5a9d49fc73a896dbfdcf1b5230600e 100644 (file)
@@ -451,28 +451,28 @@ static void cik_prefetch_shader_async(struct si_context *sctx,
 void cik_emit_prefetch_L2(struct si_context *sctx)
 {
        /* Prefetch shaders and VBO descriptors to TC L2. */
-       if (si_pm4_state_enabled_and_changed(sctx, ls))
+       if (sctx->prefetch_L2_mask & SI_PREFETCH_LS)
                cik_prefetch_shader_async(sctx, sctx->queued.named.ls);
-       if (si_pm4_state_enabled_and_changed(sctx, hs))
+       if (sctx->prefetch_L2_mask & SI_PREFETCH_HS)
                cik_prefetch_shader_async(sctx, sctx->queued.named.hs);
-       if (si_pm4_state_enabled_and_changed(sctx, es))
+       if (sctx->prefetch_L2_mask & SI_PREFETCH_ES)
                cik_prefetch_shader_async(sctx, sctx->queued.named.es);
-       if (si_pm4_state_enabled_and_changed(sctx, gs))
+       if (sctx->prefetch_L2_mask & SI_PREFETCH_GS)
                cik_prefetch_shader_async(sctx, sctx->queued.named.gs);
-       if (si_pm4_state_enabled_and_changed(sctx, vs))
+       if (sctx->prefetch_L2_mask & SI_PREFETCH_VS)
                cik_prefetch_shader_async(sctx, sctx->queued.named.vs);
 
        /* Vertex buffer descriptors are uploaded uncached, so prefetch
         * them right after the VS binary. */
-       if (sctx->vertex_buffer_pointer_dirty) {
+       if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS) {
                cik_prefetch_TC_L2_async(sctx, &sctx->vertex_buffers.buffer->b.b,
                                         sctx->vertex_buffers.buffer_offset,
                                         sctx->vertex_elements->desc_list_byte_size);
        }
-       if (si_pm4_state_enabled_and_changed(sctx, ps))
+       if (sctx->prefetch_L2_mask & SI_PREFETCH_PS)
                cik_prefetch_shader_async(sctx, sctx->queued.named.ps);
 
-       sctx->prefetch_L2 = false;
+       sctx->prefetch_L2_mask = 0;
 }
 
 void si_init_cp_dma_functions(struct si_context *sctx)
index 917b0e1a529e8638384316d0c4919039bab38f75..43f1792edddf001d57be602e8d075213412f6e2d 100644 (file)
@@ -1176,10 +1176,9 @@ bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
         * uploaded to a fresh new buffer, so I don't think flushing the const
         * cache is needed. */
        si_mark_atom_dirty(sctx, &sctx->shader_userdata.atom);
-       if (sctx->b.chip_class >= CIK)
-               sctx->prefetch_L2 = true;
        sctx->vertex_buffers_dirty = false;
        sctx->vertex_buffer_pointer_dirty = true;
+       sctx->prefetch_L2_mask |= SI_PREFETCH_VBO_DESCRIPTORS;
        return true;
 }
 
index 756b15968d979abee669916b285f3c6a03ace3fc..3582cd711b54cf1bc35055ee6e9edb4ef16e4def 100644 (file)
@@ -216,8 +216,20 @@ void si_begin_new_cs(struct si_context *ctx)
        if (ctx->ce_ib)
                si_ce_restore_all_descriptors_at_ib_start(ctx);
 
-       if (ctx->b.chip_class >= CIK)
-               ctx->prefetch_L2 = true;
+       if (ctx->queued.named.ls)
+               ctx->prefetch_L2_mask |= SI_PREFETCH_LS;
+       if (ctx->queued.named.hs)
+               ctx->prefetch_L2_mask |= SI_PREFETCH_HS;
+       if (ctx->queued.named.es)
+               ctx->prefetch_L2_mask |= SI_PREFETCH_ES;
+       if (ctx->queued.named.gs)
+               ctx->prefetch_L2_mask |= SI_PREFETCH_GS;
+       if (ctx->queued.named.vs)
+               ctx->prefetch_L2_mask |= SI_PREFETCH_VS;
+       if (ctx->queued.named.ps)
+               ctx->prefetch_L2_mask |= SI_PREFETCH_PS;
+       if (ctx->vertex_buffers.buffer)
+               ctx->prefetch_L2_mask |= SI_PREFETCH_VBO_DESCRIPTORS;
 
        /* CLEAR_STATE disables all colorbuffers, so only enable bound ones. */
        ctx->framebuffer.dirty_cbufs =
index d21388649a1635f552811cb095def55ddbed109d..62b64e168799f427478fff0c17348f10d94b0ec7 100644 (file)
 #define SI_CONTEXT_VGT_FLUSH           (R600_CONTEXT_PRIVATE_FLAG << 12)
 #define SI_CONTEXT_VGT_STREAMOUT_SYNC  (R600_CONTEXT_PRIVATE_FLAG << 13)
 
+#define SI_PREFETCH_VBO_DESCRIPTORS    (1 << 0)
+#define SI_PREFETCH_LS                 (1 << 1)
+#define SI_PREFETCH_HS                 (1 << 2)
+#define SI_PREFETCH_ES                 (1 << 3)
+#define SI_PREFETCH_GS                 (1 << 4)
+#define SI_PREFETCH_VS                 (1 << 5)
+#define SI_PREFETCH_PS                 (1 << 6)
+
 #define SI_MAX_BORDER_COLORS   4096
 #define SIX_BITS               0x3F
 
@@ -279,11 +287,11 @@ struct si_context {
        struct u_suballocator           *ce_suballocator;
        unsigned                        ce_ram_saved_offset;
        uint16_t                        total_ce_ram_allocated;
+       uint16_t                        prefetch_L2_mask;
        bool                            ce_need_synchronization:1;
 
        bool                            gfx_flush_in_progress:1;
        bool                            compute_is_busy:1;
-       bool                            prefetch_L2:1;
 
        /* Atoms (direct states). */
        union si_state_atoms            atoms;
index 3f933fec2dcb876f531bf4e891d1dbffa6ac9ce1..c78450c2ce3022c5ace9193f3c070af901a0998e 100644 (file)
@@ -1346,7 +1346,7 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
        if (sctx->b.flags)
                si_emit_cache_flush(sctx);
 
-       if (sctx->prefetch_L2)
+       if (sctx->b.chip_class >= CIK && sctx->prefetch_L2_mask)
                cik_emit_prefetch_L2(sctx);
 
        /* Emit state atoms. */
index cb5a23e9c8025271b3c543936f4a079c9bdaf183..de5260ccd8f22c66de973cd3d7995c5fb12463d1 100644 (file)
@@ -3307,8 +3307,37 @@ bool si_update_shaders(struct si_context *sctx)
                        return false;
        }
 
-       if (sctx->b.chip_class >= CIK)
-               sctx->prefetch_L2 = true;
+       if (sctx->b.chip_class >= CIK) {
+               if (si_pm4_state_enabled_and_changed(sctx, ls))
+                       sctx->prefetch_L2_mask |= SI_PREFETCH_LS;
+               else if (!sctx->queued.named.ls)
+                       sctx->prefetch_L2_mask &= ~SI_PREFETCH_LS;
+
+               if (si_pm4_state_enabled_and_changed(sctx, hs))
+                       sctx->prefetch_L2_mask |= SI_PREFETCH_HS;
+               else if (!sctx->queued.named.hs)
+                       sctx->prefetch_L2_mask &= ~SI_PREFETCH_HS;
+
+               if (si_pm4_state_enabled_and_changed(sctx, es))
+                       sctx->prefetch_L2_mask |= SI_PREFETCH_ES;
+               else if (!sctx->queued.named.es)
+                       sctx->prefetch_L2_mask &= ~SI_PREFETCH_ES;
+
+               if (si_pm4_state_enabled_and_changed(sctx, gs))
+                       sctx->prefetch_L2_mask |= SI_PREFETCH_GS;
+               else if (!sctx->queued.named.gs)
+                       sctx->prefetch_L2_mask &= ~SI_PREFETCH_GS;
+
+               if (si_pm4_state_enabled_and_changed(sctx, vs))
+                       sctx->prefetch_L2_mask |= SI_PREFETCH_VS;
+               else if (!sctx->queued.named.vs)
+                       sctx->prefetch_L2_mask &= ~SI_PREFETCH_VS;
+
+               if (si_pm4_state_enabled_and_changed(sctx, ps))
+                       sctx->prefetch_L2_mask |= SI_PREFETCH_PS;
+               else if (!sctx->queued.named.ps)
+                       sctx->prefetch_L2_mask &= ~SI_PREFETCH_PS;
+       }
 
        sctx->do_update_shaders = false;
        return true;