radeonsi: add TC L2 prefetch for shaders and VBO descriptors
author Marek Olšák <marek.olsak@amd.com>
Sun, 25 Dec 2016 17:11:59 +0000 (18:11 +0100)
committer Marek Olšák <marek.olsak@amd.com>
Fri, 6 Jan 2017 20:05:48 +0000 (21:05 +0100)
Reviewed-by: Edward O'Callaghan <funfunctor@folklore1984.net>
Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
src/gallium/drivers/radeonsi/si_cp_dma.c
src/gallium/drivers/radeonsi/si_pipe.h
src/gallium/drivers/radeonsi/si_state_draw.c

index 934fd8df2dbeaa354575142462afb465fc4825a5..f06b8ddb79bacf1d39512b0d45df93b3ad5b8666 100644 (file)
@@ -386,6 +386,18 @@ void si_copy_buffer(struct si_context *sctx,
                sctx->b.num_cp_dma_calls++;
 }
 
+/* TC L2 prefetch (CIK+): copy [offset, offset+size) of buf onto itself, all skips on. */
+void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf,
+                             uint64_t offset, unsigned size)
+{
+       const unsigned async_flags = SI_CPDMA_SKIP_CHECK_CS_SPACE |
+                                    SI_CPDMA_SKIP_SYNC_AFTER |
+                                    SI_CPDMA_SKIP_SYNC_BEFORE |
+                                    SI_CPDMA_SKIP_GFX_SYNC;
+       assert(sctx->b.chip_class >= CIK);
+       si_copy_buffer(sctx, buf, buf, offset, offset, size, async_flags);
+}
+
 void si_init_cp_dma_functions(struct si_context *sctx)
 {
        sctx->b.clear_buffer = si_clear_buffer;
index dc37c8d28f3359e21f108ab396fde0416c3ca13c..c0a4636cc636bb56a4297588b9256c8f47d5969d 100644 (file)
@@ -381,6 +381,8 @@ void si_copy_buffer(struct si_context *sctx,
                    struct pipe_resource *dst, struct pipe_resource *src,
                    uint64_t dst_offset, uint64_t src_offset, unsigned size,
                    unsigned user_flags);
+void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf,
+                             uint64_t offset, unsigned size);
 void si_init_cp_dma_functions(struct si_context *sctx);
 
 /* si_debug.c */
index b3f664eff2fb8b93101924494c867ec082f930eb..7b756025f81270221ca2573b7eb906332a7f35a6 100644 (file)
@@ -937,6 +937,17 @@ void si_ce_post_draw_synchronization(struct si_context *sctx)
        }
 }
 
+/* Prefetch a shader state's only BO (offset 0, size width0); a NULL state is a no-op. */
+static void cik_prefetch_shader_async(struct si_context *sctx,
+                                     struct si_pm4_state *state)
+{
+       if (!state)
+               return;
+       assert(state->nbo == 1); /* exactly one BO is expected here */
+       cik_prefetch_TC_L2_async(sctx, &state->bo[0]->b.b, 0,
+                                state->bo[0]->b.b.width0);
+}
+
 void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 {
        struct si_context *sctx = (struct si_context *)ctx;
@@ -1114,10 +1125,34 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
        if (!si_upload_vertex_buffer_descriptors(sctx))
                return;
 
-       /* Flushed caches prior to emitting states. */
+       /* Flushed caches prior to prefetching shaders. */
        if (sctx->b.flags)
                si_emit_cache_flush(sctx);
 
+       /* Prefetch shaders and VBO descriptors to TC L2. */
+       if (sctx->b.chip_class >= CIK) {
+               if (si_pm4_state_changed(sctx, ls))
+                       cik_prefetch_shader_async(sctx, sctx->queued.named.ls);
+               if (si_pm4_state_changed(sctx, hs))
+                       cik_prefetch_shader_async(sctx, sctx->queued.named.hs);
+               if (si_pm4_state_changed(sctx, es))
+                       cik_prefetch_shader_async(sctx, sctx->queued.named.es);
+               if (si_pm4_state_changed(sctx, gs))
+                       cik_prefetch_shader_async(sctx, sctx->queued.named.gs);
+               if (si_pm4_state_changed(sctx, vs))
+                       cik_prefetch_shader_async(sctx, sctx->queued.named.vs);
+
+               /* Vertex buffer descriptors are uploaded uncached, so prefetch
+                * them right after the VS binary. */
+               if (sctx->vertex_buffers.pointer_dirty) {
+                       cik_prefetch_TC_L2_async(sctx, &sctx->vertex_buffers.buffer->b.b,
+                                               sctx->vertex_buffers.buffer_offset,
+                                               sctx->vertex_elements->count * 16);
+               }
+               if (si_pm4_state_changed(sctx, ps))
+                       cik_prefetch_shader_async(sctx, sctx->queued.named.ps);
+       }
+
        /* Emit states. */
        mask = sctx->dirty_atoms;
        while (mask) {