From 9a1363427ea3300d2ff9ef5ec0cc2ffbee22cffe Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Mon, 2 Apr 2018 21:08:05 -0400 Subject: [PATCH] radeonsi: always prefetch later shaders after the draw packet MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit so that the draw is started as soon as possible. v2: only prefetch the API VS and VBO descriptors Reviewed-by: Samuel Pitoiset Tested-by: Dieter Nützel --- src/gallium/drivers/radeonsi/si_cp_dma.c | 89 +++++++++++++++----- src/gallium/drivers/radeonsi/si_pipe.h | 2 +- src/gallium/drivers/radeonsi/si_state_draw.c | 10 ++- 3 files changed, 75 insertions(+), 26 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c index e2d261d7e09..358b33c4eb1 100644 --- a/src/gallium/drivers/radeonsi/si_cp_dma.c +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c @@ -520,67 +520,110 @@ static void cik_prefetch_VBO_descriptors(struct si_context *sctx) sctx->vertex_elements->desc_list_byte_size); } -void cik_emit_prefetch_L2(struct si_context *sctx) +/** + * Prefetch shaders and VBO descriptors. + * + * \param vertex_stage_only Whether only the API VS and VBO descriptors + * should be prefetched. + */ +void cik_emit_prefetch_L2(struct si_context *sctx, bool vertex_stage_only) { + unsigned mask = sctx->prefetch_L2_mask; + assert(mask); + /* Prefetch shaders and VBO descriptors to TC L2. */ if (sctx->chip_class >= GFX9) { /* Choose the right spot for the VBO prefetch.
*/ if (sctx->tes_shader.cso) { - if (sctx->prefetch_L2_mask & SI_PREFETCH_HS) + if (mask & SI_PREFETCH_HS) cik_prefetch_shader_async(sctx, sctx->queued.named.hs); - if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS) + if (mask & SI_PREFETCH_VBO_DESCRIPTORS) cik_prefetch_VBO_descriptors(sctx); - if (sctx->prefetch_L2_mask & SI_PREFETCH_GS) + if (vertex_stage_only) { + sctx->prefetch_L2_mask &= ~(SI_PREFETCH_HS | + SI_PREFETCH_VBO_DESCRIPTORS); + return; + } + + if (mask & SI_PREFETCH_GS) cik_prefetch_shader_async(sctx, sctx->queued.named.gs); - if (sctx->prefetch_L2_mask & SI_PREFETCH_VS) + if (mask & SI_PREFETCH_VS) cik_prefetch_shader_async(sctx, sctx->queued.named.vs); } else if (sctx->gs_shader.cso) { - if (sctx->prefetch_L2_mask & SI_PREFETCH_GS) + if (mask & SI_PREFETCH_GS) cik_prefetch_shader_async(sctx, sctx->queued.named.gs); - if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS) + if (mask & SI_PREFETCH_VBO_DESCRIPTORS) cik_prefetch_VBO_descriptors(sctx); - if (sctx->prefetch_L2_mask & SI_PREFETCH_VS) + if (vertex_stage_only) { + sctx->prefetch_L2_mask &= ~(SI_PREFETCH_GS | + SI_PREFETCH_VBO_DESCRIPTORS); + return; + } + + if (mask & SI_PREFETCH_VS) cik_prefetch_shader_async(sctx, sctx->queued.named.vs); } else { - if (sctx->prefetch_L2_mask & SI_PREFETCH_VS) + if (mask & SI_PREFETCH_VS) cik_prefetch_shader_async(sctx, sctx->queued.named.vs); - if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS) + if (mask & SI_PREFETCH_VBO_DESCRIPTORS) cik_prefetch_VBO_descriptors(sctx); + if (vertex_stage_only) { + sctx->prefetch_L2_mask &= ~(SI_PREFETCH_VS | + SI_PREFETCH_VBO_DESCRIPTORS); + return; + } } } else { /* SI-CI-VI */ /* Choose the right spot for the VBO prefetch. 
*/ if (sctx->tes_shader.cso) { - if (sctx->prefetch_L2_mask & SI_PREFETCH_LS) + if (mask & SI_PREFETCH_LS) cik_prefetch_shader_async(sctx, sctx->queued.named.ls); - if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS) + if (mask & SI_PREFETCH_VBO_DESCRIPTORS) cik_prefetch_VBO_descriptors(sctx); - if (sctx->prefetch_L2_mask & SI_PREFETCH_HS) + if (vertex_stage_only) { + sctx->prefetch_L2_mask &= ~(SI_PREFETCH_LS | + SI_PREFETCH_VBO_DESCRIPTORS); + return; + } + + if (mask & SI_PREFETCH_HS) cik_prefetch_shader_async(sctx, sctx->queued.named.hs); - if (sctx->prefetch_L2_mask & SI_PREFETCH_ES) + if (mask & SI_PREFETCH_ES) cik_prefetch_shader_async(sctx, sctx->queued.named.es); - if (sctx->prefetch_L2_mask & SI_PREFETCH_GS) + if (mask & SI_PREFETCH_GS) cik_prefetch_shader_async(sctx, sctx->queued.named.gs); - if (sctx->prefetch_L2_mask & SI_PREFETCH_VS) + if (mask & SI_PREFETCH_VS) cik_prefetch_shader_async(sctx, sctx->queued.named.vs); } else if (sctx->gs_shader.cso) { - if (sctx->prefetch_L2_mask & SI_PREFETCH_ES) + if (mask & SI_PREFETCH_ES) cik_prefetch_shader_async(sctx, sctx->queued.named.es); - if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS) + if (mask & SI_PREFETCH_VBO_DESCRIPTORS) cik_prefetch_VBO_descriptors(sctx); - if (sctx->prefetch_L2_mask & SI_PREFETCH_GS) + if (vertex_stage_only) { + sctx->prefetch_L2_mask &= ~(SI_PREFETCH_ES | + SI_PREFETCH_VBO_DESCRIPTORS); + return; + } + + if (mask & SI_PREFETCH_GS) cik_prefetch_shader_async(sctx, sctx->queued.named.gs); - if (sctx->prefetch_L2_mask & SI_PREFETCH_VS) + if (mask & SI_PREFETCH_VS) cik_prefetch_shader_async(sctx, sctx->queued.named.vs); } else { - if (sctx->prefetch_L2_mask & SI_PREFETCH_VS) + if (mask & SI_PREFETCH_VS) cik_prefetch_shader_async(sctx, sctx->queued.named.vs); - if (sctx->prefetch_L2_mask & SI_PREFETCH_VBO_DESCRIPTORS) + if (mask & SI_PREFETCH_VBO_DESCRIPTORS) cik_prefetch_VBO_descriptors(sctx); + if (vertex_stage_only) { + sctx->prefetch_L2_mask &= ~(SI_PREFETCH_VS | + 
SI_PREFETCH_VBO_DESCRIPTORS); + return; + } } } - if (sctx->prefetch_L2_mask & SI_PREFETCH_PS) + if (mask & SI_PREFETCH_PS) cik_prefetch_shader_async(sctx, sctx->queued.named.ps); sctx->prefetch_L2_mask = 0; diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index e3e5d5ac91b..c7ad5366a68 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -911,7 +911,7 @@ void si_copy_buffer(struct si_context *sctx, unsigned user_flags); void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf, uint64_t offset, unsigned size); -void cik_emit_prefetch_L2(struct si_context *sctx); +void cik_emit_prefetch_L2(struct si_context *sctx, bool vertex_stage_only); void si_init_cp_dma_functions(struct si_context *sctx); /* si_debug.c */ diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index f8d52cbc98f..96dfd93645d 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -1456,7 +1456,7 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) * in parallel, but starting the draw first is more important. */ if (sctx->chip_class >= CIK && sctx->prefetch_L2_mask) - cik_emit_prefetch_L2(sctx); + cik_emit_prefetch_L2(sctx, false); } else { /* If we don't wait for idle, start prefetches first, then set * states, and draw at the end. @@ -1464,14 +1464,20 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) if (sctx->flags) si_emit_cache_flush(sctx); + /* Only prefetch the API VS and VBO descriptors. 
*/ if (sctx->chip_class >= CIK && sctx->prefetch_L2_mask) - cik_emit_prefetch_L2(sctx); + cik_emit_prefetch_L2(sctx, true); if (!si_upload_graphics_shader_descriptors(sctx)) return; si_emit_all_states(sctx, info, 0); si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset); + + /* Prefetch the remaining shaders after the draw has been + * started. */ + if (sctx->chip_class >= CIK && sctx->prefetch_L2_mask) + cik_emit_prefetch_L2(sctx, false); } if (unlikely(sctx->current_saved_cs)) { -- 2.30.2