From 18a30c97780bef9c498db915ba5e7debe832f576 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Mon, 29 Dec 2014 14:53:11 +0100 Subject: [PATCH] radeonsi: use TC L2 for CP DMA operations with shader resources on CIK MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit So that TC L2 doesn't need to be flushed. The only problem is with index buffers, which don't use TC. A simple solution is added that flushes TC L2 before a draw call (TC_L2_dirty). Reviewed-by: Michel Dänzer --- src/gallium/drivers/radeon/r600_pipe_common.h | 12 +++++++ src/gallium/drivers/radeonsi/si_descriptors.c | 32 +++++++++++++------ src/gallium/drivers/radeonsi/si_state_draw.c | 5 +++ 3 files changed, 39 insertions(+), 10 deletions(-) diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h index a9416b686ed..60b8faeb29b 100644 --- a/src/gallium/drivers/radeon/r600_pipe_common.h +++ b/src/gallium/drivers/radeon/r600_pipe_common.h @@ -138,6 +138,18 @@ struct r600_resource { * the unsynchronized map flag and expect the driver to figure it out. */ struct util_range valid_buffer_range; + + /* For buffers only. This indicates that a write operation has been + * performed by TC L2, but the cache hasn't been flushed. + * Any hw block which doesn't use or bypasses TC L2 should check this + * flag and flush the cache before using the buffer. + * + * For example, TC L2 must be flushed if a buffer which has been + * modified by a shader store instruction is about to be used as + * an index buffer. The reason is that VGT DMA index fetching doesn't + * use TC L2. + */ + bool TC_L2_dirty; }; struct r600_transfer { diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c index e2da476ab0f..454e12cc835 100644 --- a/src/gallium/drivers/radeonsi/si_descriptors.c +++ b/src/gallium/drivers/radeonsi/si_descriptors.c @@ -1073,7 +1073,7 @@ static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst, bool is_framebuffer) { struct si_context *sctx = (struct si_context*)ctx; - unsigned flush_flags; + unsigned flush_flags, tc_l2_flag; if (!size) return; @@ -1098,19 +1098,22 @@ static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst, uint64_t va = r600_resource(dst)->gpu_address + offset; /* Flush the caches where the resource is bound. */ - if (is_framebuffer) + if (is_framebuffer) { flush_flags = SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER; - else + tc_l2_flag = 0; + } else { flush_flags = SI_CONTEXT_INV_TC_L1 | - SI_CONTEXT_INV_TC_L2 | + (sctx->b.chip_class == SI ? SI_CONTEXT_INV_TC_L2 : 0) | SI_CONTEXT_INV_KCACHE; + tc_l2_flag = sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2; + } sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags; while (size) { unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT); - unsigned dma_flags = 0; + unsigned dma_flags = tc_l2_flag; si_need_cs_space(sctx, 7 + (sctx->b.flags ? sctx->cache_flush.num_dw : 0), FALSE); @@ -1141,6 +1144,9 @@ static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst, /* Flush the caches again in case the 3D engine has been prefetching * the resource. */ sctx->b.flags |= flush_flags; + + if (tc_l2_flag) + r600_resource(dst)->TC_L2_dirty = true; } void si_copy_buffer(struct si_context *sctx, @@ -1148,7 +1154,7 @@ void si_copy_buffer(struct si_context *sctx, uint64_t dst_offset, uint64_t src_offset, unsigned size, bool is_framebuffer) { - unsigned flush_flags; + unsigned flush_flags, tc_l2_flag; if (!size) return; @@ -1163,18 +1169,21 @@ void si_copy_buffer(struct si_context *sctx, src_offset += r600_resource(src)->gpu_address; /* Flush the caches where the resource is bound. */ - if (is_framebuffer) + if (is_framebuffer) { flush_flags = SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER; - else + tc_l2_flag = 0; + } else { flush_flags = SI_CONTEXT_INV_TC_L1 | - SI_CONTEXT_INV_TC_L2 | + (sctx->b.chip_class == SI ? SI_CONTEXT_INV_TC_L2 : 0) | SI_CONTEXT_INV_KCACHE; + tc_l2_flag = sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2; + } sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags; while (size) { - unsigned sync_flags = 0; + unsigned sync_flags = tc_l2_flag; unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT); si_need_cs_space(sctx, 7 + (sctx->b.flags ? sctx->cache_flush.num_dw : 0), FALSE); @@ -1206,6 +1215,9 @@ void si_copy_buffer(struct si_context *sctx, /* Flush the caches again in case the 3D engine has been prefetching * the resource. */ sctx->b.flags |= flush_flags; + + if (tc_l2_flag) + r600_resource(dst)->TC_L2_dirty = true; } /* INIT/DEINIT */ diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index ccc44d5bc6c..e6916c10f8f 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -551,6 +551,11 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) } } + if (info->indexed && r600_resource(ib.buffer)->TC_L2_dirty) { + sctx->b.flags |= SI_CONTEXT_INV_TC_L2; + r600_resource(ib.buffer)->TC_L2_dirty = false; + } + /* Check flush flags. */ if (sctx->b.flags) sctx->atoms.s.cache_flush->dirty = true; -- 2.30.2