From: Marek Olšák Date: Mon, 29 Dec 2014 13:53:11 +0000 (+0100) Subject: radeonsi: use TC L2 for CP DMA operations with shader resources on CIK X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=18a30c97780bef9c498db915ba5e7debe832f576;p=mesa.git radeonsi: use TC L2 for CP DMA operations with shader resources on CIK So that TC L2 doesn't need to be flushed. The only problem is with index buffers, which don't use TC. A simple solution is added that flushes TC L2 before a draw call (TC_L2_dirty). Reviewed-by: Michel Dänzer --- diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h index a9416b686ed..60b8faeb29b 100644 --- a/src/gallium/drivers/radeon/r600_pipe_common.h +++ b/src/gallium/drivers/radeon/r600_pipe_common.h @@ -138,6 +138,18 @@ struct r600_resource { * the unsynchronized map flag and expect the driver to figure it out. */ struct util_range valid_buffer_range; + + /* For buffers only. This indicates that a write operation has been + * performed by TC L2, but the cache hasn't been flushed. + * Any hw block which doesn't use or bypasses TC L2 should check this + * flag and flush the cache before using the buffer. + * + * For example, TC L2 must be flushed if a buffer which has been + * modified by a shader store instruction is about to be used as + * an index buffer. The reason is that VGT DMA index fetching doesn't + * use TC L2. + */ + bool TC_L2_dirty; }; struct r600_transfer { diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c index e2da476ab0f..454e12cc835 100644 --- a/src/gallium/drivers/radeonsi/si_descriptors.c +++ b/src/gallium/drivers/radeonsi/si_descriptors.c @@ -1073,7 +1073,7 @@ static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst, bool is_framebuffer) { struct si_context *sctx = (struct si_context*)ctx; - unsigned flush_flags; + unsigned flush_flags, tc_l2_flag; if (!size) return; @@ -1098,19 +1098,22 @@ static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst, uint64_t va = r600_resource(dst)->gpu_address + offset; /* Flush the caches where the resource is bound. */ - if (is_framebuffer) + if (is_framebuffer) { flush_flags = SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER; - else + tc_l2_flag = 0; + } else { flush_flags = SI_CONTEXT_INV_TC_L1 | - SI_CONTEXT_INV_TC_L2 | + (sctx->b.chip_class == SI ? SI_CONTEXT_INV_TC_L2 : 0) | SI_CONTEXT_INV_KCACHE; + tc_l2_flag = sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2; + } sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags; while (size) { unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT); - unsigned dma_flags = 0; + unsigned dma_flags = tc_l2_flag; si_need_cs_space(sctx, 7 + (sctx->b.flags ? sctx->cache_flush.num_dw : 0), FALSE); @@ -1141,6 +1144,9 @@ static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst, /* Flush the caches again in case the 3D engine has been prefetching * the resource. */ sctx->b.flags |= flush_flags; + + if (tc_l2_flag) + r600_resource(dst)->TC_L2_dirty = true; } void si_copy_buffer(struct si_context *sctx, @@ -1148,7 +1154,7 @@ void si_copy_buffer(struct si_context *sctx, uint64_t dst_offset, uint64_t src_offset, unsigned size, bool is_framebuffer) { - unsigned flush_flags; + unsigned flush_flags, tc_l2_flag; if (!size) return; @@ -1163,18 +1169,21 @@ void si_copy_buffer(struct si_context *sctx, src_offset += r600_resource(src)->gpu_address; /* Flush the caches where the resource is bound. */ - if (is_framebuffer) + if (is_framebuffer) { flush_flags = SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER; - else + tc_l2_flag = 0; + } else { flush_flags = SI_CONTEXT_INV_TC_L1 | - SI_CONTEXT_INV_TC_L2 | + (sctx->b.chip_class == SI ? SI_CONTEXT_INV_TC_L2 : 0) | SI_CONTEXT_INV_KCACHE; + tc_l2_flag = sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2; + } sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags; while (size) { - unsigned sync_flags = 0; + unsigned sync_flags = tc_l2_flag; unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT); si_need_cs_space(sctx, 7 + (sctx->b.flags ? sctx->cache_flush.num_dw : 0), FALSE); @@ -1206,6 +1215,9 @@ void si_copy_buffer(struct si_context *sctx, /* Flush the caches again in case the 3D engine has been prefetching * the resource. */ sctx->b.flags |= flush_flags; + + if (tc_l2_flag) + r600_resource(dst)->TC_L2_dirty = true; } /* INIT/DEINIT */ diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index ccc44d5bc6c..e6916c10f8f 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -551,6 +551,11 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) } } + if (info->indexed && r600_resource(ib.buffer)->TC_L2_dirty) { + sctx->b.flags |= SI_CONTEXT_INV_TC_L2; + r600_resource(ib.buffer)->TC_L2_dirty = false; + } + /* Check flush flags. */ if (sctx->b.flags) sctx->atoms.s.cache_flush->dirty = true;