From 9e1aa81dfeced2381aa0df73758dd76f2722d857 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Tue, 27 Dec 2016 16:12:05 +0100 Subject: [PATCH] gallium/radeon: prevent SDMA stalls by detecting RAW hazards in need_dma_space MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Call r600_dma_emit_wait_idle only when there is a possibility of a read-after-write hazard. Buffers not yet used by the SDMA IB don't have to wait. Reviewed-by: Nicolai Hähnle --- .../drivers/r600/evergreen_hw_context.c | 1 - src/gallium/drivers/r600/evergreen_state.c | 1 - src/gallium/drivers/r600/r600_hw_context.c | 1 - src/gallium/drivers/r600/r600_state.c | 1 - src/gallium/drivers/radeon/r600_pipe_common.c | 48 +++++++++++-------- src/gallium/drivers/radeon/r600_pipe_common.h | 1 - src/gallium/drivers/radeonsi/cik_sdma.c | 8 ---- src/gallium/drivers/radeonsi/si_dma.c | 2 - 8 files changed, 27 insertions(+), 36 deletions(-) diff --git a/src/gallium/drivers/r600/evergreen_hw_context.c b/src/gallium/drivers/r600/evergreen_hw_context.c index 06f03482341..5352dc05779 100644 --- a/src/gallium/drivers/r600/evergreen_hw_context.c +++ b/src/gallium/drivers/r600/evergreen_hw_context.c @@ -77,7 +77,6 @@ void evergreen_dma_copy_buffer(struct r600_context *rctx, src_offset += csize << shift; size -= csize; } - r600_dma_emit_wait_idle(&rctx->b); } /* The max number of bytes to copy per packet. */ diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c index 015ff026562..c5dd9f71dd9 100644 --- a/src/gallium/drivers/r600/evergreen_state.c +++ b/src/gallium/drivers/r600/evergreen_state.c @@ -3453,7 +3453,6 @@ static void evergreen_dma_copy_tile(struct r600_context *rctx, addr += cheight * pitch; y += cheight; } - r600_dma_emit_wait_idle(&rctx->b); } static void evergreen_dma_copy(struct pipe_context *ctx, diff --git a/src/gallium/drivers/r600/r600_hw_context.c b/src/gallium/drivers/r600/r600_hw_context.c index bc6217ad223..4663d99c0b6 100644 --- a/src/gallium/drivers/r600/r600_hw_context.c +++ b/src/gallium/drivers/r600/r600_hw_context.c @@ -555,5 +555,4 @@ void r600_dma_copy_buffer(struct r600_context *rctx, src_offset += csize << 2; size -= csize; } - r600_dma_emit_wait_idle(&rctx->b); } diff --git a/src/gallium/drivers/r600/r600_state.c b/src/gallium/drivers/r600/r600_state.c index ba97490ac91..006bb629d60 100644 --- a/src/gallium/drivers/r600/r600_state.c +++ b/src/gallium/drivers/r600/r600_state.c @@ -2904,7 +2904,6 @@ static boolean r600_dma_copy_tile(struct r600_context *rctx, addr += cheight * pitch; y += cheight; } - r600_dma_emit_wait_idle(&rctx->b); return TRUE; } diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c index 73fc40db2ea..35d3e7b16bd 100644 --- a/src/gallium/drivers/radeon/r600_pipe_common.c +++ b/src/gallium/drivers/radeon/r600_pipe_common.c @@ -220,6 +220,21 @@ void r600_draw_rectangle(struct blitter_context *blitter, pipe_resource_reference(&buf, NULL); } +static void r600_dma_emit_wait_idle(struct r600_common_context *rctx) +{ + struct radeon_winsys_cs *cs = rctx->dma.cs; + + /* NOP waits for idle on Evergreen and later. */ + if (rctx->chip_class >= CIK) + radeon_emit(cs, 0x00000000); /* NOP */ + else if (rctx->chip_class >= EVERGREEN) + radeon_emit(cs, 0xf0000000); /* NOP */ + else { + /* TODO: R600-R700 should use the FENCE packet. + * CS checker support is required. */ + } +} + void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw, struct r600_resource *dst, struct r600_resource *src) { @@ -257,6 +272,7 @@ void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw, * It improves texture upload performance by keeping the DMA * engine busy while uploads are being submitted. */ + num_dw++; /* for emit_wait_idle below */ if (!ctx->ws->cs_check_space(ctx->dma.cs, num_dw) || ctx->dma.cs->used_vram + ctx->dma.cs->used_gart > 64 * 1024 * 1024 || !radeon_cs_memory_below_limit(ctx->screen, ctx->dma.cs, vram, gtt)) { @@ -264,6 +280,17 @@ void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw, assert((num_dw + ctx->dma.cs->current.cdw) <= ctx->dma.cs->current.max_dw); } + /* Wait for idle if either buffer has been used in the IB before to + * prevent read-after-write hazards. + */ + if ((dst && + ctx->ws->cs_is_buffer_referenced(ctx->dma.cs, dst->buf, + RADEON_USAGE_READWRITE)) || + (src && + ctx->ws->cs_is_buffer_referenced(ctx->dma.cs, src->buf, + RADEON_USAGE_WRITE))) + r600_dma_emit_wait_idle(ctx); + /* If GPUVM is not supported, the CS checker needs 2 entries * in the buffer list per packet, which has to be done manually. */ @@ -282,27 +309,6 @@ void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw, ctx->num_dma_calls++; } -/* This is required to prevent read-after-write hazards. */ -void r600_dma_emit_wait_idle(struct r600_common_context *rctx) -{ - struct radeon_winsys_cs *cs = rctx->dma.cs; - - r600_need_dma_space(rctx, 1, NULL, NULL); - - if (!radeon_emitted(cs, 0)) /* empty queue */ - return; - - /* NOP waits for idle on Evergreen and later. */ - if (rctx->chip_class >= CIK) - radeon_emit(cs, 0x00000000); /* NOP */ - else if (rctx->chip_class >= EVERGREEN) - radeon_emit(cs, 0xf0000000); /* NOP */ - else { - /* TODO: R600-R700 should use the FENCE packet. - * CS checker support is required. */ - } -} - static void r600_memory_barrier(struct pipe_context *ctx, unsigned flags) { } diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h index 2bb622ab365..2e0655602c2 100644 --- a/src/gallium/drivers/radeon/r600_pipe_common.h +++ b/src/gallium/drivers/radeon/r600_pipe_common.h @@ -729,7 +729,6 @@ struct pipe_resource *r600_resource_create_common(struct pipe_screen *screen, const char *r600_get_llvm_processor_name(enum radeon_family family); void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw, struct r600_resource *dst, struct r600_resource *src); -void r600_dma_emit_wait_idle(struct r600_common_context *rctx); void radeon_save_cs(struct radeon_winsys *ws, struct radeon_winsys_cs *cs, struct radeon_saved_cs *saved); void radeon_clear_saved_cs(struct radeon_saved_cs *saved); diff --git a/src/gallium/drivers/radeonsi/cik_sdma.c b/src/gallium/drivers/radeonsi/cik_sdma.c index 648b1ca7687..bee35cd8212 100644 --- a/src/gallium/drivers/radeonsi/cik_sdma.c +++ b/src/gallium/drivers/radeonsi/cik_sdma.c @@ -67,7 +67,6 @@ static void cik_sdma_copy_buffer(struct si_context *ctx, src_offset += csize; size -= csize; } - r600_dma_emit_wait_idle(&ctx->b); } static void cik_sdma_clear_buffer(struct pipe_context *ctx, @@ -108,7 +107,6 @@ static void cik_sdma_clear_buffer(struct pipe_context *ctx, offset += csize; size -= csize; } - r600_dma_emit_wait_idle(&sctx->b); } static unsigned minify_as_blocks(unsigned width, unsigned level, unsigned blk_w) @@ -251,8 +249,6 @@ static bool cik_sdma_copy_texture(struct si_context *sctx, radeon_emit(cs, (copy_width - 1) | ((copy_height - 1) << 16)); radeon_emit(cs, (copy_depth - 1)); } - - r600_dma_emit_wait_idle(&sctx->b); return true; } @@ -417,8 +413,6 @@ static bool cik_sdma_copy_texture(struct si_context *sctx, radeon_emit(cs, (copy_width_aligned - 1) | ((copy_height - 1) << 16)); radeon_emit(cs, (copy_depth - 1)); } - - r600_dma_emit_wait_idle(&sctx->b); return true; } } @@ -515,8 +509,6 @@ static bool cik_sdma_copy_texture(struct si_context *sctx, ((copy_height_aligned - 8) << 16)); radeon_emit(cs, (copy_depth - 1)); } - - r600_dma_emit_wait_idle(&sctx->b); return true; } } diff --git a/src/gallium/drivers/radeonsi/si_dma.c b/src/gallium/drivers/radeonsi/si_dma.c index 8d186c3fb43..1009bb23993 100644 --- a/src/gallium/drivers/radeonsi/si_dma.c +++ b/src/gallium/drivers/radeonsi/si_dma.c @@ -76,7 +76,6 @@ static void si_dma_copy_buffer(struct si_context *ctx, src_offset += csize << shift; size -= csize; } - r600_dma_emit_wait_idle(&ctx->b); } static void si_dma_copy_tile(struct si_context *ctx, @@ -177,7 +176,6 @@ static void si_dma_copy_tile(struct si_context *ctx, addr += cheight * pitch; tiled_y += cheight; } - r600_dma_emit_wait_idle(&ctx->b); } static void si_dma_copy(struct pipe_context *ctx, -- 2.30.2