From 1b3199d14d5ca96cb794dca4213bf5c17d1e264b Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Fri, 16 Jun 2017 14:25:34 +0200 Subject: [PATCH] radeonsi: implement mechanism for IBs without partial flushes at the end (v6) MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit (This patch doesn't enable the behavior. It will be enabled in a later commit.) Draw calls from multiple IBs can be executed in parallel. v2: do emit partial flushes on SI v3: invalidate all shader caches at the beginning of IBs v4: don't call si_emit_cache_flush in si_flush_gfx_cs if not needed, only do this for flushes invoked internally v5: empty IBs should wait for idle if the flush requires it v6: split the commit If we artificially limit the number of draw calls per IB to 5, we'll get a lot more IBs, leading to a lot more partial flushes. Let's see how the removal of partial flushes changes GPU utilization in that scenario: With partial flushes (time busy): CP: 99% SPI: 86% CB: 73% Without partial flushes (time busy): CP: 99% SPI: 93% CB: 81% Tested-by: Benedikt Schemmer Reviewed-by: Nicolai Hähnle --- src/gallium/drivers/radeon/radeon_winsys.h | 7 +++ src/gallium/drivers/radeonsi/si_gfx_cs.c | 55 +++++++++++++++------- src/gallium/drivers/radeonsi/si_pipe.h | 1 + 3 files changed, 47 insertions(+), 16 deletions(-) diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h index 157b2e40550..fae4fb7a95d 100644 --- a/src/gallium/drivers/radeon/radeon_winsys.h +++ b/src/gallium/drivers/radeon/radeon_winsys.h @@ -28,6 +28,13 @@ /* The public winsys interface header for the radeon driver. */ +/* Whether the next IB can start immediately and not wait for draws and + * dispatches from the current IB to finish. 
*/ +#define RADEON_FLUSH_START_NEXT_GFX_IB_NOW (1u << 31) + +#define RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW \ + (PIPE_FLUSH_ASYNC | RADEON_FLUSH_START_NEXT_GFX_IB_NOW) + #include "pipebuffer/pb_buffer.h" #include "amd/common/ac_gpu_info.h" diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c index b1ed620b0c6..147433b69b6 100644 --- a/src/gallium/drivers/radeonsi/si_gfx_cs.c +++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c @@ -69,11 +69,28 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, { struct radeon_winsys_cs *cs = ctx->gfx_cs; struct radeon_winsys *ws = ctx->ws; + unsigned wait_flags = 0; if (ctx->gfx_flush_in_progress) return; - if (!radeon_emitted(cs, ctx->initial_gfx_cs_size)) + if (ctx->chip_class == VI && ctx->screen->info.drm_minor <= 1) { + /* DRM 3.1.0 doesn't flush TC for VI correctly. */ + wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | + SI_CONTEXT_CS_PARTIAL_FLUSH | + SI_CONTEXT_INV_GLOBAL_L2; + } else if (ctx->chip_class == SI) { + /* The kernel flushes L2 before shaders are finished. */ + wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | + SI_CONTEXT_CS_PARTIAL_FLUSH; + } else if (!(flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW)) { + wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | + SI_CONTEXT_CS_PARTIAL_FLUSH; + } + + /* Drop this flush if it's a no-op. */ + if (!radeon_emitted(cs, ctx->initial_gfx_cs_size) && + (!wait_flags || !ctx->gfx_last_ib_is_busy)) return; if (si_check_device_reset(ctx)) @@ -103,20 +120,17 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, ctx->streamout.suspended = true; } - ctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | - SI_CONTEXT_PS_PARTIAL_FLUSH; - - /* DRM 3.1.0 doesn't flush TC for VI correctly. */ - if (ctx->chip_class == VI && ctx->screen->info.drm_minor <= 1) - ctx->flags |= SI_CONTEXT_INV_GLOBAL_L2 | - SI_CONTEXT_INV_VMEM_L1; - /* Make sure CP DMA is idle at the end of IBs after L2 prefetches * because the kernel doesn't wait for it. 
*/ if (ctx->chip_class >= CIK) si_cp_dma_wait_for_idle(ctx); - si_emit_cache_flush(ctx); + /* Wait for draw calls to finish if needed. */ + if (wait_flags) { + ctx->flags |= wait_flags; + si_emit_cache_flush(ctx); + } + ctx->gfx_last_ib_is_busy = wait_flags == 0; if (ctx->current_saved_cs) { si_trace_emit(ctx); @@ -189,12 +203,21 @@ void si_begin_new_gfx_cs(struct si_context *ctx) if (ctx->is_debug) si_begin_gfx_cs_debug(ctx); - /* Flush read caches at the beginning of CS not flushed by the kernel. */ - if (ctx->chip_class >= CIK) - ctx->flags |= SI_CONTEXT_INV_SMEM_L1 | - SI_CONTEXT_INV_ICACHE; - - ctx->flags |= SI_CONTEXT_START_PIPELINE_STATS; + /* Always invalidate caches at the beginning of IBs, because external + * users (e.g. BO evictions and SDMA/UVD/VCE IBs) can modify our + * buffers. + * + * Note that the cache flush done by the kernel at the end of GFX IBs + * isn't useful here, because that flush can finish after the following + * IB starts drawing. + * + * TODO: Do we also need to invalidate CB & DB caches? + */ + ctx->flags |= SI_CONTEXT_INV_ICACHE | + SI_CONTEXT_INV_SMEM_L1 | + SI_CONTEXT_INV_VMEM_L1 | + SI_CONTEXT_INV_GLOBAL_L2 | + SI_CONTEXT_START_PIPELINE_STATS; /* set all valid group as dirty so they get reemited on * next draw command diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 3a2f7ca11d1..125b3a72bfb 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -559,6 +559,7 @@ struct si_context { uint16_t prefetch_L2_mask; bool gfx_flush_in_progress:1; + bool gfx_last_ib_is_busy:1; bool compute_is_busy:1; unsigned num_gfx_cs_flushes; -- 2.30.2