From 8cdce30cc20983dcb971dd906a9a9007e282081d Mon Sep 17 00:00:00 2001
From: Marek Olšák
Date: Mon, 10 Oct 2016 18:49:22 +0200
Subject: [PATCH] radeonsi: implement TC L2 write-back (flush) without cache
 invalidation
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Reviewed-by: Nicolai Hähnle
---
 src/gallium/drivers/radeonsi/si_pipe.h       | 21 ++---
 src/gallium/drivers/radeonsi/si_state_draw.c | 81 +++++++++++++++-----
 2 files changed, 74 insertions(+), 28 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 3cefee7ad2f..e10d3fb4fc5 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -50,17 +50,20 @@
 #define SI_CONTEXT_INV_VMEM_L1		(R600_CONTEXT_PRIVATE_FLAG << 2)
 /* Used by everything except CB/DB, can be bypassed (SLC=1). Other names: TC L2 */
 #define SI_CONTEXT_INV_GLOBAL_L2	(R600_CONTEXT_PRIVATE_FLAG << 3)
+/* Write dirty L2 lines back to memory (shader and CP DMA stores), but don't
+ * invalidate L2. SI-CIK can't do it, so they will do complete invalidation. */
+#define SI_CONTEXT_WRITEBACK_GLOBAL_L2	(R600_CONTEXT_PRIVATE_FLAG << 4)
 /* Framebuffer caches. */
-#define SI_CONTEXT_FLUSH_AND_INV_CB_META (R600_CONTEXT_PRIVATE_FLAG << 4)
-#define SI_CONTEXT_FLUSH_AND_INV_DB_META (R600_CONTEXT_PRIVATE_FLAG << 5)
-#define SI_CONTEXT_FLUSH_AND_INV_DB	(R600_CONTEXT_PRIVATE_FLAG << 6)
-#define SI_CONTEXT_FLUSH_AND_INV_CB	(R600_CONTEXT_PRIVATE_FLAG << 7)
+#define SI_CONTEXT_FLUSH_AND_INV_CB_META (R600_CONTEXT_PRIVATE_FLAG << 5)
+#define SI_CONTEXT_FLUSH_AND_INV_DB_META (R600_CONTEXT_PRIVATE_FLAG << 6)
+#define SI_CONTEXT_FLUSH_AND_INV_DB	(R600_CONTEXT_PRIVATE_FLAG << 7)
+#define SI_CONTEXT_FLUSH_AND_INV_CB	(R600_CONTEXT_PRIVATE_FLAG << 8)
 /* Engine synchronization. */
-#define SI_CONTEXT_VS_PARTIAL_FLUSH	(R600_CONTEXT_PRIVATE_FLAG << 8)
-#define SI_CONTEXT_PS_PARTIAL_FLUSH	(R600_CONTEXT_PRIVATE_FLAG << 9)
-#define SI_CONTEXT_CS_PARTIAL_FLUSH	(R600_CONTEXT_PRIVATE_FLAG << 10)
-#define SI_CONTEXT_VGT_FLUSH		(R600_CONTEXT_PRIVATE_FLAG << 11)
-#define SI_CONTEXT_VGT_STREAMOUT_SYNC	(R600_CONTEXT_PRIVATE_FLAG << 12)
+#define SI_CONTEXT_VS_PARTIAL_FLUSH	(R600_CONTEXT_PRIVATE_FLAG << 9)
+#define SI_CONTEXT_PS_PARTIAL_FLUSH	(R600_CONTEXT_PRIVATE_FLAG << 10)
+#define SI_CONTEXT_CS_PARTIAL_FLUSH	(R600_CONTEXT_PRIVATE_FLAG << 11)
+#define SI_CONTEXT_VGT_FLUSH		(R600_CONTEXT_PRIVATE_FLAG << 12)
+#define SI_CONTEXT_VGT_STREAMOUT_SYNC	(R600_CONTEXT_PRIVATE_FLAG << 13)
 
 #define SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER (SI_CONTEXT_FLUSH_AND_INV_CB | \
 					      SI_CONTEXT_FLUSH_AND_INV_CB_META | \
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index 38e5cb45785..33b6b232343 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -696,6 +696,19 @@ static void si_emit_draw_packets(struct si_context *sctx,
 	}
 }
 
+static void si_emit_surface_sync(struct r600_common_context *rctx,
+				 unsigned cp_coher_cntl)
+{
+	struct radeon_winsys_cs *cs = rctx->gfx.cs;
+
+	/* ACQUIRE_MEM is only required on a compute ring. */
+	radeon_emit(cs, PKT3(PKT3_SURFACE_SYNC, 3, 0));
+	radeon_emit(cs, cp_coher_cntl);	/* CP_COHER_CNTL */
+	radeon_emit(cs, 0xffffffff);	/* CP_COHER_SIZE */
+	radeon_emit(cs, 0);		/* CP_COHER_BASE */
+	radeon_emit(cs, 0x0000000A);	/* POLL_INTERVAL */
+}
+
 void si_emit_cache_flush(struct si_context *sctx)
 {
 	struct r600_common_context *rctx = &sctx->b;
@@ -715,15 +728,6 @@ void si_emit_cache_flush(struct si_context *sctx)
 	if (rctx->flags & SI_CONTEXT_INV_SMEM_L1)
 		cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1);
 
-	if (rctx->flags & SI_CONTEXT_INV_VMEM_L1)
-		cp_coher_cntl |= S_0085F0_TCL1_ACTION_ENA(1);
-	if (rctx->flags & SI_CONTEXT_INV_GLOBAL_L2) {
-		cp_coher_cntl |= S_0085F0_TC_ACTION_ENA(1);
-
-		if (rctx->chip_class >= VI)
-			cp_coher_cntl |= S_0301F0_TC_WB_ACTION_ENA(1);
-	}
-
 	if (rctx->flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
 		cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1) |
 				 S_0085F0_CB0_DEST_BASE_ENA(1) |
@@ -806,23 +810,62 @@ void si_emit_cache_flush(struct si_context *sctx)
 	/* Make sure ME is idle (it executes most packets) before continuing.
 	 * This prevents read-after-write hazards between PFP and ME.
 	 */
-	if (cp_coher_cntl || (rctx->flags & SI_CONTEXT_CS_PARTIAL_FLUSH)) {
+	if (cp_coher_cntl ||
+	    (rctx->flags & (SI_CONTEXT_CS_PARTIAL_FLUSH |
+			    SI_CONTEXT_INV_VMEM_L1 |
+			    SI_CONTEXT_INV_GLOBAL_L2 |
+			    SI_CONTEXT_WRITEBACK_GLOBAL_L2))) {
 		radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
 		radeon_emit(cs, 0);
 	}
 
-	/* When one of the DEST_BASE flags is set, SURFACE_SYNC waits for idle.
-	 * Therefore, it should be last. Done in PFP.
+	/* When one of the CP_COHER_CNTL.DEST_BASE flags is set, SURFACE_SYNC
+	 * waits for idle. Therefore, it should be last. SURFACE_SYNC is done
+	 * in PFP.
+	 *
+	 * cp_coher_cntl should contain all necessary flags except TC flags
+	 * at this point.
+	 *
+	 * SI-CIK don't support L2 write-back.
 	 */
-	if (cp_coher_cntl) {
-		/* ACQUIRE_MEM is only required on a compute ring. */
-		radeon_emit(cs, PKT3(PKT3_SURFACE_SYNC, 3, 0));
-		radeon_emit(cs, cp_coher_cntl);	/* CP_COHER_CNTL */
-		radeon_emit(cs, 0xffffffff);	/* CP_COHER_SIZE */
-		radeon_emit(cs, 0);		/* CP_COHER_BASE */
-		radeon_emit(cs, 0x0000000A);	/* POLL_INTERVAL */
+	if (rctx->flags & SI_CONTEXT_INV_GLOBAL_L2 ||
+	    (rctx->chip_class <= CIK &&
+	     (rctx->flags & SI_CONTEXT_WRITEBACK_GLOBAL_L2))) {
+		/* Invalidate L1 & L2. (L1 is always invalidated)
+		 * WB must be set on VI+ when TC_ACTION is set.
+		 */
+		si_emit_surface_sync(rctx, cp_coher_cntl |
+				     S_0085F0_TC_ACTION_ENA(1) |
+				     S_0301F0_TC_WB_ACTION_ENA(rctx->chip_class >= VI));
+		cp_coher_cntl = 0;
+	} else {
+		/* L1 invalidation and L2 writeback must be done separately,
+		 * because both operations can't be done together.
+		 */
+		if (rctx->flags & SI_CONTEXT_WRITEBACK_GLOBAL_L2) {
+			/* WB = write-back
+			 * NC = apply to non-coherent MTYPEs
+			 *      (i.e. MTYPE <= 1, which is what we use everywhere)
+			 *
+			 * WB doesn't work without NC.
+			 */
+			si_emit_surface_sync(rctx, cp_coher_cntl |
					     S_0301F0_TC_WB_ACTION_ENA(1) |
+					     S_0301F0_TC_NC_ACTION_ENA(1));
+			cp_coher_cntl = 0;
+		}
+		if (rctx->flags & SI_CONTEXT_INV_VMEM_L1) {
+			/* Invalidate per-CU VMEM L1. */
+			si_emit_surface_sync(rctx, cp_coher_cntl |
+					     S_0085F0_TCL1_ACTION_ENA(1));
+			cp_coher_cntl = 0;
+		}
 	}
 
+	/* If TC flushes haven't cleared this... */
+	if (cp_coher_cntl)
+		si_emit_surface_sync(rctx, cp_coher_cntl);
+
 	if (rctx->flags & R600_CONTEXT_START_PIPELINE_STATS) {
 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
 		radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_START) |
-- 
2.30.2
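
For reference, the new si_emit_surface_sync() helper places a five-dword PM4 type-3 SURFACE_SYNC packet on the gfx ring: header, CP_COHER_CNTL, CP_COHER_SIZE (0xffffffff = full range), CP_COHER_BASE and POLL_INTERVAL. The standalone sketch below builds the same five dwords into an array and prints them; the PKT3() encoding and the 0x43 opcode follow my reading of the driver headers and should be treated as illustrative, not authoritative.

/* Standalone sketch of the packet built by si_emit_surface_sync();
 * not driver code, just the dword layout. */
#include <stdint.h>
#include <stdio.h>

#define PKT_TYPE_S(x)         (((unsigned)(x) & 0x3) << 30)
#define PKT_COUNT_S(x)        (((unsigned)(x) & 0x3FFF) << 16)
#define PKT3_IT_OPCODE_S(x)   (((unsigned)(x) & 0xFF) << 8)
#define PKT3(op, count, pred) (PKT_TYPE_S(3) | PKT_COUNT_S(count) | \
                               PKT3_IT_OPCODE_S(op) | (pred))
#define PKT3_SURFACE_SYNC     0x43   /* opcode per the driver headers */

/* Build the five dwords of a SURFACE_SYNC packet; returns the dword count. */
static unsigned build_surface_sync(uint32_t *out, uint32_t cp_coher_cntl)
{
	unsigned n = 0;

	out[n++] = PKT3(PKT3_SURFACE_SYNC, 3, 0); /* count = body dwords - 1 */
	out[n++] = cp_coher_cntl;                 /* CP_COHER_CNTL: cache action bits */
	out[n++] = 0xffffffff;                    /* CP_COHER_SIZE: full range */
	out[n++] = 0;                             /* CP_COHER_BASE */
	out[n++] = 0x0000000A;                    /* POLL_INTERVAL */
	return n;
}

int main(void)
{
	uint32_t pkt[5];
	unsigned n = build_surface_sync(pkt, 0 /* no cache actions, sync only */);

	for (unsigned i = 0; i < n; i++)
		printf("0x%08x\n", pkt[i]);
	return 0;
}

As the comment in the helper notes, this form is only valid on the gfx ring; a compute ring would have to use ACQUIRE_MEM instead.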
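
The core of the change is which SURFACE_SYNC packets si_emit_cache_flush() now emits for a given combination of TC flags and chip class. The program below is a simplified, hypothetical model of that selection logic only: register field macros are replaced by strings, packet emission is a printf() stub, and the other CP_COHER_CNTL bits that the real function ORs into the first TC packet are left out.

/* Hypothetical, standalone model of the TC flush selection in the patch;
 * not driver code. It only shows which SURFACE_SYNC packets would be
 * emitted for a given flag combination and chip class. */
#include <stdio.h>

enum chip_class { SI, CIK, VI };   /* illustrative ordering only */

#define SI_CONTEXT_INV_VMEM_L1         (1u << 0)
#define SI_CONTEXT_INV_GLOBAL_L2       (1u << 1)
#define SI_CONTEXT_WRITEBACK_GLOBAL_L2 (1u << 2)

static void emit_surface_sync(const char *cp_coher_cntl_bits)
{
	printf("  SURFACE_SYNC: %s\n", cp_coher_cntl_bits);
}

static void emit_tc_flushes(enum chip_class chip, unsigned flags)
{
	if (flags & SI_CONTEXT_INV_GLOBAL_L2 ||
	    (chip <= CIK && (flags & SI_CONTEXT_WRITEBACK_GLOBAL_L2))) {
		/* Full L2 invalidation; L1 is invalidated along with it.
		 * SI-CIK can't write back without invalidating, so a
		 * write-back request degrades to this path there. On VI+
		 * the WB bit is set too so dirty lines aren't lost. */
		emit_surface_sync(chip >= VI ? "TC_ACTION | TC_WB_ACTION"
					     : "TC_ACTION");
		return;
	}

	/* VI+ only: L2 write-back without invalidation, and L1 invalidation,
	 * as two separate packets because they can't be combined. */
	if (flags & SI_CONTEXT_WRITEBACK_GLOBAL_L2)
		emit_surface_sync("TC_WB_ACTION | TC_NC_ACTION");
	if (flags & SI_CONTEXT_INV_VMEM_L1)
		emit_surface_sync("TCL1_ACTION");
}

int main(void)
{
	printf("VI, write-back only:\n");
	emit_tc_flushes(VI, SI_CONTEXT_WRITEBACK_GLOBAL_L2);

	printf("CIK, write-back only (falls back to full invalidation):\n");
	emit_tc_flushes(CIK, SI_CONTEXT_WRITEBACK_GLOBAL_L2);

	printf("VI, L1 invalidation + L2 write-back:\n");
	emit_tc_flushes(VI, SI_CONTEXT_INV_VMEM_L1 | SI_CONTEXT_WRITEBACK_GLOBAL_L2);
	return 0;
}

The point of the new SI_CONTEXT_WRITEBACK_GLOBAL_L2 flag is the middle case: on VI and later, dirty L2 lines are written back to memory without throwing the rest of the cache away, whereas SI and CIK lack that capability and degrade to a full invalidation.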