From 7bb9bb0540cf5ce59040f278e0fc3dc68d6fceeb Mon Sep 17 00:00:00 2001
From: =?utf8?q?Nicolai=20H=C3=A4hnle?=
Date: Thu, 16 Nov 2017 13:33:36 +0100
Subject: [PATCH] radeonsi/gfx10: implement gfx10_emit_cache_flush

Acked-by: Bas Nieuwenhuizen
---
 src/gallium/drivers/radeonsi/si_pipe.c       |   6 +-
 src/gallium/drivers/radeonsi/si_pipe.h       |   6 +-
 src/gallium/drivers/radeonsi/si_state.h      |   1 +
 src/gallium/drivers/radeonsi/si_state_draw.c | 185 +++++++++++++++++++
 4 files changed, 195 insertions(+), 3 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 65a027fe928..a290d16b02d 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -487,7 +487,11 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
 		goto fail;
 
 	/* Initialize context functions used by graphics and compute. */
-	sctx->emit_cache_flush = si_emit_cache_flush;
+	if (sctx->chip_class >= GFX10)
+		sctx->emit_cache_flush = gfx10_emit_cache_flush;
+	else
+		sctx->emit_cache_flush = si_emit_cache_flush;
+
 	sctx->b.emit_string_marker = si_emit_string_marker;
 	sctx->b.set_debug_callback = si_set_debug_callback;
 	sctx->b.set_log_context = si_set_log_context;
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 162adf05b60..588de59e0cd 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -65,9 +65,11 @@
 #define SI_CONTEXT_FLUSH_FOR_RENDER_COND (1 << 2)
 /* Instruction cache. */
 #define SI_CONTEXT_INV_ICACHE (1 << 3)
-/* Scalar L1 cache. */
+/* Scalar cache. (GFX6-9: scalar L1; GFX10: scalar L0)
+ * GFX10: This also invalidates the L1 shader array cache. */
 #define SI_CONTEXT_INV_SCACHE (1 << 4)
-/* Vector L1 cache. */
+/* Vector cache. (GFX6-9: vector L1; GFX10: vector L0)
+ * GFX10: This also invalidates the L1 shader array cache. */
 #define SI_CONTEXT_INV_VCACHE (1 << 5)
 /* L2 cache + L2 metadata cache writeback & invalidate.
  * GFX6-8: Used by shaders only. GFX9-10: Used by everything. */
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index b55c99fdd4c..c271effe053 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -596,6 +596,7 @@ void si_shader_selector_key_vs(struct si_context *sctx,
 void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs,
 			  unsigned cp_coher_cntl);
 void si_prim_discard_signal_next_compute_ib_start(struct si_context *sctx);
+void gfx10_emit_cache_flush(struct si_context *sctx);
 void si_emit_cache_flush(struct si_context *sctx);
 void si_trace_emit(struct si_context *sctx);
 void si_init_draw_functions(struct si_context *sctx);
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index a2e7fa93659..9f6397854bf 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -951,6 +951,191 @@ void si_prim_discard_signal_next_compute_ib_start(struct si_context *sctx)
 	*sctx->last_pkt3_write_data = PKT3(PKT3_NOP, 3, 0);
 }
 
+void gfx10_emit_cache_flush(struct si_context *ctx)
+{
+	struct radeon_cmdbuf *cs = ctx->gfx_cs;
+	uint32_t gcr_cntl = 0;
+	unsigned cb_db_event = 0;
+	unsigned flags = ctx->flags;
+
+	if (!ctx->has_graphics) {
+		/* Only process compute flags. */
+		flags &= SI_CONTEXT_INV_ICACHE |
+			 SI_CONTEXT_INV_SCACHE |
+			 SI_CONTEXT_INV_VCACHE |
+			 SI_CONTEXT_INV_L2 |
+			 SI_CONTEXT_WB_L2 |
+			 SI_CONTEXT_INV_L2_METADATA |
+			 SI_CONTEXT_CS_PARTIAL_FLUSH;
+	}
+
+	/* We don't need these. */
+	assert(!(flags & (SI_CONTEXT_VGT_FLUSH |
+			  SI_CONTEXT_VGT_STREAMOUT_SYNC |
+			  SI_CONTEXT_FLUSH_AND_INV_DB_META)));
+
+	if (flags & SI_CONTEXT_FLUSH_AND_INV_CB)
+		ctx->num_cb_cache_flushes++;
+	if (flags & SI_CONTEXT_FLUSH_AND_INV_DB)
+		ctx->num_db_cache_flushes++;
+
+	if (flags & SI_CONTEXT_INV_ICACHE)
+		gcr_cntl |= S_586_GLI_INV(V_586_GLI_ALL);
+	if (flags & SI_CONTEXT_INV_SCACHE) {
+		/* TODO: When writing to the SMEM L1 cache, we need to set SEQ
+		 * to FORWARD when both L1 and L2 are written out (WB or INV).
+		 */
+		gcr_cntl |= S_586_GL1_INV(1) | S_586_GLK_INV(1);
+	}
+	if (flags & SI_CONTEXT_INV_VCACHE)
+		gcr_cntl |= S_586_GL1_INV(1) | S_586_GLV_INV(1);
+	if (flags & SI_CONTEXT_INV_L2) {
+		/* Writeback and invalidate everything in L2. */
+		gcr_cntl |= S_586_GL2_INV(1) | S_586_GLM_INV(1);
+		ctx->num_L2_invalidates++;
+	} else if (flags & SI_CONTEXT_WB_L2) {
+		/* Writeback but do not invalidate. */
+		gcr_cntl |= S_586_GL2_WB(1);
+	}
+	if (flags & SI_CONTEXT_INV_L2_METADATA)
+		gcr_cntl |= S_586_GLM_INV(1);
+
+	if (flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB)) {
+		if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
+			/* Flush CMASK/FMASK/DCC. Will wait for idle later. */
+			radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+			radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_META) |
+					EVENT_INDEX(0));
+		}
+		if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) {
+			/* Flush HTILE. Will wait for idle later. */
+			radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+			radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_DB_META) |
+					EVENT_INDEX(0));
+		}
+
+		/* First flush CB/DB, then L1/L2. */
+		gcr_cntl |= S_586_SEQ(V_586_SEQ_FORWARD);
+
+		if ((flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB)) ==
+		    (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB)) {
+			cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT;
+		} else if (flags & SI_CONTEXT_FLUSH_AND_INV_CB) {
+			cb_db_event = V_028A90_FLUSH_AND_INV_CB_DATA_TS;
+		} else if (flags & SI_CONTEXT_FLUSH_AND_INV_DB) {
+			cb_db_event = V_028A90_FLUSH_AND_INV_DB_DATA_TS;
+		} else {
+			assert(0);
+		}
+	} else {
+		/* Wait for graphics shaders to go idle if requested. */
+		if (flags & SI_CONTEXT_PS_PARTIAL_FLUSH) {
+			radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+			radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+			/* Only count explicit shader flushes, not implicit ones. */
+			ctx->num_vs_flushes++;
+			ctx->num_ps_flushes++;
+		} else if (flags & SI_CONTEXT_VS_PARTIAL_FLUSH) {
+			radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+			radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
+			ctx->num_vs_flushes++;
+		}
+	}
+
+	if (flags & SI_CONTEXT_CS_PARTIAL_FLUSH && ctx->compute_is_busy) {
+		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+		radeon_emit(cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH | EVENT_INDEX(4)));
+		ctx->num_cs_flushes++;
+		ctx->compute_is_busy = false;
+	}
+
+	if (cb_db_event) {
+		/* CB/DB flush and invalidate (or possibly just a wait for a
+		 * meta flush) via RELEASE_MEM.
+		 *
+		 * Combine this with other cache flushes when possible; this
+		 * requires affected shaders to be idle, so do it after the
+		 * CS_PARTIAL_FLUSH before (VS/PS partial flushes are always
+		 * implied).
+		 */
+		uint64_t va;
+
+		/* Do the flush (enqueue the event and wait for it). */
+		va = ctx->wait_mem_scratch->gpu_address;
+		ctx->wait_mem_number++;
+
+		/* Get GCR_CNTL fields, because the encoding is different in RELEASE_MEM. */
+		unsigned glm_wb = G_586_GLM_WB(gcr_cntl);
+		unsigned glm_inv = G_586_GLM_INV(gcr_cntl);
+		unsigned glv_inv = G_586_GLV_INV(gcr_cntl);
+		unsigned gl1_inv = G_586_GL1_INV(gcr_cntl);
+		assert(G_586_GL2_US(gcr_cntl) == 0);
+		assert(G_586_GL2_RANGE(gcr_cntl) == 0);
+		assert(G_586_GL2_DISCARD(gcr_cntl) == 0);
+		unsigned gl2_inv = G_586_GL2_INV(gcr_cntl);
+		unsigned gl2_wb = G_586_GL2_WB(gcr_cntl);
+		unsigned gcr_seq = G_586_SEQ(gcr_cntl);
+
+		gcr_cntl &= C_586_GLM_WB &
+			    C_586_GLM_INV &
+			    C_586_GLV_INV &
+			    C_586_GL1_INV &
+			    C_586_GL2_INV &
+			    C_586_GL2_WB; /* keep SEQ */
+
+		si_cp_release_mem(ctx, cs, cb_db_event,
+				  S_490_GLM_WB(glm_wb) |
+				  S_490_GLM_INV(glm_inv) |
+				  S_490_GLV_INV(glv_inv) |
+				  S_490_GL1_INV(gl1_inv) |
+				  S_490_GL2_INV(gl2_inv) |
+				  S_490_GL2_WB(gl2_wb) |
+				  S_490_SEQ(gcr_seq),
+				  EOP_DST_SEL_MEM,
+				  EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM,
+				  EOP_DATA_SEL_VALUE_32BIT,
+				  ctx->wait_mem_scratch, va,
+				  ctx->wait_mem_number, SI_NOT_QUERY);
+		si_cp_wait_mem(ctx, ctx->gfx_cs, va, ctx->wait_mem_number, 0xffffffff,
+			       WAIT_REG_MEM_EQUAL);
+	}
+
+	/* Ignore fields that only modify the behavior of other fields. */
+	if (gcr_cntl & C_586_GL1_RANGE & C_586_GL2_RANGE & C_586_SEQ) {
+		/* Flush caches and wait for the caches to assert idle.
+		 * The cache flush is executed in the ME, but the PFP waits
+		 * for completion.
+		 */
+		radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 6, 0));
+		radeon_emit(cs, 0);          /* CP_COHER_CNTL */
+		radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */
+		radeon_emit(cs, 0xffffff);   /* CP_COHER_SIZE_HI */
+		radeon_emit(cs, 0);          /* CP_COHER_BASE */
+		radeon_emit(cs, 0);          /* CP_COHER_BASE_HI */
+		radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */
+		radeon_emit(cs, gcr_cntl);   /* GCR_CNTL */
+	} else if (cb_db_event ||
+		   (flags & (SI_CONTEXT_VS_PARTIAL_FLUSH |
+			     SI_CONTEXT_PS_PARTIAL_FLUSH |
+			     SI_CONTEXT_CS_PARTIAL_FLUSH))) {
+		/* We need to ensure that PFP waits as well. */
+		radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
+		radeon_emit(cs, 0);
+	}
+
+	if (flags & SI_CONTEXT_START_PIPELINE_STATS) {
+		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+		radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_START) |
+				EVENT_INDEX(0));
+	} else if (flags & SI_CONTEXT_STOP_PIPELINE_STATS) {
+		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+		radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_STOP) |
+				EVENT_INDEX(0));
+	}
+
+	ctx->flags = 0;
+}
+
 void si_emit_cache_flush(struct si_context *sctx)
 {
 	struct radeon_cmdbuf *cs = sctx->gfx_cs;
-- 
2.30.2