From: Marek Olšák Date: Tue, 30 Dec 2014 15:45:51 +0000 (+0100) Subject: radeonsi: improve and fix streamout flushing X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=ca9c5b2be5ed6aa34032264432bb5465d37641ed;p=mesa.git radeonsi: improve and fix streamout flushing - we don't usually need to flush TC L2 - we should flush KCACHE (not really an issue now since we always flush KCACHE when updating descriptors, but it could be a problem if we used CE, which doesn't require flushing KCACHE) - add an explicit VS_PARTIAL_FLUSH flag Reviewed-by: Michel Dänzer --- diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c index 454e12cc835..1644ec7a8d3 100644 --- a/src/gallium/drivers/radeonsi/si_descriptors.c +++ b/src/gallium/drivers/radeonsi/si_descriptors.c @@ -856,6 +856,36 @@ static void si_set_streamout_targets(struct pipe_context *ctx, unsigned old_num_targets = sctx->b.streamout.num_targets; unsigned i, bufidx; + /* We are going to unbind the buffers. Mark which caches need to be flushed. */ + if (sctx->b.streamout.num_targets && sctx->b.streamout.begin_emitted) { + /* Since streamout uses vector writes which go through TC L2 + * and most other clients can use TC L2 as well, we don't need + * to flush it. + * + * The only case which requires flushing it is VGT DMA index + * fetching, which is a rare case. Thus, flag the TC L2 + * dirtiness in the resource and handle it when index fetching + * is used. + */ + for (i = 0; i < sctx->b.streamout.num_targets; i++) + if (sctx->b.streamout.targets[i]) + r600_resource(sctx->b.streamout.targets[i]->b.buffer)->TC_L2_dirty = true; + + /* Invalidate the scalar cache in case a streamout buffer is + * going to be used as a constant buffer. + * + * Invalidate TC L1, because streamout bypasses it (done by + * setting GLC=1 in the store instruction), but it can contain + * outdated data of streamout buffers. + * + * VS_PARTIAL_FLUSH is required if the buffers are going to be + * used as an input immediately. + */ + sctx->b.flags |= SI_CONTEXT_INV_KCACHE | + SI_CONTEXT_INV_TC_L1 | + SI_CONTEXT_VS_PARTIAL_FLUSH; + } + /* Streamout buffers must be bound in 2 places: * 1) in VGT by setting the VGT_STRMOUT registers * 2) as shader resources diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 287537fe8bd..dfb1cd6e065 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -65,13 +65,14 @@ #define SI_CONTEXT_FLUSH_AND_INV_DB (R600_CONTEXT_PRIVATE_FLAG << 6) #define SI_CONTEXT_FLUSH_AND_INV_CB (R600_CONTEXT_PRIVATE_FLAG << 7) /* Engine synchronization. */ -#define SI_CONTEXT_PS_PARTIAL_FLUSH (R600_CONTEXT_PRIVATE_FLAG << 8) -#define SI_CONTEXT_CS_PARTIAL_FLUSH (R600_CONTEXT_PRIVATE_FLAG << 9) -#define SI_CONTEXT_VGT_FLUSH (R600_CONTEXT_PRIVATE_FLAG << 10) -#define SI_CONTEXT_VGT_STREAMOUT_SYNC (R600_CONTEXT_PRIVATE_FLAG << 11) +#define SI_CONTEXT_VS_PARTIAL_FLUSH (R600_CONTEXT_PRIVATE_FLAG << 8) +#define SI_CONTEXT_PS_PARTIAL_FLUSH (R600_CONTEXT_PRIVATE_FLAG << 9) +#define SI_CONTEXT_CS_PARTIAL_FLUSH (R600_CONTEXT_PRIVATE_FLAG << 10) +#define SI_CONTEXT_VGT_FLUSH (R600_CONTEXT_PRIVATE_FLAG << 11) +#define SI_CONTEXT_VGT_STREAMOUT_SYNC (R600_CONTEXT_PRIVATE_FLAG << 12) /* Compute only. */ -#define SI_CONTEXT_FLUSH_WITH_INV_L2 (R600_CONTEXT_PRIVATE_FLAG << 12) /* TODO: merge with TC? */ -#define SI_CONTEXT_FLAG_COMPUTE (R600_CONTEXT_PRIVATE_FLAG << 13) +#define SI_CONTEXT_FLUSH_WITH_INV_L2 (R600_CONTEXT_PRIVATE_FLAG << 13) /* TODO: merge with TC? */ +#define SI_CONTEXT_FLAG_COMPUTE (R600_CONTEXT_PRIVATE_FLAG << 14) #define SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER (SI_CONTEXT_FLUSH_AND_INV_CB | \ SI_CONTEXT_FLUSH_AND_INV_CB_META | \ diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index e6916c10f8f..3703e5f3d58 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -388,9 +388,9 @@ void si_emit_cache_flush(struct r600_common_context *sctx, struct r600_atom *ato cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1); } - if (sctx->flags & (SI_CONTEXT_INV_TC_L1 | R600_CONTEXT_STREAMOUT_FLUSH)) + if (sctx->flags & SI_CONTEXT_INV_TC_L1) cp_coher_cntl |= S_0085F0_TCL1_ACTION_ENA(1); - if (sctx->flags & (SI_CONTEXT_INV_TC_L2 | R600_CONTEXT_STREAMOUT_FLUSH)) + if (sctx->flags & SI_CONTEXT_INV_TC_L2) cp_coher_cntl |= S_0085F0_TC_ACTION_ENA(1); if (sctx->flags & SI_CONTEXT_FLUSH_AND_INV_CB) { @@ -444,8 +444,7 @@ void si_emit_cache_flush(struct r600_common_context *sctx, struct r600_atom *ato if (sctx->flags & SI_CONTEXT_PS_PARTIAL_FLUSH) { radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0) | compute); radeon_emit(cs, EVENT_TYPE(V_028A90_PS_PARTIAL_FLUSH) | EVENT_INDEX(4)); - } else if (sctx->flags & R600_CONTEXT_STREAMOUT_FLUSH) { - /* Needed if streamout buffers are going to be used as a source. */ + } else if (sctx->flags & SI_CONTEXT_VS_PARTIAL_FLUSH) { radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0) | compute); radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4)); }