From: Bas Nieuwenhuizen Date: Sun, 27 Mar 2016 09:14:34 +0000 (+0200) Subject: radeonsi: do not do two full flushes on every compute dispatch X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=7a92c0842892bf55a82b7d95ab5a3b7dfbb83407;p=mesa.git radeonsi: do not do two full flushes on every compute dispatch v2: Add more CS_PARTIAL_FLUSH events. Essentially every place that waits for pixel shaders to finish also has a write-after-read hazard with compute shaders. Invalidating L2 implicitly waits on pixel and compute shaders, so we don't need a CS_PARTIAL_FLUSH for switching FBO. v3: Add CS_PARTIAL_FLUSH events even if we already have INV_GLOBAL_L2. According to Marek, the INV_GLOBAL_L2 events don't wait for compute shaders to finish, so wait for them explicitly. Signed-off-by: Bas Nieuwenhuizen Reviewed-by: Marek Olšák Reviewed-by: Nicolai Hähnle Reviewed-by: Edward O'Callaghan --- diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c index 921b62cb5a0..105cf8cb99f 100644 --- a/src/gallium/drivers/radeonsi/si_compute.c +++ b/src/gallium/drivers/radeonsi/si_compute.c @@ -441,13 +441,8 @@ static void si_launch_grid( if (!sctx->cs_shader_state.initialized) si_initialize_compute(sctx); - sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 | - SI_CONTEXT_INV_GLOBAL_L2 | - SI_CONTEXT_INV_ICACHE | - SI_CONTEXT_INV_SMEM_L1 | - SI_CONTEXT_FLUSH_WITH_INV_L2 | - SI_CONTEXT_FLAG_COMPUTE; - si_emit_cache_flush(sctx, NULL); + if (sctx->b.flags) + si_emit_cache_flush(sctx, NULL); if (!si_switch_compute_shader(sctx, program, &program->shader, info->pc)) return; @@ -480,14 +475,6 @@ static void si_launch_grid( si_setup_tgsi_grid(sctx, info); si_emit_dispatch_packets(sctx, info); - - sctx->b.flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | - SI_CONTEXT_INV_VMEM_L1 | - SI_CONTEXT_INV_GLOBAL_L2 | - SI_CONTEXT_INV_ICACHE | - SI_CONTEXT_INV_SMEM_L1 | - SI_CONTEXT_FLAG_COMPUTE; - si_emit_cache_flush(sctx, NULL); } diff --git 
a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c index 001ddd4bfae..38e0ee60d64 100644 --- a/src/gallium/drivers/radeonsi/si_cp_dma.c +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c @@ -190,7 +190,8 @@ static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst, uint64_t va = r600_resource(dst)->gpu_address + offset; /* Flush the caches. */ - sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags; + sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | + SI_CONTEXT_CS_PARTIAL_FLUSH | flush_flags; while (size) { unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT); @@ -296,7 +297,8 @@ void si_copy_buffer(struct si_context *sctx, } /* Flush the caches. */ - sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags; + sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | + SI_CONTEXT_CS_PARTIAL_FLUSH | flush_flags; /* This is the main part doing the copying. Src is always aligned. */ main_dst_offset = dst_offset + skipped_size; diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c index 301a865b712..1580e61e17f 100644 --- a/src/gallium/drivers/radeonsi/si_descriptors.c +++ b/src/gallium/drivers/radeonsi/si_descriptors.c @@ -1032,7 +1032,8 @@ static void si_set_streamout_targets(struct pipe_context *ctx, * start writing to the targets. 
*/ if (num_targets) - sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH; + sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | + SI_CONTEXT_CS_PARTIAL_FLUSH; /* Streamout buffers must be bound in 2 places: * 1) in VGT by setting the VGT_STRMOUT registers diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c index 69fecce0308..e3abb7f67cc 100644 --- a/src/gallium/drivers/radeonsi/si_hw_context.c +++ b/src/gallium/drivers/radeonsi/si_hw_context.c @@ -117,6 +117,7 @@ void si_context_gfx_flush(void *context, unsigned flags, ctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER | SI_CONTEXT_INV_VMEM_L1 | SI_CONTEXT_INV_GLOBAL_L2 | + SI_CONTEXT_CS_PARTIAL_FLUSH | /* this is probably not needed anymore */ SI_CONTEXT_PS_PARTIAL_FLUSH; si_emit_cache_flush(ctx, NULL); diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index af9ffdd381f..305a70b9dfa 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -2436,7 +2436,8 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, */ sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 | SI_CONTEXT_INV_GLOBAL_L2 | - SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER; + SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER | + SI_CONTEXT_CS_PARTIAL_FLUSH; /* Take the maximum of the old and new count. If the new count is lower, * dirtying is needed to disable the unbound colorbuffers. @@ -3458,7 +3459,8 @@ static void si_texture_barrier(struct pipe_context *ctx) sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 | SI_CONTEXT_INV_GLOBAL_L2 | - SI_CONTEXT_FLUSH_AND_INV_CB; + SI_CONTEXT_FLUSH_AND_INV_CB | + SI_CONTEXT_CS_PARTIAL_FLUSH; } static void si_memory_barrier(struct pipe_context *ctx, unsigned flags) @@ -3467,7 +3469,8 @@ static void si_memory_barrier(struct pipe_context *ctx, unsigned flags) /* Subsequent commands must wait for all shader invocations to * complete. 
*/ - sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH; + sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | + SI_CONTEXT_CS_PARTIAL_FLUSH; if (flags & PIPE_BARRIER_CONSTANT_BUFFER) sctx->b.flags |= SI_CONTEXT_INV_SMEM_L1 | @@ -3477,7 +3480,8 @@ static void si_memory_barrier(struct pipe_context *ctx, unsigned flags) PIPE_BARRIER_SHADER_BUFFER | PIPE_BARRIER_TEXTURE | PIPE_BARRIER_IMAGE | - PIPE_BARRIER_STREAMOUT_BUFFER)) { + PIPE_BARRIER_STREAMOUT_BUFFER | + PIPE_BARRIER_GLOBAL_BUFFER)) { /* As far as I can tell, L1 contents are written back to L2 * automatically at end of shader, but the contents of other * L1 caches might still be stale. */