radeonsi: do not do two full flushes on every compute dispatch
author Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Sun, 27 Mar 2016 09:14:34 +0000 (11:14 +0200)
committer Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Tue, 19 Apr 2016 16:10:31 +0000 (18:10 +0200)
v2: Add more CS_PARTIAL_FLUSH events.

Essentially every place that waits for pixel shaders to finish also
has a write-after-read hazard with compute shaders.
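As an illustrative sketch only (the buffer and the surrounding
dispatch/clear sequence are hypothetical; the flag names are the real
driver identifiers), the hazard looks like this:

    /* A dispatched compute grid may still be reading `buf` when a
     * CP DMA clear starts writing it: a write-after-read hazard.
     * PS_PARTIAL_FLUSH only waits for pixel shaders, so compute
     * readers have to be waited on explicitly as well. */
    sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |  /* wait for PS readers */
                     SI_CONTEXT_CS_PARTIAL_FLUSH;   /* wait for CS readers */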

Invalidating L2 implicitly waits for pixel and compute shaders,
so we don't need a CS_PARTIAL_FLUSH when switching the FBO.

v3: Add CS_PARTIAL_FLUSH events even if we already have INV_GLOBAL_L2.

According to Marek, the INV_GLOBAL_L2 events don't wait for compute
shaders to finish, so wait for them explicitly.
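As a minimal sketch of the resulting pattern (mirroring the
si_set_framebuffer_state hunk below; the exact flag set varies per
call site):

    sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
                     SI_CONTEXT_INV_GLOBAL_L2 |
                     SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER |
                     /* INV_GLOBAL_L2 alone does not wait for compute
                      * shaders, so request the wait explicitly: */
                     SI_CONTEXT_CS_PARTIAL_FLUSH;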

Signed-off-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
src/gallium/drivers/radeonsi/si_compute.c
src/gallium/drivers/radeonsi/si_cp_dma.c
src/gallium/drivers/radeonsi/si_descriptors.c
src/gallium/drivers/radeonsi/si_hw_context.c
src/gallium/drivers/radeonsi/si_state.c

index 921b62cb5a0ac346bef938cb5208b3e0cb261701..105cf8cb99fab6bfe811010f1d1480aa01aeba3b 100644
@@ -441,13 +441,8 @@ static void si_launch_grid(
        if (!sctx->cs_shader_state.initialized)
                si_initialize_compute(sctx);
 
-       sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
-                        SI_CONTEXT_INV_GLOBAL_L2 |
-                        SI_CONTEXT_INV_ICACHE |
-                        SI_CONTEXT_INV_SMEM_L1 |
-                        SI_CONTEXT_FLUSH_WITH_INV_L2 |
-                        SI_CONTEXT_FLAG_COMPUTE;
-       si_emit_cache_flush(sctx, NULL);
+       if (sctx->b.flags)
+               si_emit_cache_flush(sctx, NULL);
 
        if (!si_switch_compute_shader(sctx, program, &program->shader, info->pc))
                return;
@@ -480,14 +475,6 @@ static void si_launch_grid(
                si_setup_tgsi_grid(sctx, info);
 
        si_emit_dispatch_packets(sctx, info);
-
-       sctx->b.flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
-                        SI_CONTEXT_INV_VMEM_L1 |
-                        SI_CONTEXT_INV_GLOBAL_L2 |
-                        SI_CONTEXT_INV_ICACHE |
-                        SI_CONTEXT_INV_SMEM_L1 |
-                        SI_CONTEXT_FLAG_COMPUTE;
-       si_emit_cache_flush(sctx, NULL);
 }
 
 
index 001ddd4bfae38a4e6a1bb4aeb90859e4bd478594..38e0ee60d648ec1e1ce0993cb61746f1334c1d9c 100644
@@ -190,7 +190,8 @@ static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
        uint64_t va = r600_resource(dst)->gpu_address + offset;
 
        /* Flush the caches. */
-       sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags;
+       sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+                        SI_CONTEXT_CS_PARTIAL_FLUSH | flush_flags;
 
        while (size) {
                unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
@@ -296,7 +297,8 @@ void si_copy_buffer(struct si_context *sctx,
        }
 
        /* Flush the caches. */
-       sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags;
+       sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+                        SI_CONTEXT_CS_PARTIAL_FLUSH | flush_flags;
 
        /* This is the main part doing the copying. Src is always aligned. */
        main_dst_offset = dst_offset + skipped_size;
index 301a865b712491b5e6c6575e0a2df3ecb58dc19e..1580e61e17f837056e62a01d94cbf8e72a9787e5 100644
@@ -1032,7 +1032,8 @@ static void si_set_streamout_targets(struct pipe_context *ctx,
         * start writing to the targets.
         */
        if (num_targets)
-               sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
+               sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+                                SI_CONTEXT_CS_PARTIAL_FLUSH;
 
        /* Streamout buffers must be bound in 2 places:
         * 1) in VGT by setting the VGT_STRMOUT registers
index 69fecce0308911c3e4040c95e09e808d7465c51e..e3abb7f67cc4179a616dac816228e94318e64ac4 100644
@@ -117,6 +117,7 @@ void si_context_gfx_flush(void *context, unsigned flags,
        ctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER |
                        SI_CONTEXT_INV_VMEM_L1 |
                        SI_CONTEXT_INV_GLOBAL_L2 |
+                       SI_CONTEXT_CS_PARTIAL_FLUSH |
                        /* this is probably not needed anymore */
                        SI_CONTEXT_PS_PARTIAL_FLUSH;
        si_emit_cache_flush(ctx, NULL);
index af9ffdd381fd4b4cbcb4e536af7ada06f2a4a8fa..305a70b9dfaba0fc38ced3eac64feab66e26cfd9 100644
@@ -2436,7 +2436,8 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
         */
        sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
                         SI_CONTEXT_INV_GLOBAL_L2 |
-                        SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
+                        SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER |
+                        SI_CONTEXT_CS_PARTIAL_FLUSH;
 
        /* Take the maximum of the old and new count. If the new count is lower,
         * dirtying is needed to disable the unbound colorbuffers.
@@ -3458,7 +3459,8 @@ static void si_texture_barrier(struct pipe_context *ctx)
 
        sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
                         SI_CONTEXT_INV_GLOBAL_L2 |
-                        SI_CONTEXT_FLUSH_AND_INV_CB;
+                        SI_CONTEXT_FLUSH_AND_INV_CB |
+                        SI_CONTEXT_CS_PARTIAL_FLUSH;
 }
 
 static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
@@ -3467,7 +3469,8 @@ static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
 
        /* Subsequent commands must wait for all shader invocations to
         * complete. */
-       sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
+       sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+                        SI_CONTEXT_CS_PARTIAL_FLUSH;
 
        if (flags & PIPE_BARRIER_CONSTANT_BUFFER)
                sctx->b.flags |= SI_CONTEXT_INV_SMEM_L1 |
@@ -3477,7 +3480,8 @@ static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
                     PIPE_BARRIER_SHADER_BUFFER |
                     PIPE_BARRIER_TEXTURE |
                     PIPE_BARRIER_IMAGE |
-                    PIPE_BARRIER_STREAMOUT_BUFFER)) {
+                    PIPE_BARRIER_STREAMOUT_BUFFER |
+                    PIPE_BARRIER_GLOBAL_BUFFER)) {
                /* As far as I can tell, L1 contents are written back to L2
                 * automatically at end of shader, but the contents of other
                 * L1 caches might still be stale. */