From 44d27fd6fba92a3b11c10d06f1b3babeffbcd4bb Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Sun, 26 Apr 2020 07:22:52 -0400 Subject: [PATCH] radeonsi: clean up and deduplicate code around internal compute dispatches In addition to the cleanup, there are these changes in behavior: - clear_render_target waits for idle after the dispatch and then flushes L0-L1 caches (this was missing) - sL0 is no longer invalidated before the dispatch, because src resources don't use it - sL0 is no longer invalidated after the dispatch if dst is an image Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- .../drivers/radeonsi/si_compute_blit.c | 81 ++++++++++--------- 1 file changed, 43 insertions(+), 38 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c index 0a81c6867e6..a4784fd477f 100644 --- a/src/gallium/drivers/radeonsi/si_compute_blit.c +++ b/src/gallium/drivers/radeonsi/si_compute_blit.c @@ -56,8 +56,18 @@ unsigned si_get_flush_flags(struct si_context *sctx, enum si_coherency coher, } } -static void si_launch_grid_internal(struct si_context *sctx, struct pipe_grid_info *info) +#define SI_CS_IMAGE_OP (1 << 0) +#define SI_CS_WAIT_FOR_IDLE (1 << 1) + +static void si_launch_grid_internal(struct si_context *sctx, struct pipe_grid_info *info, + void *restore_cs, unsigned flags) { + /* Wait for previous shaders to finish. */ + sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | SI_CONTEXT_PS_PARTIAL_FLUSH; + /* Invalidate L0-L1 caches. */ + /* sL0 is never invalidated, because src resources don't use it. */ + sctx->flags |= SI_CONTEXT_INV_VCACHE; + /* Set settings for driver-internal compute dispatches. */ sctx->flags &= ~SI_CONTEXT_START_PIPELINE_STATS; sctx->flags |= SI_CONTEXT_STOP_PIPELINE_STATS; @@ -75,6 +85,25 @@ static void si_launch_grid_internal(struct si_context *sctx, struct pipe_grid_in sctx->render_cond_force_off = false; if (sctx->blitter) sctx->blitter->running = false; + + /* Restore the original compute shader. */ + sctx->b.bind_compute_state(&sctx->b, restore_cs); + + if (flags & SI_CS_WAIT_FOR_IDLE) { + sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH; + + if (flags & SI_CS_IMAGE_OP) { + /* Make sure image stores are visible to CB, which doesn't use L2 on GFX6-8. */ + sctx->flags |= sctx->chip_class <= GFX8 ? SI_CONTEXT_WB_L2 : 0; + /* Make sure image stores are visible to all CUs. */ + sctx->flags |= SI_CONTEXT_INV_VCACHE; + } else { + /* Make sure buffer stores are visible to all CUs. */ + sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE; + } + } else { + assert(!(flags & SI_CS_IMAGE_OP)); + } } static void si_compute_clear_12bytes_buffer(struct si_context *sctx, struct pipe_resource *dst, @@ -90,8 +119,7 @@ static void si_compute_clear_12bytes_buffer(struct si_context *sctx, struct pipe unsigned data[4] = {0}; memcpy(data, clear_value, 12); - sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH | - si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY); + sctx->flags |= si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY); struct pipe_shader_buffer saved_sb = {0}; si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb); @@ -131,9 +159,8 @@ static void si_compute_clear_12bytes_buffer(struct si_context *sctx, struct pipe info.grid[1] = 1; info.grid[2] = 1; - si_launch_grid_internal(sctx, &info); + si_launch_grid_internal(sctx, &info, saved_cs, SI_CS_WAIT_FOR_IDLE); - ctx->bind_compute_state(ctx, saved_cs); ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb, saved_writable_mask); ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb); @@ -156,8 +183,7 @@ static void si_compute_do_clear_or_copy(struct si_context *sctx, struct pipe_res assert(dst->target != PIPE_BUFFER || dst_offset + size <= dst->width0); assert(!src || src_offset + size <= src->width0); - sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH | - si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY); + sctx->flags |= si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY); /* Save states. */ void *saved_cs = sctx->cs_shader_state.program; @@ -228,16 +254,15 @@ static void si_compute_do_clear_or_copy(struct si_context *sctx, struct pipe_res ctx->bind_compute_state(ctx, sctx->cs_clear_buffer); } - si_launch_grid_internal(sctx, &info); + si_launch_grid_internal(sctx, &info, saved_cs, SI_CS_WAIT_FOR_IDLE); enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size); - sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | (cache_policy == L2_BYPASS ? SI_CONTEXT_WB_L2 : 0); + sctx->flags |= cache_policy == L2_BYPASS ? SI_CONTEXT_WB_L2 : 0; if (cache_policy != L2_BYPASS) si_resource(dst)->TC_L2_dirty = true; /* Restore states. */ - ctx->bind_compute_state(ctx, saved_cs); ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb, saved_writable_mask); for (int i = 0; i < 2; i++) pipe_resource_reference(&saved_sb[i].buffer, NULL); @@ -376,9 +401,6 @@ void si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, u if (width == 0 || height == 0) return; - sctx->flags |= - SI_CONTEXT_CS_PARTIAL_FLUSH | si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM); - /* The driver doesn't decompress resources automatically here. */ si_decompress_subresource(ctx, dst, PIPE_MASK_RGBAZS, dst_level, dstz, dstz + src_box->depth - 1); @@ -461,11 +483,9 @@ void si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, u info.grid[2] = depth; } - si_launch_grid_internal(sctx, &info); + si_launch_grid_internal(sctx, &info, saved_cs, + SI_CS_WAIT_FOR_IDLE | SI_CS_IMAGE_OP); - sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | (sctx->chip_class <= GFX8 ? SI_CONTEXT_WB_L2 : 0) | - si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM); - ctx->bind_compute_state(ctx, saved_cs); ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, saved_image); ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb); for (int i = 0; i < 2; i++) @@ -477,10 +497,7 @@ void si_retile_dcc(struct si_context *sctx, struct si_texture *tex) { struct pipe_context *ctx = &sctx->b; - sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH | - si_get_flush_flags(sctx, SI_COHERENCY_CB_META, L2_LRU) | - si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_LRU); - sctx->emit_cache_flush(sctx); + sctx->flags |= si_get_flush_flags(sctx, SI_COHERENCY_CB_META, L2_LRU); /* Save states. */ void *saved_cs = sctx->cs_shader_state.program; @@ -537,14 +554,13 @@ void si_retile_dcc(struct si_context *sctx, struct si_texture *tex) info.grid[2] = 1; info.last_block[0] = num_threads % 64; - si_launch_grid_internal(sctx, &info); + si_launch_grid_internal(sctx, &info, saved_cs, 0); /* Don't flush caches or wait. The driver will wait at the end of this IB, * and L2 will be flushed by the kernel fence. */ /* Restore states. */ - ctx->bind_compute_state(ctx, saved_cs); ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 3, saved_img); for (unsigned i = 0; i < 3; i++) { @@ -565,9 +581,6 @@ void si_compute_expand_fmask(struct pipe_context *ctx, struct pipe_resource *tex if (tex->nr_samples != tex->nr_storage_samples) return; - /* Flush caches and sync engines. */ - sctx->flags |= - SI_CONTEXT_CS_PARTIAL_FLUSH | si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM); si_make_CB_shader_coherent(sctx, tex->nr_samples, true, true /* DCC is not possible with image stores */); @@ -605,14 +618,10 @@ void si_compute_expand_fmask(struct pipe_context *ctx, struct pipe_resource *tex info.grid[1] = DIV_ROUND_UP(tex->height0, 8); info.grid[2] = is_array ? tex->array_size : 1; - si_launch_grid_internal(sctx, &info); - - /* Flush caches and sync engines. */ - sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | (sctx->chip_class <= GFX8 ? SI_CONTEXT_WB_L2 : 0) | - si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM); + si_launch_grid_internal(sctx, &info, saved_cs, + SI_CS_WAIT_FOR_IDLE | SI_CS_IMAGE_OP); /* Restore previous states. */ - ctx->bind_compute_state(ctx, saved_cs); ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_image); pipe_resource_reference(&saved_image.resource, NULL); @@ -668,8 +677,6 @@ void si_compute_clear_render_target(struct pipe_context *ctx, struct pipe_surfac sctx->render_cond_force_off = !render_condition_enabled; - sctx->flags |= - SI_CONTEXT_CS_PARTIAL_FLUSH | si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM); si_make_CB_shader_coherent(sctx, dstsurf->texture->nr_samples, true, true /* DCC is not possible with image stores */); @@ -724,11 +731,9 @@ void si_compute_clear_render_target(struct pipe_context *ctx, struct pipe_surfac info.grid[2] = 1; } - si_launch_grid_internal(sctx, &info); + si_launch_grid_internal(sctx, &info, saved_cs, + SI_CS_WAIT_FOR_IDLE | SI_CS_IMAGE_OP); - sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | (sctx->chip_class <= GFX8 ? SI_CONTEXT_WB_L2 : 0) | - si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM); - ctx->bind_compute_state(ctx, saved_cs); ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_image); ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb); pipe_resource_reference(&saved_image.resource, NULL); -- 2.30.2