radeonsi: clean up and deduplicate code around internal compute dispatches

author Marek Olšák <marek.olsak@amd.com>

Sun, 26 Apr 2020 11:22:52 +0000 (07:22 -0400)

committer Marge Bot <eric+marge@anholt.net>

Thu, 30 Apr 2020 22:27:31 +0000 (22:27 +0000)
author Marek Olšák <marek.olsak@amd.com>
Sun, 26 Apr 2020 11:22:52 +0000 (07:22 -0400)
committer Marge Bot <eric+marge@anholt.net>
Thu, 30 Apr 2020 22:27:31 +0000 (22:27 +0000)
diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c

index 0a81c6867e6780c3cd421a9f9368b43ea261f677..a4784fd477f7b8ad034e515288c245acab2b1526 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_compute_blit.c
+++ b/src/gallium/drivers/radeonsi/si_compute_blit.c
@@ -56,8 +56,18 @@ unsigned si_get_flush_flags(struct si_context *sctx, enum si_coherency coher,
     }
  }
  
-static void si_launch_grid_internal(struct si_context *sctx, struct pipe_grid_info *info)
+#define SI_CS_IMAGE_OP        (1 << 0)
+#define SI_CS_WAIT_FOR_IDLE   (1 << 1)
+
+static void si_launch_grid_internal(struct si_context *sctx, struct pipe_grid_info *info,
+                                    void *restore_cs, unsigned flags)
  {
+   /* Wait for previous shaders to finish. */
+   sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | SI_CONTEXT_PS_PARTIAL_FLUSH;
+   /* Invalidate L0-L1 caches. */
+   /* sL0 is never invalidated, because src resources don't use it. */
+   sctx->flags |= SI_CONTEXT_INV_VCACHE;
+
     /* Set settings for driver-internal compute dispatches. */
     sctx->flags &= ~SI_CONTEXT_START_PIPELINE_STATS;
     sctx->flags |= SI_CONTEXT_STOP_PIPELINE_STATS;
@@ -75,6 +85,25 @@ static void si_launch_grid_internal(struct si_context *sctx, struct pipe_grid_in
     sctx->render_cond_force_off = false;
     if (sctx->blitter)
        sctx->blitter->running = false;
+
+   /* Restore the original compute shader. */
+   sctx->b.bind_compute_state(&sctx->b, restore_cs);
+
+   if (flags & SI_CS_WAIT_FOR_IDLE) {
+      sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
+
+      if (flags & SI_CS_IMAGE_OP) {
+         /* Make sure image stores are visible to CB, which doesn't use L2 on GFX6-8. */
+         sctx->flags |= sctx->chip_class <= GFX8 ? SI_CONTEXT_WB_L2 : 0;
+         /* Make sure image stores are visible to all CUs. */
+         sctx->flags |= SI_CONTEXT_INV_VCACHE;
+      } else {
+         /* Make sure buffer stores are visible to all CUs. */
+         sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE;
+      }
+   } else {
+      assert(!(flags & SI_CS_IMAGE_OP));
+   }
  }
  
  static void si_compute_clear_12bytes_buffer(struct si_context *sctx, struct pipe_resource *dst,
@@ -90,8 +119,7 @@ static void si_compute_clear_12bytes_buffer(struct si_context *sctx, struct pipe
     unsigned data[4] = {0};
     memcpy(data, clear_value, 12);
  
-   sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
-                  si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY);
+   sctx->flags |= si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY);
  
     struct pipe_shader_buffer saved_sb = {0};
     si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb);
@@ -131,9 +159,8 @@ static void si_compute_clear_12bytes_buffer(struct si_context *sctx, struct pipe
     info.grid[1] = 1;
     info.grid[2] = 1;
  
-   si_launch_grid_internal(sctx, &info);
+   si_launch_grid_internal(sctx, &info, saved_cs, SI_CS_WAIT_FOR_IDLE);
  
-   ctx->bind_compute_state(ctx, saved_cs);
     ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_sb, saved_writable_mask);
     ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
  
@@ -156,8 +183,7 @@ static void si_compute_do_clear_or_copy(struct si_context *sctx, struct pipe_res
     assert(dst->target != PIPE_BUFFER || dst_offset + size <= dst->width0);
     assert(!src || src_offset + size <= src->width0);
  
-   sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
-                  si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY);
+   sctx->flags |= si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY);
  
     /* Save states. */
     void *saved_cs = sctx->cs_shader_state.program;
@@ -228,16 +254,15 @@ static void si_compute_do_clear_or_copy(struct si_context *sctx, struct pipe_res
        ctx->bind_compute_state(ctx, sctx->cs_clear_buffer);
     }
  
-   si_launch_grid_internal(sctx, &info);
+   si_launch_grid_internal(sctx, &info, saved_cs, SI_CS_WAIT_FOR_IDLE);
  
     enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size);
-   sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | (cache_policy == L2_BYPASS ? SI_CONTEXT_WB_L2 : 0);
+   sctx->flags |= cache_policy == L2_BYPASS ? SI_CONTEXT_WB_L2 : 0;
  
     if (cache_policy != L2_BYPASS)
        si_resource(dst)->TC_L2_dirty = true;
  
     /* Restore states. */
-   ctx->bind_compute_state(ctx, saved_cs);
     ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb, saved_writable_mask);
     for (int i = 0; i < 2; i++)
        pipe_resource_reference(&saved_sb[i].buffer, NULL);
@@ -376,9 +401,6 @@ void si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, u
     if (width == 0 || height == 0)
        return;
  
-   sctx->flags |=
-      SI_CONTEXT_CS_PARTIAL_FLUSH | si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
-
     /* The driver doesn't decompress resources automatically here. */
     si_decompress_subresource(ctx, dst, PIPE_MASK_RGBAZS, dst_level, dstz,
                               dstz + src_box->depth - 1);
@@ -461,11 +483,9 @@ void si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, u
        info.grid[2] = depth;
     }
  
-   si_launch_grid_internal(sctx, &info);
+   si_launch_grid_internal(sctx, &info, saved_cs,
+                           SI_CS_WAIT_FOR_IDLE | SI_CS_IMAGE_OP);
  
-   sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | (sctx->chip_class <= GFX8 ? SI_CONTEXT_WB_L2 : 0) |
-                  si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
-   ctx->bind_compute_state(ctx, saved_cs);
     ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 2, saved_image);
     ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
     for (int i = 0; i < 2; i++)
@@ -477,10 +497,7 @@ void si_retile_dcc(struct si_context *sctx, struct si_texture *tex)
  {
     struct pipe_context *ctx = &sctx->b;
  
-   sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
-                  si_get_flush_flags(sctx, SI_COHERENCY_CB_META, L2_LRU) |
-                  si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_LRU);
-   sctx->emit_cache_flush(sctx);
+   sctx->flags |= si_get_flush_flags(sctx, SI_COHERENCY_CB_META, L2_LRU);
  
     /* Save states. */
     void *saved_cs = sctx->cs_shader_state.program;
@@ -537,14 +554,13 @@ void si_retile_dcc(struct si_context *sctx, struct si_texture *tex)
     info.grid[2] = 1;
     info.last_block[0] = num_threads % 64;
  
-   si_launch_grid_internal(sctx, &info);
+   si_launch_grid_internal(sctx, &info, saved_cs, 0);
  
     /* Don't flush caches or wait. The driver will wait at the end of this IB,
      * and L2 will be flushed by the kernel fence.
      */
  
     /* Restore states. */
-   ctx->bind_compute_state(ctx, saved_cs);
     ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 3, saved_img);
  
     for (unsigned i = 0; i < 3; i++) {
@@ -565,9 +581,6 @@ void si_compute_expand_fmask(struct pipe_context *ctx, struct pipe_resource *tex
     if (tex->nr_samples != tex->nr_storage_samples)
        return;
  
-   /* Flush caches and sync engines. */
-   sctx->flags |=
-      SI_CONTEXT_CS_PARTIAL_FLUSH | si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
     si_make_CB_shader_coherent(sctx, tex->nr_samples, true,
                                true /* DCC is not possible with image stores */);
  
@@ -605,14 +618,10 @@ void si_compute_expand_fmask(struct pipe_context *ctx, struct pipe_resource *tex
     info.grid[1] = DIV_ROUND_UP(tex->height0, 8);
     info.grid[2] = is_array ? tex->array_size : 1;
  
-   si_launch_grid_internal(sctx, &info);
-
-   /* Flush caches and sync engines. */
-   sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | (sctx->chip_class <= GFX8 ? SI_CONTEXT_WB_L2 : 0) |
-                  si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
+   si_launch_grid_internal(sctx, &info, saved_cs,
+                           SI_CS_WAIT_FOR_IDLE | SI_CS_IMAGE_OP);
  
     /* Restore previous states. */
-   ctx->bind_compute_state(ctx, saved_cs);
     ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_image);
     pipe_resource_reference(&saved_image.resource, NULL);
  
@@ -668,8 +677,6 @@ void si_compute_clear_render_target(struct pipe_context *ctx, struct pipe_surfac
  
     sctx->render_cond_force_off = !render_condition_enabled;
  
-   sctx->flags |=
-      SI_CONTEXT_CS_PARTIAL_FLUSH | si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
     si_make_CB_shader_coherent(sctx, dstsurf->texture->nr_samples, true,
                                true /* DCC is not possible with image stores */);
  
@@ -724,11 +731,9 @@ void si_compute_clear_render_target(struct pipe_context *ctx, struct pipe_surfac
        info.grid[2] = 1;
     }
  
-   si_launch_grid_internal(sctx, &info);
+   si_launch_grid_internal(sctx, &info, saved_cs,
+                           SI_CS_WAIT_FOR_IDLE | SI_CS_IMAGE_OP);
  
-   sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | (sctx->chip_class <= GFX8 ? SI_CONTEXT_WB_L2 : 0) |
-                  si_get_flush_flags(sctx, SI_COHERENCY_SHADER, L2_STREAM);
-   ctx->bind_compute_state(ctx, saved_cs);
     ctx->set_shader_images(ctx, PIPE_SHADER_COMPUTE, 0, 1, &saved_image);
     ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, &saved_cb);
     pipe_resource_reference(&saved_image.resource, NULL);
author	Marek Olšák <marek.olsak@amd.com>
	Sun, 26 Apr 2020 11:22:52 +0000 (07:22 -0400)
committer	Marge Bot <eric+marge@anholt.net>
	Thu, 30 Apr 2020 22:27:31 +0000 (22:27 +0000)