From: Marek Olšák <marek.olsak@amd.com>
Date: Fri, 24 Aug 2018 04:29:04 +0000 (-0400)
Subject: radeonsi: use compute shaders for clear_buffer & copy_buffer
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=9b331e462e5021d994859756d46cd2519d9c9c6e;p=mesa.git

radeonsi: use compute shaders for clear_buffer & copy_buffer

Fast color clears should be much faster. Also, fast color clears on
evicted buffers should be 200x faster on GFX8 and older.
---

diff --git a/src/gallium/drivers/radeonsi/Makefile.sources b/src/gallium/drivers/radeonsi/Makefile.sources
index abdc4e07f1e..aeb9b7982c4 100644
--- a/src/gallium/drivers/radeonsi/Makefile.sources
+++ b/src/gallium/drivers/radeonsi/Makefile.sources
@@ -11,6 +11,7 @@ C_SOURCES := \
 	si_clear.c \
 	si_compute.c \
 	si_compute.h \
+	si_compute_blit.c \
 	si_cp_dma.c \
 	si_debug.c \
 	si_descriptors.c \
diff --git a/src/gallium/drivers/radeonsi/meson.build b/src/gallium/drivers/radeonsi/meson.build
index 4d6044f724b..2542f136d11 100644
--- a/src/gallium/drivers/radeonsi/meson.build
+++ b/src/gallium/drivers/radeonsi/meson.build
@@ -27,6 +27,7 @@ files_libradeonsi = files(
   'si_clear.c',
   'si_compute.c',
   'si_compute.h',
+  'si_compute_blit.c',
   'si_cp_dma.c',
   'si_debug.c',
   'si_descriptors.c',
diff --git a/src/gallium/drivers/radeonsi/si_clear.c b/src/gallium/drivers/radeonsi/si_clear.c
index 4e07de81bac..520e5b94f4a 100644
--- a/src/gallium/drivers/radeonsi/si_clear.c
+++ b/src/gallium/drivers/radeonsi/si_clear.c
@@ -256,7 +256,7 @@ void vi_dcc_clear_level(struct si_context *sctx,
 	}
 
 	si_clear_buffer(sctx, dcc_buffer, dcc_offset, clear_size,
-			clear_value, SI_COHERENCY_CB_META);
+			&clear_value, 4, SI_COHERENCY_CB_META);
 }
 
 /* Set the same micro tile mode as the destination of the last MSAA resolve.
@@ -487,9 +487,10 @@ static void si_do_fast_color_clear(struct si_context *sctx,
 				if (eliminate_needed)
 					continue;
 
+				uint32_t clear_value = 0xCCCCCCCC;
 				si_clear_buffer(sctx, &tex->cmask_buffer->b.b,
 						tex->cmask_offset, tex->surface.cmask_size,
-						0xCCCCCCCC, SI_COHERENCY_CB_META);
+						&clear_value, 4, SI_COHERENCY_CB_META);
 				need_decompress_pass = true;
 			}
 
@@ -518,9 +519,10 @@ static void si_do_fast_color_clear(struct si_context *sctx,
 				continue;
 
 			/* Do the fast clear. */
+			uint32_t clear_value = 0;
 			si_clear_buffer(sctx, &tex->cmask_buffer->b.b,
-					tex->cmask_offset, tex->surface.cmask_size, 0,
-					SI_COHERENCY_CB_META);
+					tex->cmask_offset, tex->surface.cmask_size,
+					&clear_value, 4, SI_COHERENCY_CB_META);
 			need_decompress_pass = true;
 		}
 
diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c
new file mode 100644
index 00000000000..20e4f591fbb
--- /dev/null
+++ b/src/gallium/drivers/radeonsi/si_compute_blit.c
@@ -0,0 +1,285 @@
+/*
+ * Copyright 2018 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "si_pipe.h"
+
+/* Choose the L2 cache policy for a transfer of "size" bytes with coherency "coher".
+ * Note: Compute shaders always use SI_COMPUTE_DST_CACHE_POLICY for dst
+ * and L2_STREAM for src. */
+static enum si_cache_policy get_cache_policy(struct si_context *sctx,
+					     enum si_coherency coher,
+					     uint64_t size)
+{
+	bool meta_or_cp = coher == SI_COHERENCY_CB_META || coher == SI_COHERENCY_CP;
+
+	if (!(sctx->chip_class >= GFX9 && meta_or_cp) &&
+	    !(sctx->chip_class >= CIK && coher == SI_COHERENCY_SHADER))
+		return L2_BYPASS;
+
+	return size > 256 * 1024 ? L2_STREAM : L2_LRU; /* LRU up to 256 KB, else stream */
+}
+
+/* Return the SI_CONTEXT_* cache flush flags for the given coherency.
+ * NONE, CP and unrecognized values need no flush. */
+unsigned si_get_flush_flags(struct si_context *sctx, enum si_coherency coher,
+			    enum si_cache_policy cache_policy)
+{
+	if (coher == SI_COHERENCY_CB_META)
+		return SI_CONTEXT_FLUSH_AND_INV_CB;
+	if (coher == SI_COHERENCY_SHADER) {
+		unsigned flags = SI_CONTEXT_INV_SMEM_L1 | SI_CONTEXT_INV_VMEM_L1;
+
+		if (cache_policy == L2_BYPASS) /* L2 is invalidated only when bypassed */
+			flags |= SI_CONTEXT_INV_GLOBAL_L2;
+		return flags;
+	}
+	return 0;
+}
+
+/* Clear dst (src == NULL) or copy src to dst (src != NULL) with a compute shader.
+ * Offsets and size are in bytes and must be multiples of 4. For clears,
+ * clear_value holds clear_value_size (4, 8 or 16) bytes of fill pattern.
+ * The bound compute shader and shader buffers are saved and restored. */
+static void si_compute_do_clear_or_copy(struct si_context *sctx,
+					struct pipe_resource *dst, unsigned dst_offset,
+					struct pipe_resource *src, unsigned src_offset,
+					unsigned size, const uint32_t *clear_value,
+					unsigned clear_value_size, enum si_coherency coher)
+{
+	struct pipe_context *ctx = &sctx->b;
+
+	assert(src_offset % 4 == 0);
+	assert(dst_offset % 4 == 0);
+	assert(size % 4 == 0);
+
+	assert(dst->target != PIPE_BUFFER || dst_offset + size <= dst->width0);
+	assert(!src || src_offset + size <= src->width0);
+
+	sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+		       SI_CONTEXT_CS_PARTIAL_FLUSH |
+		       si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY);
+	si_emit_cache_flush(sctx);
+
+	/* Save states. */
+	void *saved_cs = sctx->cs_shader_state.program;
+	struct pipe_shader_buffer saved_sb[2] = {};
+	si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb);
+
+	/* The memory accesses are coalesced, meaning that the 1st instruction writes
+	 * the 1st contiguous block of data for the whole wave, the 2nd instruction
+	 * writes the 2nd contiguous block of data, etc. */
+	unsigned dwords_per_thread = src ? SI_COMPUTE_COPY_DW_PER_THREAD :
+					   SI_COMPUTE_CLEAR_DW_PER_THREAD;
+	unsigned instructions_per_thread = MAX2(1, dwords_per_thread / 4);
+	unsigned dwords_per_instruction = dwords_per_thread / instructions_per_thread;
+	unsigned dwords_per_wave = dwords_per_thread * 64;
+
+	unsigned num_dwords = size / 4;
+	unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);
+
+	struct pipe_grid_info info = {};
+	info.block[0] = MIN2(64, num_instructions);
+	info.block[1] = 1;
+	info.block[2] = 1;
+	info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
+	info.grid[1] = 1;
+	info.grid[2] = 1;
+
+	struct pipe_shader_buffer sb[2] = {};
+	sb[0].buffer = dst;
+	sb[0].buffer_offset = dst_offset;
+	sb[0].buffer_size = size;
+
+	if (src) {
+		sb[1].buffer = src;
+		sb[1].buffer_offset = src_offset;
+		sb[1].buffer_size = size;
+
+		ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 2, sb);
+		ctx->bind_compute_state(ctx, sctx->cs_copy_buffer);
+	} else {
+		assert(clear_value_size >= 4 &&
+		       clear_value_size <= 16 &&
+		       util_is_power_of_two_or_zero(clear_value_size));
+
+		for (unsigned i = 0; i < 4; i++)
+			sctx->cs_user_data[i] = clear_value[i % (clear_value_size / 4)];
+
+		ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, sb);
+		ctx->bind_compute_state(ctx, sctx->cs_clear_buffer);
+	}
+
+	ctx->launch_grid(ctx, &info);
+
+	enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size);
+	sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
+		       (cache_policy == L2_BYPASS ? SI_CONTEXT_WRITEBACK_GLOBAL_L2 : 0);
+	if (cache_policy != L2_BYPASS)
+		r600_resource(dst)->TC_L2_dirty = true;
+
+	/* Restore states, then drop the references held by saved_sb so they don't leak. */
+	ctx->bind_compute_state(ctx, saved_cs);
+	ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb);
+	for (unsigned i = 0; i < 2; i++)
+		pipe_resource_reference(&saved_sb[i].buffer, NULL);
+}
+
+/* Fill "size" bytes of dst at "offset" with a repeating clear_value_size-byte
+ * pattern read from clear_value. 1- and 2-byte patterns are widened to a dword,
+ * 12-byte patterns go through the blitter, and everything else uses the clear
+ * compute shader or CP DMA. A tail of fewer than 4 bytes is written with
+ * pipe_buffer_write. */
+void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
+		     uint64_t offset, uint64_t size, uint32_t *clear_value,
+		     uint32_t clear_value_size, enum si_coherency coher)
+{
+	if (size == 0)
+		return;
+
+	unsigned clear_alignment = MIN2(clear_value_size, 4);
+
+	assert(clear_value_size != 3 && clear_value_size != 6); /* 12 is allowed. */
+	assert(offset % clear_alignment == 0);
+	assert(size % clear_alignment == 0);
+	assert(size < (UINT_MAX & ~0xf)); /* TODO: test 64-bit sizes in all codepaths */
+
+	/* Lower a large fill to a dword fill when all of its dwords are equal. */
+	if (clear_value_size > 4) {
+		unsigned num_dwords = clear_value_size / 4;
+		unsigned i = 1;
+
+		while (i < num_dwords && clear_value[i] == clear_value[0])
+			i++;
+		if (i == num_dwords)
+			clear_value_size = 4;
+	}
+
+	/* Widen a 1- or 2-byte value by replicating it across a dword. */
+	uint32_t tmp_clear_value;
+	if (clear_value_size <= 2) {
+		if (clear_value_size == 1)
+			tmp_clear_value = *(uint8_t*)clear_value * 0x01010101u;
+		else
+			tmp_clear_value = *(uint16_t*)clear_value * 0x00010001u;
+		clear_value = &tmp_clear_value;
+		clear_value_size = 4;
+	}
+
+	/* Use transform feedback for 12-byte clears. */
+	/* TODO: Use compute. */
+	if (clear_value_size == 12) {
+		union pipe_color_union so_clear_value;
+
+		memcpy(&so_clear_value, clear_value, clear_value_size);
+		si_blitter_begin(sctx, SI_DISABLE_RENDER_COND);
+		util_blitter_clear_buffer(sctx->blitter, dst, offset, size,
+					  clear_value_size / 4, &so_clear_value);
+		si_blitter_end(sctx);
+		return;
+	}
+
+	/* Clear the dword-aligned part of the range with compute or CP DMA. */
+	uint64_t aligned_size = size & ~3ull;
+	if (aligned_size != 0) {
+		/* Before GFX9, CP DMA was very slow when clearing GTT, so never
+		 * use CP DMA clears on those chips, because we can't be certain
+		 * about buffer placements.
+		 */
+		bool dword_clear_by_compute = clear_value_size == 4 &&
+					      offset % 4 == 0 &&
+					      (size > 32*1024 ||
+					       sctx->chip_class <= VI);
+
+		if (clear_value_size > 4 || dword_clear_by_compute) {
+			si_compute_do_clear_or_copy(sctx, dst, offset, NULL, 0,
+						    aligned_size, clear_value,
+						    clear_value_size, coher);
+		} else {
+			assert(clear_value_size == 4);
+			enum si_cache_policy cache_policy =
+				get_cache_policy(sctx, coher, size);
+			si_cp_dma_clear_buffer(sctx, dst, offset, aligned_size,
+					       *clear_value, coher, cache_policy);
+		}
+
+		offset += aligned_size;
+		size -= aligned_size;
+	}
+
+	/* Handle non-dword alignment. */
+	if (!size)
+		return;
+
+	assert(dst);
+	assert(dst->target == PIPE_BUFFER);
+	assert(size < 4);
+
+	pipe_buffer_write(&sctx->b, dst, offset, size, clear_value);
+}
+
+/* pipe_context::clear_buffer entry point. Buffers flagged with
+ * SI_RESOURCE_FLAG_SO_FILLED_SIZE are cleared with CP coherency, all other
+ * buffers with shader coherency. */
+static void si_pipe_clear_buffer(struct pipe_context *ctx,
+				 struct pipe_resource *dst,
+				 unsigned offset, unsigned size,
+				 const void *clear_value,
+				 int clear_value_size)
+{
+	struct si_context *sctx = (struct si_context*)ctx;
+	bool so_filled_size = (dst->flags & SI_RESOURCE_FLAG_SO_FILLED_SIZE) != 0;
+	enum si_coherency coher = so_filled_size ? SI_COHERENCY_CP : SI_COHERENCY_SHADER;
+
+	si_clear_buffer(sctx, dst, offset, size, (uint32_t*)clear_value,
+			clear_value_size, coher);
+}
+
+/* Copy "size" bytes from src to dst with shader coherency. Dword-aligned copies
+ * above 32 KB between VRAM buffers on dGPUs use compute; the rest uses CP DMA. */
+void si_copy_buffer(struct si_context *sctx,
+		    struct pipe_resource *dst, struct pipe_resource *src,
+		    uint64_t dst_offset, uint64_t src_offset, unsigned size)
+{
+	if (size == 0)
+		return;
+
+	enum si_coherency coher = SI_COHERENCY_SHADER;
+	bool use_compute = sctx->screen->info.has_dedicated_vram &&
+			   r600_resource(dst)->domains & RADEON_DOMAIN_VRAM &&
+			   r600_resource(src)->domains & RADEON_DOMAIN_VRAM &&
+			   size > 32 * 1024 &&
+			   dst_offset % 4 == 0 && src_offset % 4 == 0 && size % 4 == 0;
+
+	if (!use_compute)
+		si_cp_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset, size,
+				      0, coher, get_cache_policy(sctx, coher, size));
+	else
+		si_compute_do_clear_or_copy(sctx, dst, dst_offset, src, src_offset,
+					    size, NULL, 0, coher);
+}
+
+void si_init_compute_blit_functions(struct si_context *sctx)
+{
+	sctx->b.clear_buffer = si_pipe_clear_buffer; /* install the pipe_context clear_buffer hook */
+}
diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
index c1ecd5fb3e8..839b31b7fdf 100644
--- a/src/gallium/drivers/radeonsi/si_cp_dma.c
+++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
@@ -25,12 +25,6 @@
 #include "si_pipe.h"
 #include "sid.h"
 
-/* Recommended maximum sizes for optimal performance.
- * Fall back to compute or SDMA if the size is greater.
- */
-#define CP_DMA_COPY_PERF_THRESHOLD	(64 * 1024) /* copied from Vulkan */
-#define CP_DMA_CLEAR_PERF_THRESHOLD	(32 * 1024) /* guess (clear is much slower) */
-
 /* Set this if you want the ME to wait until CP DMA is done.
  * It should be set on the last CP DMA packet. */
 #define CP_DMA_SYNC		(1 << 0)
@@ -155,35 +149,6 @@ void si_cp_dma_wait_for_idle(struct si_context *sctx)
 	si_emit_cp_dma(sctx, 0, 0, 0, CP_DMA_SYNC, L2_BYPASS);
 }
 
-static unsigned get_flush_flags(struct si_context *sctx, enum si_coherency coher,
-				enum si_cache_policy cache_policy)
-{
-	switch (coher) {
-	default:
-	case SI_COHERENCY_NONE:
-		return 0;
-	case SI_COHERENCY_SHADER:
-		assert(sctx->chip_class != SI || cache_policy == L2_BYPASS);
-		return SI_CONTEXT_INV_SMEM_L1 |
-		       SI_CONTEXT_INV_VMEM_L1 |
-		       (cache_policy == L2_BYPASS ? SI_CONTEXT_INV_GLOBAL_L2 : 0);
-	case SI_COHERENCY_CB_META:
-		assert(sctx->chip_class >= GFX9 ? cache_policy != L2_BYPASS :
-						  cache_policy == L2_BYPASS);
-		return SI_CONTEXT_FLUSH_AND_INV_CB;
-	}
-}
-
-static enum si_cache_policy get_cache_policy(struct si_context *sctx,
-					     enum si_coherency coher)
-{
-	if ((sctx->chip_class >= GFX9 && coher == SI_COHERENCY_CB_META) ||
-	    (sctx->chip_class >= CIK && coher == SI_COHERENCY_SHADER))
-		return L2_LRU;
-
-	return L2_BYPASS;
-}
-
 static void si_cp_dma_prepare(struct si_context *sctx, struct pipe_resource *dst,
 			      struct pipe_resource *src, unsigned byte_count,
 			      uint64_t remaining_size, unsigned user_flags,
@@ -262,7 +227,7 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
 	/* Flush the caches. */
 	sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
 		       SI_CONTEXT_CS_PARTIAL_FLUSH |
-		       get_flush_flags(sctx, coher, cache_policy);
+		       si_get_flush_flags(sctx, coher, cache_policy);
 
 	while (size) {
 		unsigned byte_count = MIN2(size, cp_dma_max_byte_count(sctx));
@@ -286,122 +251,6 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
 		sctx->num_cp_dma_calls++;
 }
 
-/* dst == NULL means GDS. */
-void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
-		     uint64_t offset, uint64_t size, unsigned value,
-		     enum si_coherency coher)
-{
-	struct radeon_winsys *ws = sctx->ws;
-	struct r600_resource *rdst = r600_resource(dst);
-	enum si_cache_policy cache_policy = get_cache_policy(sctx, coher);
-	uint64_t dma_clear_size;
-
-	if (!size)
-		return;
-
-	dma_clear_size = size & ~3ull;
-
-	/* dma_clear_buffer can use clear_buffer on failure. Make sure that
-	 * doesn't happen. We don't want an infinite recursion: */
-	if (sctx->dma_cs &&
-	    !(dst->flags & PIPE_RESOURCE_FLAG_SPARSE) &&
-	    (offset % 4 == 0) &&
-	    /* CP DMA is very slow. Always use SDMA for big clears. This
-	     * alone improves DeusEx:MD performance by 70%. */
-	    (size > CP_DMA_CLEAR_PERF_THRESHOLD ||
-	     /* Buffers not used by the GFX IB yet will be cleared by SDMA.
-	      * This happens to move most buffer clears to SDMA, including
-	      * DCC and CMASK clears, because pipe->clear clears them before
-	      * si_emit_framebuffer_state (in a draw call) adds them.
-	      * For example, DeusEx:MD has 21 buffer clears per frame and all
-	      * of them are moved to SDMA thanks to this. */
-	     !ws->cs_is_buffer_referenced(sctx->gfx_cs, rdst->buf,
-				          RADEON_USAGE_READWRITE))) {
-		si_sdma_clear_buffer(sctx, dst, offset, dma_clear_size, value);
-
-		offset += dma_clear_size;
-		size -= dma_clear_size;
-	} else if (dma_clear_size >= 4) {
-		si_cp_dma_clear_buffer(sctx, dst, offset, dma_clear_size, value,
-				       coher, cache_policy);
-
-		offset += dma_clear_size;
-		size -= dma_clear_size;
-	}
-
-	if (size) {
-		/* Handle non-dword alignment.
-		 *
-		 * This function is called for embedded texture metadata clears,
-		 * but those should always be properly aligned. */
-		assert(dst);
-		assert(dst->target == PIPE_BUFFER);
-		assert(size < 4);
-
-		pipe_buffer_write(&sctx->b, dst, offset, size, &value);
-	}
-}
-
-static void si_pipe_clear_buffer(struct pipe_context *ctx,
-				 struct pipe_resource *dst,
-				 unsigned offset, unsigned size,
-				 const void *clear_value_ptr,
-				 int clear_value_size)
-{
-	struct si_context *sctx = (struct si_context*)ctx;
-	uint32_t dword_value;
-	unsigned i;
-
-	assert(offset % clear_value_size == 0);
-	assert(size % clear_value_size == 0);
-
-	if (clear_value_size > 4) {
-		const uint32_t *u32 = clear_value_ptr;
-		bool clear_dword_duplicated = true;
-
-		/* See if we can lower large fills to dword fills. */
-		for (i = 1; i < clear_value_size / 4; i++)
-			if (u32[0] != u32[i]) {
-				clear_dword_duplicated = false;
-				break;
-			}
-
-		if (!clear_dword_duplicated) {
-			/* Use transform feedback for 64-bit, 96-bit, and
-			 * 128-bit fills.
-			 */
-			union pipe_color_union clear_value;
-
-			memcpy(&clear_value, clear_value_ptr, clear_value_size);
-			si_blitter_begin(sctx, SI_DISABLE_RENDER_COND);
-			util_blitter_clear_buffer(sctx->blitter, dst, offset,
-						  size, clear_value_size / 4,
-						  &clear_value);
-			si_blitter_end(sctx);
-			return;
-		}
-	}
-
-	/* Expand the clear value to a dword. */
-	switch (clear_value_size) {
-	case 1:
-		dword_value = *(uint8_t*)clear_value_ptr;
-		dword_value |= (dword_value << 8) |
-			       (dword_value << 16) |
-			       (dword_value << 24);
-		break;
-	case 2:
-		dword_value = *(uint16_t*)clear_value_ptr;
-		dword_value |= dword_value << 16;
-		break;
-	default:
-		dword_value = *(uint32_t*)clear_value_ptr;
-	}
-
-	si_clear_buffer(sctx, dst, offset, size, dword_value,
-			SI_COHERENCY_SHADER);
-}
-
 /**
  * Realign the CP DMA engine. This must be done after a copy with an unaligned
  * size.
@@ -509,7 +358,7 @@ void si_cp_dma_copy_buffer(struct si_context *sctx,
 	if ((dst || src) && !(user_flags & SI_CPDMA_SKIP_GFX_SYNC)) {
 		sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
 			       SI_CONTEXT_CS_PARTIAL_FLUSH |
-			       get_flush_flags(sctx, coher, cache_policy);
+			       si_get_flush_flags(sctx, coher, cache_policy);
 	}
 
 	/* This is the main part doing the copying. Src is always aligned. */
@@ -549,26 +398,12 @@ void si_cp_dma_copy_buffer(struct si_context *sctx,
 		si_cp_dma_realign_engine(sctx, realign_size, user_flags, coher,
 					 cache_policy, &is_first);
 	}
-}
-
-void si_copy_buffer(struct si_context *sctx,
-		    struct pipe_resource *dst, struct pipe_resource *src,
-		    uint64_t dst_offset, uint64_t src_offset, unsigned size)
-{
-	enum si_coherency coher = SI_COHERENCY_SHADER;
-	enum si_cache_policy cache_policy = get_cache_policy(sctx, coher);
-
-	if (!size)
-		return;
-
-	si_cp_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset, size,
-			      0, coher, cache_policy);
 
-	if (cache_policy != L2_BYPASS)
+	if (dst && cache_policy != L2_BYPASS)
 		r600_resource(dst)->TC_L2_dirty = true;
 
-	/* If it's not a prefetch... */
-	if (dst_offset != src_offset)
+	/* If it's not a prefetch or GDS copy... */
+	if (dst && src && (dst != src || dst_offset != src_offset))
 		sctx->num_cp_dma_calls++;
 }
 
@@ -744,8 +579,3 @@ void si_test_gds(struct si_context *sctx)
 	pipe_resource_reference(&dst, NULL);
 	exit(0);
 }
-
-void si_init_cp_dma_functions(struct si_context *sctx)
-{
-	sctx->b.clear_buffer = si_pipe_clear_buffer;
-}
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 4b481b47af3..9d25748df40 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -195,6 +195,10 @@ static void si_destroy_context(struct pipe_context *context)
 		sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_color_layered);
 	if (sctx->vs_blit_texcoord)
 		sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_texcoord);
+	if (sctx->cs_clear_buffer)
+		sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_buffer);
+	if (sctx->cs_copy_buffer)
+		sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_buffer);
 
 	if (sctx->blitter)
 		util_blitter_destroy(sctx->blitter);
@@ -416,7 +420,8 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
 
 	sctx->allocator_zeroed_memory =
 			u_suballocator_create(&sctx->b, sscreen->info.gart_page_size,
-					      0, PIPE_USAGE_DEFAULT, 0, true);
+					      0, PIPE_USAGE_DEFAULT,
+					      SI_RESOURCE_FLAG_SO_FILLED_SIZE, true);
 	if (!sctx->allocator_zeroed_memory)
 		goto fail;
 
@@ -453,7 +458,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
 	si_init_clear_functions(sctx);
 	si_init_blit_functions(sctx);
 	si_init_compute_functions(sctx);
-	si_init_cp_dma_functions(sctx);
+	si_init_compute_blit_functions(sctx);
 	si_init_debug_functions(sctx);
 	si_init_msaa_functions(sctx);
 	si_init_streamout_functions(sctx);
@@ -503,6 +508,14 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
 	if (sscreen->debug_flags & DBG(FORCE_DMA))
 		sctx->b.resource_copy_region = sctx->dma_copy;
 
+	bool dst_stream_policy = SI_COMPUTE_DST_CACHE_POLICY != L2_LRU;
+	sctx->cs_clear_buffer = si_create_dma_compute_shader(&sctx->b,
+					     SI_COMPUTE_CLEAR_DW_PER_THREAD,
+					     dst_stream_policy, false);
+	sctx->cs_copy_buffer = si_create_dma_compute_shader(&sctx->b,
+					     SI_COMPUTE_COPY_DW_PER_THREAD,
+					     dst_stream_policy, true);
+
 	sctx->blitter = util_blitter_create(&sctx->b);
 	if (sctx->blitter == NULL)
 		goto fail;
@@ -561,9 +574,10 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
 				 &sctx->null_const_buf);
 
 		/* Clear the NULL constant buffer, because loads should return zeros. */
+		uint32_t clear_value = 0;
 		si_clear_buffer(sctx, sctx->null_const_buf.buffer, 0,
-				sctx->null_const_buf.buffer->width0, 0,
-				SI_COHERENCY_SHADER);
+				sctx->null_const_buf.buffer->width0,
+				&clear_value, 4, SI_COHERENCY_SHADER);
 	}
 
 	uint64_t max_threads_per_block;
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 7e15412ef87..7ae17435ab6 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -52,6 +52,11 @@
 /* Alignment for optimal CP DMA performance. */
 #define SI_CPDMA_ALIGNMENT		32
 
+/* Tunables for compute-based clear_buffer and copy_buffer: */
+#define SI_COMPUTE_CLEAR_DW_PER_THREAD	4
+#define SI_COMPUTE_COPY_DW_PER_THREAD	4
+#define SI_COMPUTE_DST_CACHE_POLICY	L2_STREAM
+
 /* Pipeline & streamout query controls. */
 #define SI_CONTEXT_START_PIPELINE_STATS	(1 << 0)
 #define SI_CONTEXT_STOP_PIPELINE_STATS	(1 << 1)
@@ -102,6 +107,7 @@
 #define SI_RESOURCE_FLAG_UNMAPPABLE	(PIPE_RESOURCE_FLAG_DRV_PRIV << 4)
 #define SI_RESOURCE_FLAG_READ_ONLY	(PIPE_RESOURCE_FLAG_DRV_PRIV << 5)
 #define SI_RESOURCE_FLAG_32BIT		(PIPE_RESOURCE_FLAG_DRV_PRIV << 6)
+#define SI_RESOURCE_FLAG_SO_FILLED_SIZE	(PIPE_RESOURCE_FLAG_DRV_PRIV << 7)
 
 /* Debug flags. */
 enum {
@@ -172,6 +178,19 @@ enum {
 #define DBG_ALL_SHADERS		(((1 << (DBG_CS + 1)) - 1))
 #define DBG(name)		(1ull << DBG_##name)
 
+enum si_cache_policy {
+	L2_BYPASS, /* users add global L2 invalidate/writeback flushes for this policy */
+	L2_STREAM, /* same as SLC=1 */
+	L2_LRU,    /* same as SLC=0 */
+};
+
+enum si_coherency {
+	SI_COHERENCY_NONE, /* no cache flushes needed */
+	SI_COHERENCY_SHADER, /* si_get_flush_flags: invalidate SMEM/VMEM L1, plus L2 if bypassed */
+	SI_COHERENCY_CB_META, /* si_get_flush_flags: flush and invalidate CB */
+	SI_COHERENCY_CP, /* si_get_flush_flags: no flush flags */
+};
+
 struct si_compute;
 struct hash_table;
 struct u_suballocator;
@@ -773,6 +792,8 @@ struct si_context {
 	void				*vs_blit_color;
 	void				*vs_blit_color_layered;
 	void				*vs_blit_texcoord;
+	void				*cs_clear_buffer;
+	void				*cs_copy_buffer;
 	struct si_screen		*screen;
 	struct pipe_debug_callback	debug;
 	struct ac_llvm_compiler		compiler; /* only non-threaded compilation */
@@ -1110,6 +1131,17 @@ void vi_dcc_clear_level(struct si_context *sctx,
 			unsigned level, unsigned clear_value);
 void si_init_clear_functions(struct si_context *sctx);
 
+/* si_compute_blit.c */
+unsigned si_get_flush_flags(struct si_context *sctx, enum si_coherency coher,
+			    enum si_cache_policy cache_policy);
+void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
+		     uint64_t offset, uint64_t size, uint32_t *clear_value,
+		     uint32_t clear_value_size, enum si_coherency coher);
+void si_copy_buffer(struct si_context *sctx,
+		    struct pipe_resource *dst, struct pipe_resource *src,
+		    uint64_t dst_offset, uint64_t src_offset, unsigned size);
+void si_init_compute_blit_functions(struct si_context *sctx);
+
 /* si_cp_dma.c */
 #define SI_CPDMA_SKIP_CHECK_CS_SPACE	(1 << 0) /* don't call need_cs_space */
 #define SI_CPDMA_SKIP_SYNC_AFTER	(1 << 1) /* don't wait for DMA after the copy */
@@ -1122,39 +1154,20 @@ void si_init_clear_functions(struct si_context *sctx);
 			   SI_CPDMA_SKIP_GFX_SYNC | \
 			   SI_CPDMA_SKIP_BO_LIST_UPDATE)
 
-enum si_cache_policy {
-	L2_BYPASS,
-	L2_STREAM, /* same as SLC=1 */
-	L2_LRU,    /* same as SLC=0 */
-};
-
-enum si_coherency {
-	SI_COHERENCY_NONE, /* no cache flushes needed */
-	SI_COHERENCY_SHADER,
-	SI_COHERENCY_CB_META,
-};
-
 void si_cp_dma_wait_for_idle(struct si_context *sctx);
 void si_cp_dma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
 			    uint64_t offset, uint64_t size, unsigned value,
 			    enum si_coherency coher,
 			    enum si_cache_policy cache_policy);
-void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
-		     uint64_t offset, uint64_t size, unsigned value,
-		     enum si_coherency coher);
 void si_cp_dma_copy_buffer(struct si_context *sctx,
 			   struct pipe_resource *dst, struct pipe_resource *src,
 			   uint64_t dst_offset, uint64_t src_offset, unsigned size,
 			   unsigned user_flags, enum si_coherency coher,
 			   enum si_cache_policy cache_policy);
-void si_copy_buffer(struct si_context *sctx,
-		    struct pipe_resource *dst, struct pipe_resource *src,
-		    uint64_t dst_offset, uint64_t src_offset, unsigned size);
 void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf,
 			      uint64_t offset, unsigned size);
 void cik_emit_prefetch_L2(struct si_context *sctx, bool vertex_stage_only);
 void si_test_gds(struct si_context *sctx);
-void si_init_cp_dma_functions(struct si_context *sctx);
 
 /* si_debug.c */
 void si_save_cs(struct radeon_winsys *ws, struct radeon_cmdbuf *cs,
diff --git a/src/gallium/drivers/radeonsi/si_test_dma.c b/src/gallium/drivers/radeonsi/si_test_dma.c
index c81ec75dde2..90a2032cd80 100644
--- a/src/gallium/drivers/radeonsi/si_test_dma.c
+++ b/src/gallium/drivers/radeonsi/si_test_dma.c
@@ -307,7 +307,8 @@ void si_test_dma(struct si_screen *sscreen)
 		set_random_pixels(ctx, src, &src_cpu);
 
 		/* clear dst pixels */
-		si_clear_buffer(sctx, dst, 0, sdst->surface.surf_size, 0,
+		uint32_t zero = 0;
+		si_clear_buffer(sctx, dst, 0, sdst->surface.surf_size, &zero, 4,
 		                SI_COHERENCY_SHADER);
 		memset(dst_cpu.ptr, 0, dst_cpu.layer_stride * tdst.array_size);