X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fgallium%2Fdrivers%2Fradeonsi%2Fsi_test_dma_perf.c;h=182089932cf7dfba37eb4269cec39b334fd40974;hb=06d7648f116b031882ad7ec90c10a8d9ebc83f27;hp=6c04720e963b472f657704b0730f1b91f1cf73f5;hpb=203ef19f48b6d983dfba383b6a8fcebfe0a02aee;p=mesa.git

diff --git a/src/gallium/drivers/radeonsi/si_test_dma_perf.c b/src/gallium/drivers/radeonsi/si_test_dma_perf.c
index 6c04720e963..182089932cf 100644
--- a/src/gallium/drivers/radeonsi/si_test_dma_perf.c
+++ b/src/gallium/drivers/radeonsi/si_test_dma_perf.c
@@ -28,448 +28,450 @@
 #include "si_pipe.h"
 #include "si_query.h"
 
-#define MIN_SIZE	512
-#define MAX_SIZE	(128 * 1024 * 1024)
-#define SIZE_SHIFT	1
-#define NUM_RUNS	128
+#define MIN_SIZE   512
+#define MAX_SIZE   (128 * 1024 * 1024)
+#define SIZE_SHIFT 1
+#define NUM_RUNS   128
 
 static double get_MBps_rate(unsigned num_bytes, unsigned ns)
 {
-	return (num_bytes / (1024.0 * 1024.0)) / (ns / 1000000000.0);
+   return (num_bytes / (1024.0 * 1024.0)) / (ns / 1000000000.0);
 }
 
 void si_test_dma_perf(struct si_screen *sscreen)
 {
-	struct pipe_screen *screen = &sscreen->b;
-	struct pipe_context *ctx = screen->context_create(screen, NULL, 0);
-	struct si_context *sctx = (struct si_context*)ctx;
-	const uint32_t clear_value = 0x12345678;
-	static const unsigned cs_dwords_per_thread_list[] = {64, 32, 16, 8, 4, 2, 1};
-	static const unsigned cs_waves_per_sh_list[] = {1, 2, 4, 8, 16, 0};
+   struct pipe_screen *screen = &sscreen->b;
+   struct pipe_context *ctx = screen->context_create(screen, NULL, 0);
+   struct si_context *sctx = (struct si_context *)ctx;
+   const uint32_t clear_value = 0x12345678;
+   static const unsigned cs_dwords_per_thread_list[] = {64, 32, 16, 8, 4, 2, 1};
+   static const unsigned cs_waves_per_sh_list[] = {0, 2, 4, 8, 16};
 
 #define NUM_SHADERS ARRAY_SIZE(cs_dwords_per_thread_list)
-#define NUM_METHODS (4 + 2*NUM_SHADERS * ARRAY_SIZE(cs_waves_per_sh_list))
-
-	static const char *method_str[] = {
-		"CP MC   ",
-		"CP L2   ",
-		"CP L2   ",
-		"SDMA    ",
-	};
-	static const char *placement_str[] = {
-		/* Clear */
-		"fill->VRAM",
-		"fill->GTT ",
-		/* Copy */
-		"VRAM->VRAM",
-		"VRAM->GTT ",
-		"GTT ->VRAM",
-	};
-
-	printf("DMA rate is in MB/s for each size. Slow cases are skipped and print 0.\n");
-	printf("Heap       ,Method  ,L2p,Wa,");
-	for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
-		if (size >= 1024)
-			printf("%6uKB,", size / 1024);
-		else
-			printf(" %6uB,", size);
-	}
-	printf("\n");
-
-	/* results[log2(size)][placement][method][] */
-	struct si_result {
-		bool is_valid;
-		bool is_cp;
-		bool is_sdma;
-		bool is_cs;
-		unsigned cache_policy;
-		unsigned dwords_per_thread;
-		unsigned waves_per_sh;
-		unsigned score;
-		unsigned index; /* index in results[x][y][index] */
-	} results[32][ARRAY_SIZE(placement_str)][NUM_METHODS] = {};
-
-	/* Run benchmarks. */
-	for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) {
-		bool is_copy = placement >= 2;
-
-		printf("-----------,--------,---,--,");
-		for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT)
-			printf("--------,");
-		printf("\n");
-
-		for (unsigned method = 0; method < NUM_METHODS; method++) {
-			bool test_cp = method <= 2;
-			bool test_sdma = method == 3;
-			bool test_cs = method >= 4;
-			unsigned cs_method = method - 4;
-			STATIC_ASSERT(L2_STREAM + 1 == L2_LRU);
-			unsigned cs_waves_per_sh =
-				test_cs ? cs_waves_per_sh_list[cs_method / (2*NUM_SHADERS)] : 0;
-			cs_method %= 2*NUM_SHADERS;
-			unsigned cache_policy = test_cp ? method % 3 :
-						test_cs ? L2_STREAM + (cs_method / NUM_SHADERS) : 0;
-			unsigned cs_dwords_per_thread =
-				test_cs ? cs_dwords_per_thread_list[cs_method % NUM_SHADERS] : 0;
-
-			if (sctx->chip_class == SI) {
-				/* SI doesn't support CP DMA operations through L2. */
-				if (test_cp && cache_policy != L2_BYPASS)
-					continue;
-				/* WAVES_PER_SH is in multiples of 16 on SI. */
-				if (test_cs && cs_waves_per_sh % 16 != 0)
-					continue;
-			}
-
-			printf("%s ,", placement_str[placement]);
-			if (test_cs) {
-				printf("CS x%-4u,%3s,", cs_dwords_per_thread,
-				       cache_policy == L2_LRU ? "LRU" :
-				       cache_policy == L2_STREAM ? "Str" : "");
-			} else {
-				printf("%s,%3s,", method_str[method],
-				       method == L2_LRU ? "LRU" :
-				       method == L2_STREAM ? "Str" : "");
-			}
-			if (test_cs && cs_waves_per_sh)
-				printf("%2u,", cs_waves_per_sh);
-			else
-				printf("  ,");
-
-			double score = 0;
-			for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
-				/* Don't test bigger sizes if it's too slow. Print 0. */
-				if (size >= 512*1024 &&
-				    score < 400 * (size / (4*1024*1024))) {
-					printf("%7.0f ,", 0.0);
-					continue;
-				}
-
-				enum pipe_resource_usage dst_usage, src_usage;
-				struct pipe_resource *dst, *src;
-				struct pipe_query *q[NUM_RUNS];
-				unsigned query_type = PIPE_QUERY_TIME_ELAPSED;
-
-				if (test_sdma) {
-					if (sctx->chip_class == SI)
-						query_type = SI_QUERY_TIME_ELAPSED_SDMA_SI;
-					else
-						query_type = SI_QUERY_TIME_ELAPSED_SDMA;
-				}
-
-				if (placement == 0 || placement == 2 || placement == 4)
-					dst_usage = PIPE_USAGE_DEFAULT;
-				else
-					dst_usage = PIPE_USAGE_STREAM;
-
-				if (placement == 2 || placement == 3)
-					src_usage = PIPE_USAGE_DEFAULT;
-				else
-					src_usage = PIPE_USAGE_STREAM;
-
-				dst = pipe_buffer_create(screen, 0, dst_usage, size);
-				src = is_copy ? pipe_buffer_create(screen, 0, src_usage, size) : NULL;
-
-				/* Run tests. */
-				for (unsigned iter = 0; iter < NUM_RUNS; iter++) {
-					q[iter] = ctx->create_query(ctx, query_type, 0);
-					ctx->begin_query(ctx, q[iter]);
-
-					if (test_cp) {
-						/* CP DMA */
-						if (is_copy) {
-							si_cp_dma_copy_buffer(sctx, dst, src, 0, 0, size, 0,
-									      SI_COHERENCY_NONE, cache_policy);
-						} else {
-							si_cp_dma_clear_buffer(sctx, dst, 0, size, clear_value,
-									       SI_COHERENCY_NONE, cache_policy);
-						}
-					} else if (test_sdma) {
-						/* SDMA */
-						if (is_copy) {
-							struct pipe_box box;
-							u_box_1d(0, size, &box);
-							sctx->dma_copy(ctx, dst, 0, 0, 0, 0, src, 0, &box);
-						} else {
-							si_sdma_clear_buffer(sctx, dst, 0, size, clear_value);
-						}
-					} else {
-						/* Compute */
-						/* The memory accesses are coalesced, meaning that the 1st instruction writes
-						 * the 1st contiguous block of data for the whole wave, the 2nd instruction
-						 * writes the 2nd contiguous block of data, etc.
-						 */
-						unsigned instructions_per_thread = MAX2(1, cs_dwords_per_thread / 4);
-						unsigned dwords_per_instruction = cs_dwords_per_thread / instructions_per_thread;
-						unsigned dwords_per_wave = cs_dwords_per_thread * 64;
-
-						unsigned num_dwords = size / 4;
-						unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);
-
-						void *cs = si_create_dma_compute_shader(ctx, cs_dwords_per_thread,
-											cache_policy == L2_STREAM, is_copy);
-
-						struct pipe_grid_info info = {};
-						info.block[0] = MIN2(64, num_instructions);
-						info.block[1] = 1;
-						info.block[2] = 1;
-						info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
-						info.grid[1] = 1;
-						info.grid[2] = 1;
-
-						struct pipe_shader_buffer sb[2] = {};
-						sb[0].buffer = dst;
-						sb[0].buffer_size = size;
-
-						if (is_copy) {
-							sb[1].buffer = src;
-							sb[1].buffer_size = size;
-						} else {
-							for (unsigned i = 0; i < 4; i++)
-								sctx->cs_user_data[i] = clear_value;
-						}
-
-						sctx->flags |= SI_CONTEXT_INV_VMEM_L1 |
-							       SI_CONTEXT_INV_SMEM_L1;
-
-						ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, is_copy ? 2 : 1, sb);
-						ctx->bind_compute_state(ctx, cs);
-						sctx->cs_max_waves_per_sh = cs_waves_per_sh;
-
-						ctx->launch_grid(ctx, &info);
-
-						ctx->bind_compute_state(ctx, NULL);
-						ctx->delete_compute_state(ctx, cs);
-						sctx->cs_max_waves_per_sh = 0; /* disable the limit */
-
-						sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
-					}
-
-					/* Flush L2, so that we don't just test L2 cache performance. */
-					if (!test_sdma) {
-						sctx->flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
-						si_emit_cache_flush(sctx);
-					}
-
-					ctx->end_query(ctx, q[iter]);
-					ctx->flush(ctx, NULL, PIPE_FLUSH_ASYNC);
-				}
-				pipe_resource_reference(&dst, NULL);
-				pipe_resource_reference(&src, NULL);
-
-				/* Get results. */
-				uint64_t min = ~0ull, max = 0, total = 0;
-
-				for (unsigned iter = 0; iter < NUM_RUNS; iter++) {
-					union pipe_query_result result;
-
-					ctx->get_query_result(ctx, q[iter], true, &result);
-					ctx->destroy_query(ctx, q[iter]);
-
-					min = MIN2(min, result.u64);
-					max = MAX2(max, result.u64);
-					total += result.u64;
-				}
-
-				score = get_MBps_rate(size, total / (double)NUM_RUNS);
-				printf("%7.0f ,", score);
-				fflush(stdout);
-
-				struct si_result *r = &results[util_logbase2(size)][placement][method];
-				r->is_valid = true;
-				r->is_cp = test_cp;
-				r->is_sdma = test_sdma;
-				r->is_cs = test_cs;
-				r->cache_policy = cache_policy;
-				r->dwords_per_thread = cs_dwords_per_thread;
-				r->waves_per_sh = cs_waves_per_sh;
-				r->score = score;
-				r->index = method;
-			}
-			puts("");
-		}
-	}
-
-	puts("");
-	puts("static struct si_method");
-	printf("get_best_clear_for_%s(enum radeon_bo_domain dst, uint64_t size64, bool async, bool cached)\n",
-	       sctx->screen->info.name);
-	puts("{");
-	puts("   unsigned size = MIN2(size64, UINT_MAX);\n");
-
-	/* Analyze results and find the best methods. */
-	for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) {
-		if (placement == 0)
-			puts("   if (dst == RADEON_DOMAIN_VRAM) {");
-		else if (placement == 1)
-			puts("   } else { /* GTT */");
-		else if (placement == 2) {
-			puts("}");
-			puts("");
-			puts("static struct si_method");
-			printf("get_best_copy_for_%s(enum radeon_bo_domain dst, enum radeon_bo_domain src,\n",
-			       sctx->screen->info.name);
-			printf("                     uint64_t size64, bool async, bool cached)\n");
-			puts("{");
-			puts("   unsigned size = MIN2(size64, UINT_MAX);\n");
-			puts("   if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_VRAM) {");
-		} else if (placement == 3)
-			puts("   } else if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_GTT) {");
-		else
-			puts("   } else { /* GTT -> VRAM */");
-
-		for (unsigned mode = 0; mode < 3; mode++) {
-			bool async = mode == 0;
-			bool cached = mode == 1;
-
-			if (async)
-				puts("      if (async) { /* SDMA or async compute */");
-			else if (cached)
-				puts("      if (cached) { /* gfx ring */");
-			else
-				puts("      } else { /* gfx ring - uncached */");
-
-			/* The list of best chosen methods. */
-			struct si_result *methods[32];
-			unsigned method_max_size[32];
-			unsigned num_methods = 0;
-
-			for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
-				/* Find the best method. */
-				struct si_result *best = NULL;
-
-				for (unsigned i = 0; i < NUM_METHODS; i++) {
-					struct si_result *r = &results[util_logbase2(size)][placement][i];
-
-					if (!r->is_valid)
-						continue;
-
-					/* Ban CP DMA clears via MC on <= VI. They are super slow
-					 * on GTT, which we can get due to BO evictions.
-					 */
-					if (sctx->chip_class <= VI && placement == 1 &&
-					    r->is_cp && r->cache_policy == L2_BYPASS)
-						continue;
-
-					if (async) {
-						/* The following constraints for compute IBs try to limit
-						 * resource usage so as not to decrease the performance
-						 * of gfx IBs too much.
-						 */
-
-						/* Don't use CP DMA on asynchronous rings, because
-						 * the engine is shared with gfx IBs.
-						 */
-						if (r->is_cp)
-							continue;
-
-						/* Don't use L2 caching on asynchronous rings to minimize
-						 * L2 usage.
-						 */
-						if (r->cache_policy == L2_LRU)
-							continue;
-
-						/* Asynchronous compute recommends waves_per_sh != 0
-						 * to limit CU usage. */
-						if (r->is_cs && r->waves_per_sh == 0)
-							continue;
-					} else {
-						/* SDMA is always asynchronous */
-						if (r->is_sdma)
-							continue;
-
-						if (cached && r->cache_policy == L2_BYPASS)
-							continue;
-						if (!cached && r->cache_policy == L2_LRU)
-							continue;
-					}
-
-					if (!best) {
-						best = r;
-						continue;
-					}
-
-					/* Assume some measurement error. Earlier methods occupy fewer
-					 * resources, so the next method is always more greedy, and we
-					 * don't want to select it due to a measurement error.
-					 */
-					double min_improvement = 1.03;
-
-					if (best->score * min_improvement < r->score)
-						best = r;
-				}
-
-				if (num_methods > 0) {
-					unsigned prev_index = num_methods - 1;
-					struct si_result *prev = methods[prev_index];
-					struct si_result *prev_this_size = &results[util_logbase2(size)][placement][prev->index];
-
-					/* If the best one is also the best for the previous size,
-					 * just bump the size for the previous one.
-					 *
-					 * If there is no best, it means all methods were too slow
-					 * for this size and were not tested. Use the best one for
-					 * the previous size.
-					 */
-					if (!best ||
-					    /* If it's the same method as for the previous size: */
-					    (prev->is_cp == best->is_cp &&
-					     prev->is_sdma == best->is_sdma &&
-					     prev->is_cs == best->is_cs &&
-					     prev->cache_policy == best->cache_policy &&
-					     prev->dwords_per_thread == best->dwords_per_thread &&
-					     prev->waves_per_sh == best->waves_per_sh) ||
-					    /* If the method for the previous size is also the best
-					     * for this size: */
-					    (prev_this_size->is_valid &&
-					     prev_this_size->score * 1.03 > best->score)) {
-						method_max_size[prev_index] = size;
-						continue;
-					}
-				}
-
-				/* Add it to the list. */
-				assert(num_methods < ARRAY_SIZE(methods));
-				methods[num_methods] = best;
-				method_max_size[num_methods] = size;
-				num_methods++;
-			}
-
-			for (unsigned i = 0; i < num_methods; i++) {
-				struct si_result *best = methods[i];
-				unsigned size = method_max_size[i];
-
-				/* The size threshold is between the current benchmarked
-				 * size and the next benchmarked size. */
-				if (i < num_methods - 1)
-					printf("         if (size <= %9u) ", (size + (size << SIZE_SHIFT)) / 2);
-				else if (i > 0)
-					printf("         else                   ");
-				else
-					printf("         ");
-				printf("return ");
-
-				assert(best);
-				if (best->is_cp) {
-					printf("CP_DMA(%s);\n",
-					       best->cache_policy == L2_BYPASS ? "L2_BYPASS" :
-					       best->cache_policy == L2_LRU ?    "L2_LRU   " : "L2_STREAM");
-				}
-				if (best->is_sdma)
-					printf("SDMA;\n");
-				if (best->is_cs) {
-					printf("COMPUTE(%s, %u, %u);\n",
-					       best->cache_policy == L2_LRU ? "L2_LRU   " : "L2_STREAM",
-					       best->dwords_per_thread,
-					       best->waves_per_sh);
-				}
-			}
-		}
-		puts("      }");
-	}
-	puts("   }");
-	puts("}");
-
-	ctx->destroy(ctx);
-	exit(0);
+#define NUM_METHODS (4 + 3 * NUM_SHADERS * ARRAY_SIZE(cs_waves_per_sh_list))
+
+   static const char *method_str[] = {
+      "CP MC   ",
+      "CP L2   ",
+      "CP L2   ",
+      "SDMA    ",
+   };
+   static const char *placement_str[] = {
+      /* Clear */
+      "fill->VRAM",
+      "fill->GTT ",
+      /* Copy */
+      "VRAM->VRAM",
+      "VRAM->GTT ",
+      "GTT ->VRAM",
+   };
+
+   printf("DMA rate is in MB/s for each size. Slow cases are skipped and print 0.\n");
+   printf("Heap       ,Method  ,L2p,Wa,");
+   for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
+      if (size >= 1024)
+         printf("%6uKB,", size / 1024);
+      else
+         printf(" %6uB,", size);
+   }
+   printf("\n");
+
+   /* results[log2(size)][placement][method] */
+   struct si_result {
+      bool is_valid;
+      bool is_cp;
+      bool is_sdma;
+      bool is_cs;
+      unsigned cache_policy;
+      unsigned dwords_per_thread;
+      unsigned waves_per_sh;
+      unsigned score;
+      unsigned index; /* index in results[x][y][index] */
+   } results[32][ARRAY_SIZE(placement_str)][NUM_METHODS] = {};
+
+   /* Run benchmarks. */
+   for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) {
+      bool is_copy = placement >= 2;
+
+      printf("-----------,--------,---,--,");
+      for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT)
+         printf("--------,");
+      printf("\n");
+
+      for (unsigned method = 0; method < NUM_METHODS; method++) {
+         bool test_cp = method <= 2;
+         bool test_sdma = method == 3;
+         bool test_cs = method >= 4;
+         unsigned cs_method = method - 4;
+         unsigned cs_waves_per_sh =
+            test_cs ? cs_waves_per_sh_list[cs_method / (3 * NUM_SHADERS)] : 0;
+         cs_method %= 3 * NUM_SHADERS;
+         unsigned cache_policy =
+            test_cp ? method % 3 : test_cs ? (cs_method / NUM_SHADERS) : 0;
+         unsigned cs_dwords_per_thread =
+            test_cs ? cs_dwords_per_thread_list[cs_method % NUM_SHADERS] : 0;
+
+         if (test_sdma && !sctx->sdma_cs)
+            continue;
+
+         if (sctx->chip_class == GFX6) {
+            /* GFX6 doesn't support CP DMA operations through L2. */
+            if (test_cp && cache_policy != L2_BYPASS)
+               continue;
+            /* WAVES_PER_SH is in multiples of 16 on GFX6. */
+            if (test_cs && cs_waves_per_sh % 16 != 0)
+               continue;
+         }
+
+         /* SI_RESOURCE_FLAG_UNCACHED setting RADEON_FLAG_UNCACHED doesn't affect
+          * chips before gfx9.
+          */
+         if (test_cs && cache_policy && sctx->chip_class < GFX9)
+            continue;
+
+         printf("%s ,", placement_str[placement]);
+         if (test_cs) {
+            printf("CS x%-4u,%3s,", cs_dwords_per_thread,
+                   cache_policy == L2_LRU ? "LRU" : cache_policy == L2_STREAM ? "Str" : "");
+         } else {
+            printf("%s,%3s,", method_str[method],
+                   method == L2_LRU ? "LRU" : method == L2_STREAM ? "Str" : "");
+         }
+         if (test_cs && cs_waves_per_sh)
+            printf("%2u,", cs_waves_per_sh);
+         else
+            printf("  ,");
+
+         double score = 0;
+         for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
+            /* Don't test bigger sizes if it's too slow. Print 0. */
+            if (size >= 512 * 1024 && score < 400 * (size / (4 * 1024 * 1024))) {
+               printf("%7.0f ,", 0.0);
+               continue;
+            }
+
+            enum pipe_resource_usage dst_usage, src_usage;
+            struct pipe_resource *dst, *src;
+            struct pipe_query *q[NUM_RUNS];
+            unsigned query_type = PIPE_QUERY_TIME_ELAPSED;
+            unsigned flags = cache_policy == L2_BYPASS ? SI_RESOURCE_FLAG_UNCACHED : 0;
+
+            if (test_sdma) {
+               if (sctx->chip_class == GFX6)
+                  query_type = SI_QUERY_TIME_ELAPSED_SDMA_SI;
+               else
+                  query_type = SI_QUERY_TIME_ELAPSED_SDMA;
+            }
+
+            if (placement == 0 || placement == 2 || placement == 4)
+               dst_usage = PIPE_USAGE_DEFAULT;
+            else
+               dst_usage = PIPE_USAGE_STREAM;
+
+            if (placement == 2 || placement == 3)
+               src_usage = PIPE_USAGE_DEFAULT;
+            else
+               src_usage = PIPE_USAGE_STREAM;
+
+            dst = pipe_aligned_buffer_create(screen, flags, dst_usage, size, 256);
+            src = is_copy ? pipe_aligned_buffer_create(screen, flags, src_usage, size, 256) : NULL;
+
+            /* Run tests. */
+            for (unsigned iter = 0; iter < NUM_RUNS; iter++) {
+               q[iter] = ctx->create_query(ctx, query_type, 0);
+               ctx->begin_query(ctx, q[iter]);
+
+               if (test_cp) {
+                  /* CP DMA */
+                  if (is_copy) {
+                     si_cp_dma_copy_buffer(sctx, dst, src, 0, 0, size, 0, SI_COHERENCY_NONE,
+                                           cache_policy);
+                  } else {
+                     si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, 0, size, clear_value, 0,
+                                            SI_COHERENCY_NONE, cache_policy);
+                  }
+               } else if (test_sdma) {
+                  /* SDMA */
+                  if (is_copy) {
+                     si_sdma_copy_buffer(sctx, dst, src, 0, 0, size);
+                  } else {
+                     si_sdma_clear_buffer(sctx, dst, 0, size, clear_value);
+                  }
+               } else {
+                  /* Compute */
+                  /* The memory accesses are coalesced, meaning that the 1st instruction writes
+                   * the 1st contiguous block of data for the whole wave, the 2nd instruction
+                   * writes the 2nd contiguous block of data, etc.
+                   */
+                  unsigned instructions_per_thread = MAX2(1, cs_dwords_per_thread / 4);
+                  unsigned dwords_per_instruction = cs_dwords_per_thread / instructions_per_thread;
+                  unsigned dwords_per_wave = cs_dwords_per_thread * 64;
+
+                  unsigned num_dwords = size / 4;
+                  unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);
+
+                  void *cs = si_create_dma_compute_shader(ctx, cs_dwords_per_thread,
+                                                          cache_policy == L2_STREAM, is_copy);
+
+                  struct pipe_grid_info info = {};
+                  info.block[0] = MIN2(64, num_instructions);
+                  info.block[1] = 1;
+                  info.block[2] = 1;
+                  info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
+                  info.grid[1] = 1;
+                  info.grid[2] = 1;
+
+                  struct pipe_shader_buffer sb[2] = {};
+                  sb[0].buffer = dst;
+                  sb[0].buffer_size = size;
+
+                  if (is_copy) {
+                     sb[1].buffer = src;
+                     sb[1].buffer_size = size;
+                  } else {
+                     for (unsigned i = 0; i < 4; i++)
+                        sctx->cs_user_data[i] = clear_value;
+                  }
+
+                  sctx->flags |= SI_CONTEXT_INV_VCACHE | SI_CONTEXT_INV_SCACHE;
+
+                  ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, is_copy ? 2 : 1, sb, 0x1);
+                  ctx->bind_compute_state(ctx, cs);
+                  sctx->cs_max_waves_per_sh = cs_waves_per_sh;
+
+                  ctx->launch_grid(ctx, &info);
+
+                  ctx->bind_compute_state(ctx, NULL);
+                  ctx->delete_compute_state(ctx, cs);
+                  sctx->cs_max_waves_per_sh = 0; /* disable the limit */
+
+                  sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
+               }
+
+               /* Flush L2, so that we don't just test L2 cache performance. */
+               if (!test_sdma) {
+                  sctx->flags |= SI_CONTEXT_WB_L2;
+                  sctx->emit_cache_flush(sctx);
+               }
+
+               ctx->end_query(ctx, q[iter]);
+               ctx->flush(ctx, NULL, PIPE_FLUSH_ASYNC);
+            }
+            pipe_resource_reference(&dst, NULL);
+            pipe_resource_reference(&src, NULL);
+
+            /* Get results. */
+            uint64_t min = ~0ull, max = 0, total = 0;
+
+            for (unsigned iter = 0; iter < NUM_RUNS; iter++) {
+               union pipe_query_result result;
+
+               ctx->get_query_result(ctx, q[iter], true, &result);
+               ctx->destroy_query(ctx, q[iter]);
+
+               min = MIN2(min, result.u64);
+               max = MAX2(max, result.u64);
+               total += result.u64;
+            }
+
+            score = get_MBps_rate(size, total / (double)NUM_RUNS);
+            printf("%7.0f ,", score);
+            fflush(stdout);
+
+            struct si_result *r = &results[util_logbase2(size)][placement][method];
+            r->is_valid = true;
+            r->is_cp = test_cp;
+            r->is_sdma = test_sdma;
+            r->is_cs = test_cs;
+            r->cache_policy = cache_policy;
+            r->dwords_per_thread = cs_dwords_per_thread;
+            r->waves_per_sh = cs_waves_per_sh;
+            r->score = score;
+            r->index = method;
+         }
+         puts("");
+      }
+   }
+
+   puts("");
+   puts("static struct si_method");
+   printf("get_best_clear_for_%s(enum radeon_bo_domain dst, uint64_t size64, bool async, bool "
+          "cached)\n",
+          sctx->screen->info.name);
+   puts("{");
+   puts("   unsigned size = MIN2(size64, UINT_MAX);\n");
+
+   /* Analyze results and find the best methods. */
+   for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) {
+      if (placement == 0) /* print the opening line of this placement's branch */
+         puts("   if (dst == RADEON_DOMAIN_VRAM) {");
+      else if (placement == 1)
+         puts("   } else { /* GTT */");
+      else if (placement == 2) {
+         puts("   }\n}"); /* close the GTT branch, then the generated clear function */
+         puts("");
+         puts("static struct si_method");
+         printf("get_best_copy_for_%s(enum radeon_bo_domain dst, enum radeon_bo_domain src,\n",
+                sctx->screen->info.name);
+         printf("                     uint64_t size64, bool async, bool cached)\n");
+         puts("{");
+         puts("   unsigned size = MIN2(size64, UINT_MAX);\n");
+         puts("   if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_VRAM) {");
+      } else if (placement == 3)
+         puts("   } else if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_GTT) {");
+      else
+         puts("   } else { /* GTT -> VRAM */");
+
+      for (unsigned mode = 0; mode < 3; mode++) {
+         bool async = mode == 0;
+         bool cached = mode == 1;
+         /* The three modes are emitted as a single if / else-if / else chain. */
+         if (async)
+            puts("      if (async) { /* SDMA or async compute */");
+         else if (cached)
+            puts("      } else if (cached) { /* gfx ring */");
+         else
+            puts("      } else { /* gfx ring - uncached */");
+
+         /* The list of best chosen methods. */
+         struct si_result *methods[32];
+         unsigned method_max_size[32];
+         unsigned num_methods = 0;
+
+         for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
+            /* Find the best method. */
+            struct si_result *best = NULL;
+
+            for (unsigned i = 0; i < NUM_METHODS; i++) {
+               struct si_result *r = &results[util_logbase2(size)][placement][i];
+
+               if (!r->is_valid)
+                  continue;
+
+               /* Ban CP DMA clears via MC on <= GFX8. They are super slow
+                * on GTT, which we can get due to BO evictions.
+                */
+               if (sctx->chip_class <= GFX8 && placement == 1 && r->is_cp &&
+                   r->cache_policy == L2_BYPASS)
+                  continue;
+
+               if (async) {
+                  /* The following constraints for compute IBs try to limit
+                   * resource usage so as not to decrease the performance
+                   * of gfx IBs too much.
+                   */
+
+                  /* Don't use CP DMA on asynchronous rings, because
+                   * the engine is shared with gfx IBs.
+                   */
+                  if (r->is_cp)
+                     continue;
+
+                  /* Don't use L2 caching on asynchronous rings to minimize
+                   * L2 usage.
+                   */
+                  if (r->cache_policy == L2_LRU)
+                     continue;
+
+                  /* Asynchronous compute recommends waves_per_sh != 0
+                   * to limit CU usage. */
+                  if (r->is_cs && r->waves_per_sh == 0)
+                     continue;
+               } else {
+                  /* SDMA is always asynchronous */
+                  if (r->is_sdma)
+                     continue;
+
+                  if (cached && r->cache_policy == L2_BYPASS)
+                     continue;
+                  if (!cached && r->cache_policy == L2_LRU)
+                     continue;
+               }
+
+               if (!best) {
+                  best = r;
+                  continue;
+               }
+
+               /* Assume some measurement error. Earlier methods occupy fewer
+                * resources, so the next method is always more greedy, and we
+                * don't want to select it due to a measurement error.
+                */
+               double min_improvement = 1.03;
+
+               if (best->score * min_improvement < r->score)
+                  best = r;
+            }
+
+            if (num_methods > 0) {
+               unsigned prev_index = num_methods - 1;
+               struct si_result *prev = methods[prev_index];
+               struct si_result *prev_this_size =
+                  &results[util_logbase2(size)][placement][prev->index];
+
+               /* If the best one is also the best for the previous size,
+                * just bump the size for the previous one.
+                *
+                * If there is no best, it means all methods were too slow
+                * for this size and were not tested. Use the best one for
+                * the previous size.
+                */
+               if (!best ||
+                   /* If it's the same method as for the previous size: */
+                   (prev->is_cp == best->is_cp && prev->is_sdma == best->is_sdma &&
+                    prev->is_cs == best->is_cs && prev->cache_policy == best->cache_policy &&
+                    prev->dwords_per_thread == best->dwords_per_thread &&
+                    prev->waves_per_sh == best->waves_per_sh) ||
+                   /* If the method for the previous size is also the best
+                    * for this size: */
+                   (prev_this_size->is_valid && prev_this_size->score * 1.03 > best->score)) {
+                  method_max_size[prev_index] = size;
+                  continue;
+               }
+            }
+            if (!best)
+               continue; /* nothing measured yet and nothing to extend: never store NULL */
+            assert(num_methods < ARRAY_SIZE(methods));
+            methods[num_methods] = best;
+            method_max_size[num_methods] = size;
+            num_methods++;
+         }
+
+         for (unsigned i = 0; i < num_methods; i++) {
+            struct si_result *best = methods[i];
+            unsigned size = method_max_size[i];
+
+            /* The size threshold is between the current benchmarked
+             * size and the next benchmarked size. */
+            if (i < num_methods - 1)
+               printf("         if (size <= %9u) ", (size + (size << SIZE_SHIFT)) / 2);
+            else if (i > 0)
+               printf("         else                   ");
+            else
+               printf("         ");
+            printf("return ");
+
+            assert(best);
+            const char *cache_policy_str =
+               best->cache_policy == L2_BYPASS ? "L2_BYPASS" :
+               best->cache_policy == L2_LRU ? "L2_LRU   " : "L2_STREAM";
+
+            if (best->is_cp) {
+               printf("CP_DMA(%s);\n", cache_policy_str);
+            }
+            if (best->is_sdma)
+               printf("SDMA;\n");
+            if (best->is_cs) {
+               printf("COMPUTE(%s, %u, %u);\n", cache_policy_str,
+                      best->dwords_per_thread, best->waves_per_sh);
+            }
+         }
+      }
+      puts("      }");
+   }
+   puts("   }");
+   puts("}");
+
+   ctx->destroy(ctx);
+   exit(0);
 }