radeonsi: move nir_shader_compiler_options into si_screen
[mesa.git] / src / gallium / drivers / radeonsi / si_test_dma_perf.c
index 6c04720e963b472f657704b0730f1b91f1cf73f5..182089932cf7dfba37eb4269cec39b334fd40974 100644 (file)
 #include "si_pipe.h"
 #include "si_query.h"
 
-#define MIN_SIZE       512
-#define MAX_SIZE       (128 * 1024 * 1024)
-#define SIZE_SHIFT     1
-#define NUM_RUNS       128
 /* Benchmark sweep parameters: buffer sizes in bytes run from MIN_SIZE up to
  * MAX_SIZE, shifted left by SIZE_SHIFT after each step (i.e. doubling), and
  * every tested size is timed NUM_RUNS times.
  */
+#define MIN_SIZE   512
+#define MAX_SIZE   (128 * 1024 * 1024)
+#define SIZE_SHIFT 1
+#define NUM_RUNS   128
 
 /* Convert a transfer of num_bytes that took ns nanoseconds into a rate in
  * MB/s, where one MB is 1024 * 1024 bytes.
  *
  * NOTE(review): ns is an unsigned integer, so a fractional average passed by
  * a caller is truncated, and ns == 0 makes the divisor 0.0 - confirm callers
  * never pass 0.
  */
 static double get_MBps_rate(unsigned num_bytes, unsigned ns)
 {
-       return (num_bytes / (1024.0 * 1024.0)) / (ns / 1000000000.0);
+   return (num_bytes / (1024.0 * 1024.0)) / (ns / 1000000000.0);
 }
 
 void si_test_dma_perf(struct si_screen *sscreen)
 {
-       struct pipe_screen *screen = &sscreen->b;
-       struct pipe_context *ctx = screen->context_create(screen, NULL, 0);
-       struct si_context *sctx = (struct si_context*)ctx;
-       const uint32_t clear_value = 0x12345678;
-       static const unsigned cs_dwords_per_thread_list[] = {64, 32, 16, 8, 4, 2, 1};
-       static const unsigned cs_waves_per_sh_list[] = {1, 2, 4, 8, 16, 0};
+   struct pipe_screen *screen = &sscreen->b;
+   struct pipe_context *ctx = screen->context_create(screen, NULL, 0);
+   struct si_context *sctx = (struct si_context *)ctx;
+   const uint32_t clear_value = 0x12345678;
+   static const unsigned cs_dwords_per_thread_list[] = {64, 32, 16, 8, 4, 2, 1};
+   static const unsigned cs_waves_per_sh_list[] = {0, 2, 4, 8, 16};
 
 #define NUM_SHADERS ARRAY_SIZE(cs_dwords_per_thread_list)
-#define NUM_METHODS (4 + 2*NUM_SHADERS * ARRAY_SIZE(cs_waves_per_sh_list))
-
-       static const char *method_str[] = {
-               "CP MC   ",
-               "CP L2   ",
-               "CP L2   ",
-               "SDMA    ",
-       };
-       static const char *placement_str[] = {
-               /* Clear */
-               "fill->VRAM",
-               "fill->GTT ",
-               /* Copy */
-               "VRAM->VRAM",
-               "VRAM->GTT ",
-               "GTT ->VRAM",
-       };
-
-       printf("DMA rate is in MB/s for each size. Slow cases are skipped and print 0.\n");
-       printf("Heap       ,Method  ,L2p,Wa,");
-       for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
-               if (size >= 1024)
-                       printf("%6uKB,", size / 1024);
-               else
-                       printf(" %6uB,", size);
-       }
-       printf("\n");
-
-       /* results[log2(size)][placement][method][] */
-       struct si_result {
-               bool is_valid;
-               bool is_cp;
-               bool is_sdma;
-               bool is_cs;
-               unsigned cache_policy;
-               unsigned dwords_per_thread;
-               unsigned waves_per_sh;
-               unsigned score;
-               unsigned index; /* index in results[x][y][index] */
-       } results[32][ARRAY_SIZE(placement_str)][NUM_METHODS] = {};
-
-       /* Run benchmarks. */
-       for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) {
-               bool is_copy = placement >= 2;
-
-               printf("-----------,--------,---,--,");
-               for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT)
-                       printf("--------,");
-               printf("\n");
-
-               for (unsigned method = 0; method < NUM_METHODS; method++) {
-                       bool test_cp = method <= 2;
-                       bool test_sdma = method == 3;
-                       bool test_cs = method >= 4;
-                       unsigned cs_method = method - 4;
-                       STATIC_ASSERT(L2_STREAM + 1 == L2_LRU);
-                       unsigned cs_waves_per_sh =
-                               test_cs ? cs_waves_per_sh_list[cs_method / (2*NUM_SHADERS)] : 0;
-                       cs_method %= 2*NUM_SHADERS;
-                       unsigned cache_policy = test_cp ? method % 3 :
-                                               test_cs ? L2_STREAM + (cs_method / NUM_SHADERS) : 0;
-                       unsigned cs_dwords_per_thread =
-                               test_cs ? cs_dwords_per_thread_list[cs_method % NUM_SHADERS] : 0;
-
-                       if (sctx->chip_class == SI) {
-                               /* SI doesn't support CP DMA operations through L2. */
-                               if (test_cp && cache_policy != L2_BYPASS)
-                                       continue;
-                               /* WAVES_PER_SH is in multiples of 16 on SI. */
-                               if (test_cs && cs_waves_per_sh % 16 != 0)
-                                       continue;
-                       }
-
-                       printf("%s ,", placement_str[placement]);
-                       if (test_cs) {
-                               printf("CS x%-4u,%3s,", cs_dwords_per_thread,
-                                      cache_policy == L2_LRU ? "LRU" :
-                                      cache_policy == L2_STREAM ? "Str" : "");
-                       } else {
-                               printf("%s,%3s,", method_str[method],
-                                      method == L2_LRU ? "LRU" :
-                                      method == L2_STREAM ? "Str" : "");
-                       }
-                       if (test_cs && cs_waves_per_sh)
-                               printf("%2u,", cs_waves_per_sh);
-                       else
-                               printf("  ,");
-
-                       double score = 0;
-                       for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
-                               /* Don't test bigger sizes if it's too slow. Print 0. */
-                               if (size >= 512*1024 &&
-                                   score < 400 * (size / (4*1024*1024))) {
-                                       printf("%7.0f ,", 0.0);
-                                       continue;
-                               }
-
-                               enum pipe_resource_usage dst_usage, src_usage;
-                               struct pipe_resource *dst, *src;
-                               struct pipe_query *q[NUM_RUNS];
-                               unsigned query_type = PIPE_QUERY_TIME_ELAPSED;
-
-                               if (test_sdma) {
-                                       if (sctx->chip_class == SI)
-                                               query_type = SI_QUERY_TIME_ELAPSED_SDMA_SI;
-                                       else
-                                               query_type = SI_QUERY_TIME_ELAPSED_SDMA;
-                               }
-
-                               if (placement == 0 || placement == 2 || placement == 4)
-                                       dst_usage = PIPE_USAGE_DEFAULT;
-                               else
-                                       dst_usage = PIPE_USAGE_STREAM;
-
-                               if (placement == 2 || placement == 3)
-                                       src_usage = PIPE_USAGE_DEFAULT;
-                               else
-                                       src_usage = PIPE_USAGE_STREAM;
-
-                               dst = pipe_buffer_create(screen, 0, dst_usage, size);
-                               src = is_copy ? pipe_buffer_create(screen, 0, src_usage, size) : NULL;
-
-                               /* Run tests. */
-                               for (unsigned iter = 0; iter < NUM_RUNS; iter++) {
-                                       q[iter] = ctx->create_query(ctx, query_type, 0);
-                                       ctx->begin_query(ctx, q[iter]);
-
-                                       if (test_cp) {
-                                               /* CP DMA */
-                                               if (is_copy) {
-                                                       si_cp_dma_copy_buffer(sctx, dst, src, 0, 0, size, 0,
-                                                                             SI_COHERENCY_NONE, cache_policy);
-                                               } else {
-                                                       si_cp_dma_clear_buffer(sctx, dst, 0, size, clear_value,
-                                                                              SI_COHERENCY_NONE, cache_policy);
-                                               }
-                                       } else if (test_sdma) {
-                                               /* SDMA */
-                                               if (is_copy) {
-                                                       struct pipe_box box;
-                                                       u_box_1d(0, size, &box);
-                                                       sctx->dma_copy(ctx, dst, 0, 0, 0, 0, src, 0, &box);
-                                               } else {
-                                                       si_sdma_clear_buffer(sctx, dst, 0, size, clear_value);
-                                               }
-                                       } else {
-                                               /* Compute */
-                                               /* The memory accesses are coalesced, meaning that the 1st instruction writes
-                                                * the 1st contiguous block of data for the whole wave, the 2nd instruction
-                                                * writes the 2nd contiguous block of data, etc.
-                                                */
-                                               unsigned instructions_per_thread = MAX2(1, cs_dwords_per_thread / 4);
-                                               unsigned dwords_per_instruction = cs_dwords_per_thread / instructions_per_thread;
-                                               unsigned dwords_per_wave = cs_dwords_per_thread * 64;
-
-                                               unsigned num_dwords = size / 4;
-                                               unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);
-
-                                               void *cs = si_create_dma_compute_shader(ctx, cs_dwords_per_thread,
-                                                                                       cache_policy == L2_STREAM, is_copy);
-
-                                               struct pipe_grid_info info = {};
-                                               info.block[0] = MIN2(64, num_instructions);
-                                               info.block[1] = 1;
-                                               info.block[2] = 1;
-                                               info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
-                                               info.grid[1] = 1;
-                                               info.grid[2] = 1;
-
-                                               struct pipe_shader_buffer sb[2] = {};
-                                               sb[0].buffer = dst;
-                                               sb[0].buffer_size = size;
-
-                                               if (is_copy) {
-                                                       sb[1].buffer = src;
-                                                       sb[1].buffer_size = size;
-                                               } else {
-                                                       for (unsigned i = 0; i < 4; i++)
-                                                               sctx->cs_user_data[i] = clear_value;
-                                               }
-
-                                               sctx->flags |= SI_CONTEXT_INV_VMEM_L1 |
-                                                              SI_CONTEXT_INV_SMEM_L1;
-
-                                               ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, is_copy ? 2 : 1, sb);
-                                               ctx->bind_compute_state(ctx, cs);
-                                               sctx->cs_max_waves_per_sh = cs_waves_per_sh;
-
-                                               ctx->launch_grid(ctx, &info);
-
-                                               ctx->bind_compute_state(ctx, NULL);
-                                               ctx->delete_compute_state(ctx, cs);
-                                               sctx->cs_max_waves_per_sh = 0; /* disable the limit */
-
-                                               sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
-                                       }
-
-                                       /* Flush L2, so that we don't just test L2 cache performance. */
-                                       if (!test_sdma) {
-                                               sctx->flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
-                                               si_emit_cache_flush(sctx);
-                                       }
-
-                                       ctx->end_query(ctx, q[iter]);
-                                       ctx->flush(ctx, NULL, PIPE_FLUSH_ASYNC);
-                               }
-                               pipe_resource_reference(&dst, NULL);
-                               pipe_resource_reference(&src, NULL);
-
-                               /* Get results. */
-                               uint64_t min = ~0ull, max = 0, total = 0;
-
-                               for (unsigned iter = 0; iter < NUM_RUNS; iter++) {
-                                       union pipe_query_result result;
-
-                                       ctx->get_query_result(ctx, q[iter], true, &result);
-                                       ctx->destroy_query(ctx, q[iter]);
-
-                                       min = MIN2(min, result.u64);
-                                       max = MAX2(max, result.u64);
-                                       total += result.u64;
-                               }
-
-                               score = get_MBps_rate(size, total / (double)NUM_RUNS);
-                               printf("%7.0f ,", score);
-                               fflush(stdout);
-
-                               struct si_result *r = &results[util_logbase2(size)][placement][method];
-                               r->is_valid = true;
-                               r->is_cp = test_cp;
-                               r->is_sdma = test_sdma;
-                               r->is_cs = test_cs;
-                               r->cache_policy = cache_policy;
-                               r->dwords_per_thread = cs_dwords_per_thread;
-                               r->waves_per_sh = cs_waves_per_sh;
-                               r->score = score;
-                               r->index = method;
-                       }
-                       puts("");
-               }
-       }
-
-       puts("");
-       puts("static struct si_method");
-       printf("get_best_clear_for_%s(enum radeon_bo_domain dst, uint64_t size64, bool async, bool cached)\n",
-              sctx->screen->info.name);
-       puts("{");
-       puts("   unsigned size = MIN2(size64, UINT_MAX);\n");
-
-       /* Analyze results and find the best methods. */
-       for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) {
-               if (placement == 0)
-                       puts("   if (dst == RADEON_DOMAIN_VRAM) {");
-               else if (placement == 1)
-                       puts("   } else { /* GTT */");
-               else if (placement == 2) {
-                       puts("}");
-                       puts("");
-                       puts("static struct si_method");
-                       printf("get_best_copy_for_%s(enum radeon_bo_domain dst, enum radeon_bo_domain src,\n",
-                              sctx->screen->info.name);
-                       printf("                     uint64_t size64, bool async, bool cached)\n");
-                       puts("{");
-                       puts("   unsigned size = MIN2(size64, UINT_MAX);\n");
-                       puts("   if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_VRAM) {");
-               } else if (placement == 3)
-                       puts("   } else if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_GTT) {");
-               else
-                       puts("   } else { /* GTT -> VRAM */");
-
-               for (unsigned mode = 0; mode < 3; mode++) {
-                       bool async = mode == 0;
-                       bool cached = mode == 1;
-
-                       if (async)
-                               puts("      if (async) { /* SDMA or async compute */");
-                       else if (cached)
-                               puts("      if (cached) { /* gfx ring */");
-                       else
-                               puts("      } else { /* gfx ring - uncached */");
-
-                       /* The list of best chosen methods. */
-                       struct si_result *methods[32];
-                       unsigned method_max_size[32];
-                       unsigned num_methods = 0;
-
-                       for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
-                               /* Find the best method. */
-                               struct si_result *best = NULL;
-
-                               for (unsigned i = 0; i < NUM_METHODS; i++) {
-                                       struct si_result *r = &results[util_logbase2(size)][placement][i];
-
-                                       if (!r->is_valid)
-                                               continue;
-
-                                       /* Ban CP DMA clears via MC on <= VI. They are super slow
-                                        * on GTT, which we can get due to BO evictions.
-                                        */
-                                       if (sctx->chip_class <= VI && placement == 1 &&
-                                           r->is_cp && r->cache_policy == L2_BYPASS)
-                                               continue;
-
-                                       if (async) {
-                                               /* The following constraints for compute IBs try to limit
-                                                * resource usage so as not to decrease the performance
-                                                * of gfx IBs too much.
-                                                */
-
-                                               /* Don't use CP DMA on asynchronous rings, because
-                                                * the engine is shared with gfx IBs.
-                                                */
-                                               if (r->is_cp)
-                                                       continue;
-
-                                               /* Don't use L2 caching on asynchronous rings to minimize
-                                                * L2 usage.
-                                                */
-                                               if (r->cache_policy == L2_LRU)
-                                                       continue;
-
-                                               /* Asynchronous compute recommends waves_per_sh != 0
-                                                * to limit CU usage. */
-                                               if (r->is_cs && r->waves_per_sh == 0)
-                                                       continue;
-                                       } else {
-                                               /* SDMA is always asynchronous */
-                                               if (r->is_sdma)
-                                                       continue;
-
-                                               if (cached && r->cache_policy == L2_BYPASS)
-                                                       continue;
-                                               if (!cached && r->cache_policy == L2_LRU)
-                                                       continue;
-                                       }
-
-                                       if (!best) {
-                                               best = r;
-                                               continue;
-                                       }
-
-                                       /* Assume some measurement error. Earlier methods occupy fewer
-                                        * resources, so the next method is always more greedy, and we
-                                        * don't want to select it due to a measurement error.
-                                        */
-                                       double min_improvement = 1.03;
-
-                                       if (best->score * min_improvement < r->score)
-                                               best = r;
-                               }
-
-                               if (num_methods > 0) {
-                                       unsigned prev_index = num_methods - 1;
-                                       struct si_result *prev = methods[prev_index];
-                                       struct si_result *prev_this_size = &results[util_logbase2(size)][placement][prev->index];
-
-                                       /* If the best one is also the best for the previous size,
-                                        * just bump the size for the previous one.
-                                        *
-                                        * If there is no best, it means all methods were too slow
-                                        * for this size and were not tested. Use the best one for
-                                        * the previous size.
-                                        */
-                                       if (!best ||
-                                           /* If it's the same method as for the previous size: */
-                                           (prev->is_cp == best->is_cp &&
-                                            prev->is_sdma == best->is_sdma &&
-                                            prev->is_cs == best->is_cs &&
-                                            prev->cache_policy == best->cache_policy &&
-                                            prev->dwords_per_thread == best->dwords_per_thread &&
-                                            prev->waves_per_sh == best->waves_per_sh) ||
-                                           /* If the method for the previous size is also the best
-                                            * for this size: */
-                                           (prev_this_size->is_valid &&
-                                            prev_this_size->score * 1.03 > best->score)) {
-                                               method_max_size[prev_index] = size;
-                                               continue;
-                                       }
-                               }
-
-                               /* Add it to the list. */
-                               assert(num_methods < ARRAY_SIZE(methods));
-                               methods[num_methods] = best;
-                               method_max_size[num_methods] = size;
-                               num_methods++;
-                       }
-
-                       for (unsigned i = 0; i < num_methods; i++) {
-                               struct si_result *best = methods[i];
-                               unsigned size = method_max_size[i];
-
-                               /* The size threshold is between the current benchmarked
-                                * size and the next benchmarked size. */
-                               if (i < num_methods - 1)
-                                       printf("         if (size <= %9u) ", (size + (size << SIZE_SHIFT)) / 2);
-                               else if (i > 0)
-                                       printf("         else                   ");
-                               else
-                                       printf("         ");
-                               printf("return ");
-
-                               assert(best);
-                               if (best->is_cp) {
-                                       printf("CP_DMA(%s);\n",
-                                              best->cache_policy == L2_BYPASS ? "L2_BYPASS" :
-                                              best->cache_policy == L2_LRU ?    "L2_LRU   " : "L2_STREAM");
-                               }
-                               if (best->is_sdma)
-                                       printf("SDMA;\n");
-                               if (best->is_cs) {
-                                       printf("COMPUTE(%s, %u, %u);\n",
-                                              best->cache_policy == L2_LRU ? "L2_LRU   " : "L2_STREAM",
-                                              best->dwords_per_thread,
-                                              best->waves_per_sh);
-                               }
-                       }
-               }
-               puts("      }");
-       }
-       puts("   }");
-       puts("}");
-
-       ctx->destroy(ctx);
-       exit(0);
+#define NUM_METHODS (4 + 3 * NUM_SHADERS * ARRAY_SIZE(cs_waves_per_sh_list))
+
+   static const char *method_str[] = {
+      "CP MC   ",
+      "CP L2   ",
+      "CP L2   ",
+      "SDMA    ",
+   };
+   static const char *placement_str[] = {
+      /* Clear */
+      "fill->VRAM",
+      "fill->GTT ",
+      /* Copy */
+      "VRAM->VRAM",
+      "VRAM->GTT ",
+      "GTT ->VRAM",
+   };
+
+   printf("DMA rate is in MB/s for each size. Slow cases are skipped and print 0.\n");
+   printf("Heap       ,Method  ,L2p,Wa,");
+   for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
+      if (size >= 1024)
+         printf("%6uKB,", size / 1024);
+      else
+         printf(" %6uB,", size);
+   }
+   printf("\n");
+
+   /* results[log2(size)][placement][method][] */
+   struct si_result {
+      bool is_valid;
+      bool is_cp;
+      bool is_sdma;
+      bool is_cs;
+      unsigned cache_policy;
+      unsigned dwords_per_thread;
+      unsigned waves_per_sh;
+      unsigned score;
+      unsigned index; /* index in results[x][y][index] */
+   } results[32][ARRAY_SIZE(placement_str)][NUM_METHODS] = {};
+
+   /* Run benchmarks. */
+   for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) {
+      bool is_copy = placement >= 2;
+
+      printf("-----------,--------,---,--,");
+      for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT)
+         printf("--------,");
+      printf("\n");
+
+      for (unsigned method = 0; method < NUM_METHODS; method++) {
+         bool test_cp = method <= 2;
+         bool test_sdma = method == 3;
+         bool test_cs = method >= 4;
+         unsigned cs_method = method - 4;
+         unsigned cs_waves_per_sh =
+            test_cs ? cs_waves_per_sh_list[cs_method / (3 * NUM_SHADERS)] : 0;
+         cs_method %= 3 * NUM_SHADERS;
+         unsigned cache_policy =
+            test_cp ? method % 3 : test_cs ? (cs_method / NUM_SHADERS) : 0;
+         unsigned cs_dwords_per_thread =
+            test_cs ? cs_dwords_per_thread_list[cs_method % NUM_SHADERS] : 0;
+
+         if (test_sdma && !sctx->sdma_cs)
+            continue;
+
+         if (sctx->chip_class == GFX6) {
+            /* GFX6 doesn't support CP DMA operations through L2. */
+            if (test_cp && cache_policy != L2_BYPASS)
+               continue;
+            /* WAVES_PER_SH is in multiples of 16 on GFX6. */
+            if (test_cs && cs_waves_per_sh % 16 != 0)
+               continue;
+         }
+
+         /* SI_RESOURCE_FLAG_UNCACHED setting RADEON_FLAG_UNCACHED doesn't affect
+          * chips before gfx9.
+          */
+         if (test_cs && cache_policy && sctx->chip_class < GFX9)
+            continue;
+
+         printf("%s ,", placement_str[placement]);
+         if (test_cs) {
+            printf("CS x%-4u,%3s,", cs_dwords_per_thread,
+                   cache_policy == L2_LRU ? "LRU" : cache_policy == L2_STREAM ? "Str" : "");
+         } else {
+            printf("%s,%3s,", method_str[method],
+                   method == L2_LRU ? "LRU" : method == L2_STREAM ? "Str" : "");
+         }
+         if (test_cs && cs_waves_per_sh)
+            printf("%2u,", cs_waves_per_sh);
+         else
+            printf("  ,");
+
+         double score = 0;
+         for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
+            /* Don't test bigger sizes if it's too slow. Print 0. */
+            if (size >= 512 * 1024 && score < 400 * (size / (4 * 1024 * 1024))) {
+               printf("%7.0f ,", 0.0);
+               continue;
+            }
+
+            enum pipe_resource_usage dst_usage, src_usage;
+            struct pipe_resource *dst, *src;
+            struct pipe_query *q[NUM_RUNS];
+            unsigned query_type = PIPE_QUERY_TIME_ELAPSED;
+            unsigned flags = cache_policy == L2_BYPASS ? SI_RESOURCE_FLAG_UNCACHED : 0;
+
+            if (test_sdma) {
+               if (sctx->chip_class == GFX6)
+                  query_type = SI_QUERY_TIME_ELAPSED_SDMA_SI;
+               else
+                  query_type = SI_QUERY_TIME_ELAPSED_SDMA;
+            }
+
+            if (placement == 0 || placement == 2 || placement == 4)
+               dst_usage = PIPE_USAGE_DEFAULT;
+            else
+               dst_usage = PIPE_USAGE_STREAM;
+
+            if (placement == 2 || placement == 3)
+               src_usage = PIPE_USAGE_DEFAULT;
+            else
+               src_usage = PIPE_USAGE_STREAM;
+
+            dst = pipe_aligned_buffer_create(screen, flags, dst_usage, size, 256);
+            src = is_copy ? pipe_aligned_buffer_create(screen, flags, src_usage, size, 256) : NULL;
+
+            /* Run tests. */
+            for (unsigned iter = 0; iter < NUM_RUNS; iter++) {
+               q[iter] = ctx->create_query(ctx, query_type, 0);
+               ctx->begin_query(ctx, q[iter]);
+
+               if (test_cp) {
+                  /* CP DMA */
+                  if (is_copy) {
+                     si_cp_dma_copy_buffer(sctx, dst, src, 0, 0, size, 0, SI_COHERENCY_NONE,
+                                           cache_policy);
+                  } else {
+                     si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, 0, size, clear_value, 0,
+                                            SI_COHERENCY_NONE, cache_policy);
+                  }
+               } else if (test_sdma) {
+                  /* SDMA */
+                  if (is_copy) {
+                     si_sdma_copy_buffer(sctx, dst, src, 0, 0, size);
+                  } else {
+                     si_sdma_clear_buffer(sctx, dst, 0, size, clear_value);
+                  }
+               } else {
+                  /* Compute */
+                  /* The memory accesses are coalesced, meaning that the 1st instruction writes
+                   * the 1st contiguous block of data for the whole wave, the 2nd instruction
+                   * writes the 2nd contiguous block of data, etc.
+                   */
+                  unsigned instructions_per_thread = MAX2(1, cs_dwords_per_thread / 4);
+                  unsigned dwords_per_instruction = cs_dwords_per_thread / instructions_per_thread;
+                  unsigned dwords_per_wave = cs_dwords_per_thread * 64;
+
+                  unsigned num_dwords = size / 4;
+                  unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);
+
+                  void *cs = si_create_dma_compute_shader(ctx, cs_dwords_per_thread,
+                                                          cache_policy == L2_STREAM, is_copy);
+
+                  struct pipe_grid_info info = {};
+                  info.block[0] = MIN2(64, num_instructions);
+                  info.block[1] = 1;
+                  info.block[2] = 1;
+                  info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
+                  info.grid[1] = 1;
+                  info.grid[2] = 1;
+
+                  struct pipe_shader_buffer sb[2] = {};
+                  sb[0].buffer = dst;
+                  sb[0].buffer_size = size;
+
+                  if (is_copy) {
+                     sb[1].buffer = src;
+                     sb[1].buffer_size = size;
+                  } else {
+                     for (unsigned i = 0; i < 4; i++)
+                        sctx->cs_user_data[i] = clear_value;
+                  }
+
+                  sctx->flags |= SI_CONTEXT_INV_VCACHE | SI_CONTEXT_INV_SCACHE;
+
+                  ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, is_copy ? 2 : 1, sb, 0x1);
+                  ctx->bind_compute_state(ctx, cs);
+                  sctx->cs_max_waves_per_sh = cs_waves_per_sh;
+
+                  ctx->launch_grid(ctx, &info);
+
+                  ctx->bind_compute_state(ctx, NULL);
+                  ctx->delete_compute_state(ctx, cs);
+                  sctx->cs_max_waves_per_sh = 0; /* disable the limit */
+
+                  sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
+               }
+
+               /* Flush L2, so that we don't just test L2 cache performance. */
+               if (!test_sdma) {
+                  sctx->flags |= SI_CONTEXT_WB_L2;
+                  sctx->emit_cache_flush(sctx);
+               }
+
+               ctx->end_query(ctx, q[iter]);
+               ctx->flush(ctx, NULL, PIPE_FLUSH_ASYNC);
+            }
+            pipe_resource_reference(&dst, NULL);
+            pipe_resource_reference(&src, NULL);
+
+            /* Get results. */
+            uint64_t min = ~0ull, max = 0, total = 0;
+
+            for (unsigned iter = 0; iter < NUM_RUNS; iter++) {
+               union pipe_query_result result;
+
+               ctx->get_query_result(ctx, q[iter], true, &result);
+               ctx->destroy_query(ctx, q[iter]);
+
+               min = MIN2(min, result.u64);
+               max = MAX2(max, result.u64);
+               total += result.u64;
+            }
+
+            score = get_MBps_rate(size, total / (double)NUM_RUNS);
+            printf("%7.0f ,", score);
+            fflush(stdout);
+
+            struct si_result *r = &results[util_logbase2(size)][placement][method];
+            r->is_valid = true;
+            r->is_cp = test_cp;
+            r->is_sdma = test_sdma;
+            r->is_cs = test_cs;
+            r->cache_policy = cache_policy;
+            r->dwords_per_thread = cs_dwords_per_thread;
+            r->waves_per_sh = cs_waves_per_sh;
+            r->score = score;
+            r->index = method;
+         }
+         puts("");
+      }
+   }
+
+   puts("");
+   puts("static struct si_method");
+   printf("get_best_clear_for_%s(enum radeon_bo_domain dst, uint64_t size64, bool async, bool "
+          "cached)\n",
+          sctx->screen->info.name);
+   puts("{");
+   puts("   unsigned size = MIN2(size64, UINT_MAX);\n");
+
+   /* Analyze results and find the best methods. */
+   for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) {
+      if (placement == 0)
+         puts("   if (dst == RADEON_DOMAIN_VRAM) {");
+      else if (placement == 1)
+         puts("   } else { /* GTT */");
+      else if (placement == 2) {
+         puts("}");
+         puts("");
+         puts("static struct si_method");
+         printf("get_best_copy_for_%s(enum radeon_bo_domain dst, enum radeon_bo_domain src,\n",
+                sctx->screen->info.name);
+         printf("                     uint64_t size64, bool async, bool cached)\n");
+         puts("{");
+         puts("   unsigned size = MIN2(size64, UINT_MAX);\n");
+         puts("   if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_VRAM) {");
+      } else if (placement == 3)
+         puts("   } else if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_GTT) {");
+      else
+         puts("   } else { /* GTT -> VRAM */");
+
+      for (unsigned mode = 0; mode < 3; mode++) {
+         bool async = mode == 0;
+         bool cached = mode == 1;
+
+         if (async)
+            puts("      if (async) { /* SDMA or async compute */");
+         else if (cached)
+            puts("      if (cached) { /* gfx ring */");
+         else
+            puts("      } else { /* gfx ring - uncached */");
+
+         /* The list of best chosen methods. */
+         struct si_result *methods[32];
+         unsigned method_max_size[32];
+         unsigned num_methods = 0;
+
+         for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
+            /* Find the best method. */
+            struct si_result *best = NULL;
+
+            for (unsigned i = 0; i < NUM_METHODS; i++) {
+               struct si_result *r = &results[util_logbase2(size)][placement][i];
+
+               if (!r->is_valid)
+                  continue;
+
+               /* Ban CP DMA clears via MC on <= GFX8. They are super slow
+                * on GTT, which we can get due to BO evictions.
+                */
+               if (sctx->chip_class <= GFX8 && placement == 1 && r->is_cp &&
+                   r->cache_policy == L2_BYPASS)
+                  continue;
+
+               if (async) {
+                  /* The following constraints for compute IBs try to limit
+                   * resource usage so as not to decrease the performance
+                   * of gfx IBs too much.
+                   */
+
+                  /* Don't use CP DMA on asynchronous rings, because
+                   * the engine is shared with gfx IBs.
+                   */
+                  if (r->is_cp)
+                     continue;
+
+                  /* Don't use L2 caching on asynchronous rings to minimize
+                   * L2 usage.
+                   */
+                  if (r->cache_policy == L2_LRU)
+                     continue;
+
+                  /* Asynchronous compute recommends waves_per_sh != 0
+                   * to limit CU usage. */
+                  if (r->is_cs && r->waves_per_sh == 0)
+                     continue;
+               } else {
+                  /* SDMA is always asynchronous */
+                  if (r->is_sdma)
+                     continue;
+
+                  if (cached && r->cache_policy == L2_BYPASS)
+                     continue;
+                  if (!cached && r->cache_policy == L2_LRU)
+                     continue;
+               }
+
+               if (!best) {
+                  best = r;
+                  continue;
+               }
+
+               /* Assume some measurement error. Earlier methods occupy fewer
+                * resources, so the next method is always more greedy, and we
+                * don't want to select it due to a measurement error.
+                */
+               double min_improvement = 1.03;
+
+               if (best->score * min_improvement < r->score)
+                  best = r;
+            }
+
+            if (num_methods > 0) {
+               unsigned prev_index = num_methods - 1;
+               struct si_result *prev = methods[prev_index];
+               struct si_result *prev_this_size =
+                  &results[util_logbase2(size)][placement][prev->index];
+
+               /* If the best one is also the best for the previous size,
+                * just bump the size for the previous one.
+                *
+                * If there is no best, it means all methods were too slow
+                * for this size and were not tested. Use the best one for
+                * the previous size.
+                */
+               if (!best ||
+                   /* If it's the same method as for the previous size: */
+                   (prev->is_cp == best->is_cp && prev->is_sdma == best->is_sdma &&
+                    prev->is_cs == best->is_cs && prev->cache_policy == best->cache_policy &&
+                    prev->dwords_per_thread == best->dwords_per_thread &&
+                    prev->waves_per_sh == best->waves_per_sh) ||
+                   /* If the method for the previous size is also the best
+                    * for this size: */
+                   (prev_this_size->is_valid && prev_this_size->score * 1.03 > best->score)) {
+                  method_max_size[prev_index] = size;
+                  continue;
+               }
+            }
+
+            /* Add it to the list. */
+            assert(num_methods < ARRAY_SIZE(methods));
+            methods[num_methods] = best;
+            method_max_size[num_methods] = size;
+            num_methods++;
+         }
+
+         for (unsigned i = 0; i < num_methods; i++) {
+            struct si_result *best = methods[i];
+            unsigned size = method_max_size[i];
+
+            /* The size threshold is between the current benchmarked
+             * size and the next benchmarked size. */
+            if (i < num_methods - 1)
+               printf("         if (size <= %9u) ", (size + (size << SIZE_SHIFT)) / 2);
+            else if (i > 0)
+               printf("         else                   ");
+            else
+               printf("         ");
+            printf("return ");
+
+            assert(best);
+            const char *cache_policy_str =
+               best->cache_policy == L2_BYPASS ? "L2_BYPASS" :
+               best->cache_policy == L2_LRU ? "L2_LRU   " : "L2_STREAM";
+
+            if (best->is_cp) {
+               printf("CP_DMA(%s);\n", cache_policy_str);
+            }
+            if (best->is_sdma)
+               printf("SDMA;\n");
+            if (best->is_cs) {
+               printf("COMPUTE(%s, %u, %u);\n", cache_policy_str,
+                      best->dwords_per_thread, best->waves_per_sh);
+            }
+         }
+      }
+      puts("      }");
+   }
+   puts("   }");
+   puts("}");
+
+   ctx->destroy(ctx);
+   exit(0);
 }