#include "si_pipe.h"
#include "si_query.h"
-#define MIN_SIZE 512
-#define MAX_SIZE (128 * 1024 * 1024)
-#define SIZE_SHIFT 1
-#define NUM_RUNS 128
+#define MIN_SIZE 512 /* smallest benchmarked buffer size, in bytes */
+#define MAX_SIZE (128 * 1024 * 1024) /* largest benchmarked buffer size, in bytes */
+#define SIZE_SHIFT 1 /* each step shifts the size left by this amount (size <<= SIZE_SHIFT) */
+#define NUM_RUNS 128 /* number of timed iterations per (placement, method, size) */
static double get_MBps_rate(unsigned num_bytes, unsigned ns)
{
- return (num_bytes / (1024.0 * 1024.0)) / (ns / 1000000000.0);
+ return (num_bytes / (1024.0 * 1024.0)) / (ns / 1000000000.0); /* (bytes -> MiB) / (ns -> seconds); NOTE(review): ns == 0 divides by 0.0 */
}
void si_test_dma_perf(struct si_screen *sscreen)
{
- struct pipe_screen *screen = &sscreen->b;
- struct pipe_context *ctx = screen->context_create(screen, NULL, 0);
- struct si_context *sctx = (struct si_context*)ctx;
- const uint32_t clear_value = 0x12345678;
- static const unsigned cs_dwords_per_thread_list[] = {64, 32, 16, 8, 4, 2, 1};
- static const unsigned cs_waves_per_sh_list[] = {1, 2, 4, 8, 16, 0};
+ struct pipe_screen *screen = &sscreen->b;
+ struct pipe_context *ctx = screen->context_create(screen, NULL, 0); /* private context for the benchmark; destroyed before exit(0) below */
+ struct si_context *sctx = (struct si_context *)ctx;
+ const uint32_t clear_value = 0x12345678; /* dword pattern used by all clear tests */
+ static const unsigned cs_dwords_per_thread_list[] = {64, 32, 16, 8, 4, 2, 1}; /* one compute shader variant per entry */
+ static const unsigned cs_waves_per_sh_list[] = {0, 2, 4, 8, 16}; /* 0 means no limit (see cs_max_waves_per_sh below) */
#define NUM_SHADERS ARRAY_SIZE(cs_dwords_per_thread_list)
-#define NUM_METHODS (4 + 2*NUM_SHADERS * ARRAY_SIZE(cs_waves_per_sh_list))
-
- static const char *method_str[] = {
- "CP MC ",
- "CP L2 ",
- "CP L2 ",
- "SDMA ",
- };
- static const char *placement_str[] = {
- /* Clear */
- "fill->VRAM",
- "fill->GTT ",
- /* Copy */
- "VRAM->VRAM",
- "VRAM->GTT ",
- "GTT ->VRAM",
- };
-
- printf("DMA rate is in MB/s for each size. Slow cases are skipped and print 0.\n");
- printf("Heap ,Method ,L2p,Wa,");
- for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
- if (size >= 1024)
- printf("%6uKB,", size / 1024);
- else
- printf(" %6uB,", size);
- }
- printf("\n");
-
- /* results[log2(size)][placement][method][] */
- struct si_result {
- bool is_valid;
- bool is_cp;
- bool is_sdma;
- bool is_cs;
- unsigned cache_policy;
- unsigned dwords_per_thread;
- unsigned waves_per_sh;
- unsigned score;
- unsigned index; /* index in results[x][y][index] */
- } results[32][ARRAY_SIZE(placement_str)][NUM_METHODS] = {};
-
- /* Run benchmarks. */
- for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) {
- bool is_copy = placement >= 2;
-
- printf("-----------,--------,---,--,");
- for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT)
- printf("--------,");
- printf("\n");
-
- for (unsigned method = 0; method < NUM_METHODS; method++) {
- bool test_cp = method <= 2;
- bool test_sdma = method == 3;
- bool test_cs = method >= 4;
- unsigned cs_method = method - 4;
- STATIC_ASSERT(L2_STREAM + 1 == L2_LRU);
- unsigned cs_waves_per_sh =
- test_cs ? cs_waves_per_sh_list[cs_method / (2*NUM_SHADERS)] : 0;
- cs_method %= 2*NUM_SHADERS;
- unsigned cache_policy = test_cp ? method % 3 :
- test_cs ? L2_STREAM + (cs_method / NUM_SHADERS) : 0;
- unsigned cs_dwords_per_thread =
- test_cs ? cs_dwords_per_thread_list[cs_method % NUM_SHADERS] : 0;
-
- if (sctx->chip_class == SI) {
- /* SI doesn't support CP DMA operations through L2. */
- if (test_cp && cache_policy != L2_BYPASS)
- continue;
- /* WAVES_PER_SH is in multiples of 16 on SI. */
- if (test_cs && cs_waves_per_sh % 16 != 0)
- continue;
- }
-
- printf("%s ,", placement_str[placement]);
- if (test_cs) {
- printf("CS x%-4u,%3s,", cs_dwords_per_thread,
- cache_policy == L2_LRU ? "LRU" :
- cache_policy == L2_STREAM ? "Str" : "");
- } else {
- printf("%s,%3s,", method_str[method],
- method == L2_LRU ? "LRU" :
- method == L2_STREAM ? "Str" : "");
- }
- if (test_cs && cs_waves_per_sh)
- printf("%2u,", cs_waves_per_sh);
- else
- printf(" ,");
-
- double score = 0;
- for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
- /* Don't test bigger sizes if it's too slow. Print 0. */
- if (size >= 512*1024 &&
- score < 400 * (size / (4*1024*1024))) {
- printf("%7.0f ,", 0.0);
- continue;
- }
-
- enum pipe_resource_usage dst_usage, src_usage;
- struct pipe_resource *dst, *src;
- struct pipe_query *q[NUM_RUNS];
- unsigned query_type = PIPE_QUERY_TIME_ELAPSED;
-
- if (test_sdma) {
- if (sctx->chip_class == SI)
- query_type = SI_QUERY_TIME_ELAPSED_SDMA_SI;
- else
- query_type = SI_QUERY_TIME_ELAPSED_SDMA;
- }
-
- if (placement == 0 || placement == 2 || placement == 4)
- dst_usage = PIPE_USAGE_DEFAULT;
- else
- dst_usage = PIPE_USAGE_STREAM;
-
- if (placement == 2 || placement == 3)
- src_usage = PIPE_USAGE_DEFAULT;
- else
- src_usage = PIPE_USAGE_STREAM;
-
- dst = pipe_buffer_create(screen, 0, dst_usage, size);
- src = is_copy ? pipe_buffer_create(screen, 0, src_usage, size) : NULL;
-
- /* Run tests. */
- for (unsigned iter = 0; iter < NUM_RUNS; iter++) {
- q[iter] = ctx->create_query(ctx, query_type, 0);
- ctx->begin_query(ctx, q[iter]);
-
- if (test_cp) {
- /* CP DMA */
- if (is_copy) {
- si_cp_dma_copy_buffer(sctx, dst, src, 0, 0, size, 0,
- SI_COHERENCY_NONE, cache_policy);
- } else {
- si_cp_dma_clear_buffer(sctx, dst, 0, size, clear_value,
- SI_COHERENCY_NONE, cache_policy);
- }
- } else if (test_sdma) {
- /* SDMA */
- if (is_copy) {
- struct pipe_box box;
- u_box_1d(0, size, &box);
- sctx->dma_copy(ctx, dst, 0, 0, 0, 0, src, 0, &box);
- } else {
- si_sdma_clear_buffer(sctx, dst, 0, size, clear_value);
- }
- } else {
- /* Compute */
- /* The memory accesses are coalesced, meaning that the 1st instruction writes
- * the 1st contiguous block of data for the whole wave, the 2nd instruction
- * writes the 2nd contiguous block of data, etc.
- */
- unsigned instructions_per_thread = MAX2(1, cs_dwords_per_thread / 4);
- unsigned dwords_per_instruction = cs_dwords_per_thread / instructions_per_thread;
- unsigned dwords_per_wave = cs_dwords_per_thread * 64;
-
- unsigned num_dwords = size / 4;
- unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);
-
- void *cs = si_create_dma_compute_shader(ctx, cs_dwords_per_thread,
- cache_policy == L2_STREAM, is_copy);
-
- struct pipe_grid_info info = {};
- info.block[0] = MIN2(64, num_instructions);
- info.block[1] = 1;
- info.block[2] = 1;
- info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
- info.grid[1] = 1;
- info.grid[2] = 1;
-
- struct pipe_shader_buffer sb[2] = {};
- sb[0].buffer = dst;
- sb[0].buffer_size = size;
-
- if (is_copy) {
- sb[1].buffer = src;
- sb[1].buffer_size = size;
- } else {
- for (unsigned i = 0; i < 4; i++)
- sctx->cs_user_data[i] = clear_value;
- }
-
- sctx->flags |= SI_CONTEXT_INV_VMEM_L1 |
- SI_CONTEXT_INV_SMEM_L1;
-
- ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, is_copy ? 2 : 1, sb);
- ctx->bind_compute_state(ctx, cs);
- sctx->cs_max_waves_per_sh = cs_waves_per_sh;
-
- ctx->launch_grid(ctx, &info);
-
- ctx->bind_compute_state(ctx, NULL);
- ctx->delete_compute_state(ctx, cs);
- sctx->cs_max_waves_per_sh = 0; /* disable the limit */
-
- sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
- }
-
- /* Flush L2, so that we don't just test L2 cache performance. */
- if (!test_sdma) {
- sctx->flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2;
- si_emit_cache_flush(sctx);
- }
-
- ctx->end_query(ctx, q[iter]);
- ctx->flush(ctx, NULL, PIPE_FLUSH_ASYNC);
- }
- pipe_resource_reference(&dst, NULL);
- pipe_resource_reference(&src, NULL);
-
- /* Get results. */
- uint64_t min = ~0ull, max = 0, total = 0;
-
- for (unsigned iter = 0; iter < NUM_RUNS; iter++) {
- union pipe_query_result result;
-
- ctx->get_query_result(ctx, q[iter], true, &result);
- ctx->destroy_query(ctx, q[iter]);
-
- min = MIN2(min, result.u64);
- max = MAX2(max, result.u64);
- total += result.u64;
- }
-
- score = get_MBps_rate(size, total / (double)NUM_RUNS);
- printf("%7.0f ,", score);
- fflush(stdout);
-
- struct si_result *r = &results[util_logbase2(size)][placement][method];
- r->is_valid = true;
- r->is_cp = test_cp;
- r->is_sdma = test_sdma;
- r->is_cs = test_cs;
- r->cache_policy = cache_policy;
- r->dwords_per_thread = cs_dwords_per_thread;
- r->waves_per_sh = cs_waves_per_sh;
- r->score = score;
- r->index = method;
- }
- puts("");
- }
- }
-
- puts("");
- puts("static struct si_method");
- printf("get_best_clear_for_%s(enum radeon_bo_domain dst, uint64_t size64, bool async, bool cached)\n",
- sctx->screen->info.name);
- puts("{");
- puts(" unsigned size = MIN2(size64, UINT_MAX);\n");
-
- /* Analyze results and find the best methods. */
- for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) {
- if (placement == 0)
- puts(" if (dst == RADEON_DOMAIN_VRAM) {");
- else if (placement == 1)
- puts(" } else { /* GTT */");
- else if (placement == 2) {
- puts("}");
- puts("");
- puts("static struct si_method");
- printf("get_best_copy_for_%s(enum radeon_bo_domain dst, enum radeon_bo_domain src,\n",
- sctx->screen->info.name);
- printf(" uint64_t size64, bool async, bool cached)\n");
- puts("{");
- puts(" unsigned size = MIN2(size64, UINT_MAX);\n");
- puts(" if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_VRAM) {");
- } else if (placement == 3)
- puts(" } else if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_GTT) {");
- else
- puts(" } else { /* GTT -> VRAM */");
-
- for (unsigned mode = 0; mode < 3; mode++) {
- bool async = mode == 0;
- bool cached = mode == 1;
-
- if (async)
- puts(" if (async) { /* SDMA or async compute */");
- else if (cached)
- puts(" if (cached) { /* gfx ring */");
- else
- puts(" } else { /* gfx ring - uncached */");
-
- /* The list of best chosen methods. */
- struct si_result *methods[32];
- unsigned method_max_size[32];
- unsigned num_methods = 0;
-
- for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
- /* Find the best method. */
- struct si_result *best = NULL;
-
- for (unsigned i = 0; i < NUM_METHODS; i++) {
- struct si_result *r = &results[util_logbase2(size)][placement][i];
-
- if (!r->is_valid)
- continue;
-
- /* Ban CP DMA clears via MC on <= VI. They are super slow
- * on GTT, which we can get due to BO evictions.
- */
- if (sctx->chip_class <= VI && placement == 1 &&
- r->is_cp && r->cache_policy == L2_BYPASS)
- continue;
-
- if (async) {
- /* The following constraints for compute IBs try to limit
- * resource usage so as not to decrease the performance
- * of gfx IBs too much.
- */
-
- /* Don't use CP DMA on asynchronous rings, because
- * the engine is shared with gfx IBs.
- */
- if (r->is_cp)
- continue;
-
- /* Don't use L2 caching on asynchronous rings to minimize
- * L2 usage.
- */
- if (r->cache_policy == L2_LRU)
- continue;
-
- /* Asynchronous compute recommends waves_per_sh != 0
- * to limit CU usage. */
- if (r->is_cs && r->waves_per_sh == 0)
- continue;
- } else {
- /* SDMA is always asynchronous */
- if (r->is_sdma)
- continue;
-
- if (cached && r->cache_policy == L2_BYPASS)
- continue;
- if (!cached && r->cache_policy == L2_LRU)
- continue;
- }
-
- if (!best) {
- best = r;
- continue;
- }
-
- /* Assume some measurement error. Earlier methods occupy fewer
- * resources, so the next method is always more greedy, and we
- * don't want to select it due to a measurement error.
- */
- double min_improvement = 1.03;
-
- if (best->score * min_improvement < r->score)
- best = r;
- }
-
- if (num_methods > 0) {
- unsigned prev_index = num_methods - 1;
- struct si_result *prev = methods[prev_index];
- struct si_result *prev_this_size = &results[util_logbase2(size)][placement][prev->index];
-
- /* If the best one is also the best for the previous size,
- * just bump the size for the previous one.
- *
- * If there is no best, it means all methods were too slow
- * for this size and were not tested. Use the best one for
- * the previous size.
- */
- if (!best ||
- /* If it's the same method as for the previous size: */
- (prev->is_cp == best->is_cp &&
- prev->is_sdma == best->is_sdma &&
- prev->is_cs == best->is_cs &&
- prev->cache_policy == best->cache_policy &&
- prev->dwords_per_thread == best->dwords_per_thread &&
- prev->waves_per_sh == best->waves_per_sh) ||
- /* If the method for the previous size is also the best
- * for this size: */
- (prev_this_size->is_valid &&
- prev_this_size->score * 1.03 > best->score)) {
- method_max_size[prev_index] = size;
- continue;
- }
- }
-
- /* Add it to the list. */
- assert(num_methods < ARRAY_SIZE(methods));
- methods[num_methods] = best;
- method_max_size[num_methods] = size;
- num_methods++;
- }
-
- for (unsigned i = 0; i < num_methods; i++) {
- struct si_result *best = methods[i];
- unsigned size = method_max_size[i];
-
- /* The size threshold is between the current benchmarked
- * size and the next benchmarked size. */
- if (i < num_methods - 1)
- printf(" if (size <= %9u) ", (size + (size << SIZE_SHIFT)) / 2);
- else if (i > 0)
- printf(" else ");
- else
- printf(" ");
- printf("return ");
-
- assert(best);
- if (best->is_cp) {
- printf("CP_DMA(%s);\n",
- best->cache_policy == L2_BYPASS ? "L2_BYPASS" :
- best->cache_policy == L2_LRU ? "L2_LRU " : "L2_STREAM");
- }
- if (best->is_sdma)
- printf("SDMA;\n");
- if (best->is_cs) {
- printf("COMPUTE(%s, %u, %u);\n",
- best->cache_policy == L2_LRU ? "L2_LRU " : "L2_STREAM",
- best->dwords_per_thread,
- best->waves_per_sh);
- }
- }
- }
- puts(" }");
- }
- puts(" }");
- puts("}");
-
- ctx->destroy(ctx);
- exit(0);
+#define NUM_METHODS (4 + 3 * NUM_SHADERS * ARRAY_SIZE(cs_waves_per_sh_list)) /* 3 CP DMA + 1 SDMA + compute: 3 cache policies x NUM_SHADERS x wave limits */
+
+ static const char *method_str[] = {
+ "CP MC ",
+ "CP L2 ",
+ "CP L2 ",
+ "SDMA ",
+ };
+ static const char *placement_str[] = {
+ /* Clear */
+ "fill->VRAM",
+ "fill->GTT ",
+ /* Copy */
+ "VRAM->VRAM",
+ "VRAM->GTT ",
+ "GTT ->VRAM",
+ };
+
+ printf("DMA rate is in MB/s for each size. Slow cases are skipped and print 0.\n");
+ printf("Heap ,Method ,L2p,Wa,");
+ for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
+ if (size >= 1024)
+ printf("%6uKB,", size / 1024);
+ else
+ printf(" %6uB,", size);
+ }
+ printf("\n");
+
+ /* results[log2(size)][placement][method]; entries that were never measured keep is_valid == false */
+ struct si_result {
+ bool is_valid;
+ bool is_cp;
+ bool is_sdma;
+ bool is_cs;
+ unsigned cache_policy;
+ unsigned dwords_per_thread;
+ unsigned waves_per_sh;
+ unsigned score;
+ unsigned index; /* index in results[x][y][index] */
+ } results[32][ARRAY_SIZE(placement_str)][NUM_METHODS] = {};
+
+ /* Run benchmarks. */
+ for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) {
+ bool is_copy = placement >= 2; /* placements 0-1 are clears, 2-4 are copies (see placement_str) */
+
+ printf("-----------,--------,---,--,");
+ for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT)
+ printf("--------,");
+ printf("\n");
+
+ for (unsigned method = 0; method < NUM_METHODS; method++) {
+ bool test_cp = method <= 2; /* methods 0..2: CP DMA, one per cache policy */
+ bool test_sdma = method == 3; /* method 3: SDMA */
+ bool test_cs = method >= 4; /* methods 4 and up: compute shaders */
+ unsigned cs_method = method - 4;
+ unsigned cs_waves_per_sh =
+ test_cs ? cs_waves_per_sh_list[cs_method / (3 * NUM_SHADERS)] : 0;
+ cs_method %= 3 * NUM_SHADERS; /* index within one waves_per_sh group */
+ unsigned cache_policy =
+ test_cp ? method % 3 : test_cs ? (cs_method / NUM_SHADERS) : 0;
+ unsigned cs_dwords_per_thread =
+ test_cs ? cs_dwords_per_thread_list[cs_method % NUM_SHADERS] : 0;
+
+ if (test_sdma && !sctx->sdma_cs) /* SDMA is not tested when the context has no sdma_cs */
+ continue;
+
+ if (sctx->chip_class == GFX6) {
+ /* GFX6 doesn't support CP DMA operations through L2. */
+ if (test_cp && cache_policy != L2_BYPASS)
+ continue;
+ /* WAVES_PER_SH is in multiples of 16 on GFX6. */
+ if (test_cs && cs_waves_per_sh % 16 != 0)
+ continue;
+ }
+
+ /* SI_RESOURCE_FLAG_UNCACHED setting RADEON_FLAG_UNCACHED doesn't affect
+ * chips before gfx9.
+ */
+ if (test_cs && cache_policy && sctx->chip_class < GFX9)
+ continue;
+
+ printf("%s ,", placement_str[placement]);
+ if (test_cs) {
+ printf("CS x%-4u,%3s,", cs_dwords_per_thread,
+ cache_policy == L2_LRU ? "LRU" : cache_policy == L2_STREAM ? "Str" : "");
+ } else {
+ printf("%s,%3s,", method_str[method],
+ method == L2_LRU ? "LRU" : method == L2_STREAM ? "Str" : ""); /* for CP DMA rows, method equals cache_policy (method % 3) */
+ }
+ if (test_cs && cs_waves_per_sh)
+ printf("%2u,", cs_waves_per_sh);
+ else
+ printf(" ,");
+
+ double score = 0;
+ for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
+ /* Don't test bigger sizes if it's too slow. Print 0.
+ * "score" still holds the rate of the last size that was actually measured
+ * for this method, because skipped sizes do not update it. */
+ if (size >= 512 * 1024 && score < 400 * (size / (4 * 1024 * 1024))) {
+ printf("%7.0f ,", 0.0);
+ continue;
+ }
+
+ enum pipe_resource_usage dst_usage, src_usage;
+ struct pipe_resource *dst, *src;
+ struct pipe_query *q[NUM_RUNS];
+ unsigned query_type = PIPE_QUERY_TIME_ELAPSED;
+ unsigned flags = cache_policy == L2_BYPASS ? SI_RESOURCE_FLAG_UNCACHED : 0; /* L2_BYPASS runs use uncached buffers */
+
+ if (test_sdma) {
+ if (sctx->chip_class == GFX6)
+ query_type = SI_QUERY_TIME_ELAPSED_SDMA_SI;
+ else
+ query_type = SI_QUERY_TIME_ELAPSED_SDMA;
+ }
+
+ if (placement == 0 || placement == 2 || placement == 4) /* placements labelled with a VRAM destination */
+ dst_usage = PIPE_USAGE_DEFAULT;
+ else
+ dst_usage = PIPE_USAGE_STREAM;
+
+ if (placement == 2 || placement == 3) /* placements labelled with a VRAM source */
+ src_usage = PIPE_USAGE_DEFAULT;
+ else
+ src_usage = PIPE_USAGE_STREAM;
+
+ dst = pipe_aligned_buffer_create(screen, flags, dst_usage, size, 256);
+ src = is_copy ? pipe_aligned_buffer_create(screen, flags, src_usage, size, 256) : NULL;
+
+ /* Run tests. */
+ for (unsigned iter = 0; iter < NUM_RUNS; iter++) {
+ q[iter] = ctx->create_query(ctx, query_type, 0);
+ ctx->begin_query(ctx, q[iter]);
+
+ if (test_cp) {
+ /* CP DMA */
+ if (is_copy) {
+ si_cp_dma_copy_buffer(sctx, dst, src, 0, 0, size, 0, SI_COHERENCY_NONE,
+ cache_policy);
+ } else {
+ si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, 0, size, clear_value, 0,
+ SI_COHERENCY_NONE, cache_policy);
+ }
+ } else if (test_sdma) {
+ /* SDMA */
+ if (is_copy) {
+ si_sdma_copy_buffer(sctx, dst, src, 0, 0, size);
+ } else {
+ si_sdma_clear_buffer(sctx, dst, 0, size, clear_value);
+ }
+ } else {
+ /* Compute */
+ /* The memory accesses are coalesced, meaning that the 1st instruction writes
+ * the 1st contiguous block of data for the whole wave, the 2nd instruction
+ * writes the 2nd contiguous block of data, etc.
+ */
+ unsigned instructions_per_thread = MAX2(1, cs_dwords_per_thread / 4);
+ unsigned dwords_per_instruction = cs_dwords_per_thread / instructions_per_thread;
+ unsigned dwords_per_wave = cs_dwords_per_thread * 64;
+
+ unsigned num_dwords = size / 4;
+ unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);
+
+ void *cs = si_create_dma_compute_shader(ctx, cs_dwords_per_thread,
+ cache_policy == L2_STREAM, is_copy);
+
+ struct pipe_grid_info info = {};
+ info.block[0] = MIN2(64, num_instructions);
+ info.block[1] = 1;
+ info.block[2] = 1;
+ info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
+ info.grid[1] = 1;
+ info.grid[2] = 1;
+
+ struct pipe_shader_buffer sb[2] = {};
+ sb[0].buffer = dst;
+ sb[0].buffer_size = size;
+
+ if (is_copy) {
+ sb[1].buffer = src;
+ sb[1].buffer_size = size;
+ } else {
+ for (unsigned i = 0; i < 4; i++)
+ sctx->cs_user_data[i] = clear_value;
+ }
+
+ sctx->flags |= SI_CONTEXT_INV_VCACHE | SI_CONTEXT_INV_SCACHE;
+
+ ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, is_copy ? 2 : 1, sb, 0x1);
+ ctx->bind_compute_state(ctx, cs);
+ sctx->cs_max_waves_per_sh = cs_waves_per_sh;
+
+ ctx->launch_grid(ctx, &info);
+
+ ctx->bind_compute_state(ctx, NULL);
+ ctx->delete_compute_state(ctx, cs);
+ sctx->cs_max_waves_per_sh = 0; /* disable the limit */
+
+ sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
+ }
+
+ /* Flush L2, so that we don't just test L2 cache performance. */
+ if (!test_sdma) {
+ sctx->flags |= SI_CONTEXT_WB_L2;
+ sctx->emit_cache_flush(sctx);
+ }
+
+ ctx->end_query(ctx, q[iter]);
+ ctx->flush(ctx, NULL, PIPE_FLUSH_ASYNC);
+ }
+ pipe_resource_reference(&dst, NULL);
+ pipe_resource_reference(&src, NULL);
+
+ /* Get results. */
+ uint64_t min = ~0ull, max = 0, total = 0; /* NOTE(review): min and max are updated below but never read */
+
+ for (unsigned iter = 0; iter < NUM_RUNS; iter++) {
+ union pipe_query_result result;
+
+ ctx->get_query_result(ctx, q[iter], true, &result);
+ ctx->destroy_query(ctx, q[iter]);
+
+ min = MIN2(min, result.u64);
+ max = MAX2(max, result.u64);
+ total += result.u64;
+ }
+
+ score = get_MBps_rate(size, total / (double)NUM_RUNS); /* average time per run; the double is converted to the unsigned ns parameter */
+ printf("%7.0f ,", score);
+ fflush(stdout);
+
+ struct si_result *r = &results[util_logbase2(size)][placement][method];
+ r->is_valid = true;
+ r->is_cp = test_cp;
+ r->is_sdma = test_sdma;
+ r->is_cs = test_cs;
+ r->cache_policy = cache_policy;
+ r->dwords_per_thread = cs_dwords_per_thread;
+ r->waves_per_sh = cs_waves_per_sh;
+ r->score = score; /* stored in an unsigned field: the double is converted on assignment */
+ r->index = method;
+ }
+ puts("");
+ }
+ }
+
+ puts("");
+ puts("static struct si_method");
+ printf("get_best_clear_for_%s(enum radeon_bo_domain dst, uint64_t size64, bool async, bool "
+ "cached)\n",
+ sctx->screen->info.name);
+ puts("{");
+ puts(" unsigned size = MIN2(size64, UINT_MAX);\n");
+
+ /* Analyze results and find the best methods. */
+ for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) {
+ if (placement == 0)
+ puts(" if (dst == RADEON_DOMAIN_VRAM) {");
+ else if (placement == 1)
+ puts(" } else { /* GTT */");
+ else if (placement == 2) {
+ puts("}");
+ puts("");
+ puts("static struct si_method");
+ printf("get_best_copy_for_%s(enum radeon_bo_domain dst, enum radeon_bo_domain src,\n",
+ sctx->screen->info.name);
+ printf(" uint64_t size64, bool async, bool cached)\n");
+ puts("{");
+ puts(" unsigned size = MIN2(size64, UINT_MAX);\n");
+ puts(" if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_VRAM) {");
+ } else if (placement == 3)
+ puts(" } else if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_GTT) {");
+ else
+ puts(" } else { /* GTT -> VRAM */");
+
+ for (unsigned mode = 0; mode < 3; mode++) { /* 0 = async, 1 = gfx ring cached, 2 = gfx ring uncached */
+ bool async = mode == 0;
+ bool cached = mode == 1;
+
+ if (async)
+ puts(" if (async) { /* SDMA or async compute */");
+ else if (cached)
+ puts(" if (cached) { /* gfx ring */");
+ else
+ puts(" } else { /* gfx ring - uncached */");
+
+ /* The list of best chosen methods. */
+ struct si_result *methods[32];
+ unsigned method_max_size[32];
+ unsigned num_methods = 0;
+
+ for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
+ /* Find the best method. */
+ struct si_result *best = NULL;
+
+ for (unsigned i = 0; i < NUM_METHODS; i++) {
+ struct si_result *r = &results[util_logbase2(size)][placement][i];
+
+ if (!r->is_valid)
+ continue;
+
+ /* Ban CP DMA clears via MC on <= GFX8. They are super slow
+ * on GTT, which we can get due to BO evictions.
+ */
+ if (sctx->chip_class <= GFX8 && placement == 1 && r->is_cp &&
+ r->cache_policy == L2_BYPASS)
+ continue;
+
+ if (async) {
+ /* The following constraints for compute IBs try to limit
+ * resource usage so as not to decrease the performance
+ * of gfx IBs too much.
+ */
+
+ /* Don't use CP DMA on asynchronous rings, because
+ * the engine is shared with gfx IBs.
+ */
+ if (r->is_cp)
+ continue;
+
+ /* Don't use L2 caching on asynchronous rings to minimize
+ * L2 usage.
+ */
+ if (r->cache_policy == L2_LRU)
+ continue;
+
+ /* Asynchronous compute recommends waves_per_sh != 0
+ * to limit CU usage. */
+ if (r->is_cs && r->waves_per_sh == 0)
+ continue;
+ } else {
+ /* SDMA is always asynchronous */
+ if (r->is_sdma)
+ continue;
+
+ if (cached && r->cache_policy == L2_BYPASS)
+ continue;
+ if (!cached && r->cache_policy == L2_LRU)
+ continue;
+ }
+
+ if (!best) {
+ best = r;
+ continue;
+ }
+
+ /* Assume some measurement error. Earlier methods occupy fewer
+ * resources, so the next method is always more greedy, and we
+ * don't want to select it due to a measurement error.
+ */
+ double min_improvement = 1.03;
+
+ if (best->score * min_improvement < r->score)
+ best = r;
+ }
+
+ if (num_methods > 0) {
+ unsigned prev_index = num_methods - 1;
+ struct si_result *prev = methods[prev_index];
+ struct si_result *prev_this_size =
+ &results[util_logbase2(size)][placement][prev->index];
+
+ /* If the best one is also the best for the previous size,
+ * just bump the size for the previous one.
+ *
+ * If there is no best, it means all methods were too slow
+ * for this size and were not tested. Use the best one for
+ * the previous size.
+ */
+ if (!best ||
+ /* If it's the same method as for the previous size: */
+ (prev->is_cp == best->is_cp && prev->is_sdma == best->is_sdma &&
+ prev->is_cs == best->is_cs && prev->cache_policy == best->cache_policy &&
+ prev->dwords_per_thread == best->dwords_per_thread &&
+ prev->waves_per_sh == best->waves_per_sh) ||
+ /* If the method for the previous size is also the best
+ * for this size: */
+ (prev_this_size->is_valid && prev_this_size->score * 1.03 > best->score)) {
+ method_max_size[prev_index] = size;
+ continue;
+ }
+ }
+
+ /* Add it to the list. */
+ assert(num_methods < ARRAY_SIZE(methods));
+ methods[num_methods] = best; /* NOTE(review): best may still be NULL here when num_methods == 0 and no result passed the filters; assert(best) below would then fire */
+ method_max_size[num_methods] = size;
+ num_methods++;
+ }
+
+ for (unsigned i = 0; i < num_methods; i++) {
+ struct si_result *best = methods[i];
+ unsigned size = method_max_size[i];
+
+ /* The size threshold is between the current benchmarked
+ * size and the next benchmarked size. */
+ if (i < num_methods - 1)
+ printf(" if (size <= %9u) ", (size + (size << SIZE_SHIFT)) / 2);
+ else if (i > 0)
+ printf(" else ");
+ else
+ printf(" ");
+ printf("return ");
+
+ assert(best);
+ const char *cache_policy_str =
+ best->cache_policy == L2_BYPASS ? "L2_BYPASS" :
+ best->cache_policy == L2_LRU ? "L2_LRU " : "L2_STREAM";
+
+ if (best->is_cp) {
+ printf("CP_DMA(%s);\n", cache_policy_str);
+ }
+ if (best->is_sdma)
+ printf("SDMA;\n");
+ if (best->is_cs) {
+ printf("COMPUTE(%s, %u, %u);\n", cache_policy_str,
+ best->dwords_per_thread, best->waves_per_sh);
+ }
+ }
+ }
+ puts(" }");
+ }
+ puts(" }");
+ puts("}");
+
+ ctx->destroy(ctx);
+ exit(0); /* the test terminates the process; it never returns to the caller */
}