2 * Copyright 2018 Advanced Micro Devices, Inc.
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26 /* This file implements tests on the si_clearbuffer function. */
32 #define MAX_SIZE (128 * 1024 * 1024)
/* Convert a transfer of num_bytes completed in ns nanoseconds into a
 * throughput figure in MB/s (mebibytes per second).
 */
static double get_MBps_rate(unsigned num_bytes, unsigned ns)
{
   const double megabytes = num_bytes / (1024.0 * 1024.0);
   const double seconds = ns / 1000000000.0;
   return megabytes / seconds;
}
41 void si_test_dma_perf(struct si_screen
*sscreen
)
43 struct pipe_screen
*screen
= &sscreen
->b
;
44 struct pipe_context
*ctx
= screen
->context_create(screen
, NULL
, 0);
45 struct si_context
*sctx
= (struct si_context
*)ctx
;
46 const uint32_t clear_value
= 0x12345678;
47 static const unsigned cs_dwords_per_thread_list
[] = {64, 32, 16, 8, 4, 2, 1};
48 static const unsigned cs_waves_per_sh_list
[] = {0, 2, 4, 8, 16};
50 #define NUM_SHADERS ARRAY_SIZE(cs_dwords_per_thread_list)
51 #define NUM_METHODS (4 + 3 * NUM_SHADERS * ARRAY_SIZE(cs_waves_per_sh_list))
53 static const char *method_str
[] = {
59 static const char *placement_str
[] = {
69 printf("DMA rate is in MB/s for each size. Slow cases are skipped and print 0.\n");
70 printf("Heap ,Method ,L2p,Wa,");
71 for (unsigned size
= MIN_SIZE
; size
<= MAX_SIZE
; size
<<= SIZE_SHIFT
) {
73 printf("%6uKB,", size
/ 1024);
75 printf(" %6uB,", size
);
79 /* results[log2(size)][placement][method][] */
85 unsigned cache_policy
;
86 unsigned dwords_per_thread
;
87 unsigned waves_per_sh
;
89 unsigned index
; /* index in results[x][y][index] */
90 } results
[32][ARRAY_SIZE(placement_str
)][NUM_METHODS
] = {};
93 for (unsigned placement
= 0; placement
< ARRAY_SIZE(placement_str
); placement
++) {
94 bool is_copy
= placement
>= 2;
96 printf("-----------,--------,---,--,");
97 for (unsigned size
= MIN_SIZE
; size
<= MAX_SIZE
; size
<<= SIZE_SHIFT
)
101 for (unsigned method
= 0; method
< NUM_METHODS
; method
++) {
102 bool test_cp
= method
<= 2;
103 bool test_sdma
= method
== 3;
104 bool test_cs
= method
>= 4;
105 unsigned cs_method
= method
- 4;
106 unsigned cs_waves_per_sh
=
107 test_cs
? cs_waves_per_sh_list
[cs_method
/ (3 * NUM_SHADERS
)] : 0;
108 cs_method
%= 3 * NUM_SHADERS
;
109 unsigned cache_policy
=
110 test_cp
? method
% 3 : test_cs
? (cs_method
/ NUM_SHADERS
) : 0;
111 unsigned cs_dwords_per_thread
=
112 test_cs
? cs_dwords_per_thread_list
[cs_method
% NUM_SHADERS
] : 0;
114 if (test_sdma
&& !sctx
->sdma_cs
)
117 if (sctx
->chip_class
== GFX6
) {
118 /* GFX6 doesn't support CP DMA operations through L2. */
119 if (test_cp
&& cache_policy
!= L2_BYPASS
)
121 /* WAVES_PER_SH is in multiples of 16 on GFX6. */
122 if (test_cs
&& cs_waves_per_sh
% 16 != 0)
126 /* SI_RESOURCE_FLAG_UNCACHED setting RADEON_FLAG_UNCACHED doesn't affect
129 if (test_cs
&& cache_policy
&& sctx
->chip_class
< GFX9
)
132 printf("%s ,", placement_str
[placement
]);
134 printf("CS x%-4u,%3s,", cs_dwords_per_thread
,
135 cache_policy
== L2_LRU
? "LRU" : cache_policy
== L2_STREAM
? "Str" : "");
137 printf("%s,%3s,", method_str
[method
],
138 method
== L2_LRU
? "LRU" : method
== L2_STREAM
? "Str" : "");
140 if (test_cs
&& cs_waves_per_sh
)
141 printf("%2u,", cs_waves_per_sh
);
146 for (unsigned size
= MIN_SIZE
; size
<= MAX_SIZE
; size
<<= SIZE_SHIFT
) {
147 /* Don't test bigger sizes if it's too slow. Print 0. */
148 if (size
>= 512 * 1024 && score
< 400 * (size
/ (4 * 1024 * 1024))) {
149 printf("%7.0f ,", 0.0);
153 enum pipe_resource_usage dst_usage
, src_usage
;
154 struct pipe_resource
*dst
, *src
;
155 struct pipe_query
*q
[NUM_RUNS
];
156 unsigned query_type
= PIPE_QUERY_TIME_ELAPSED
;
157 unsigned flags
= cache_policy
== L2_BYPASS
? SI_RESOURCE_FLAG_UNCACHED
: 0;
160 if (sctx
->chip_class
== GFX6
)
161 query_type
= SI_QUERY_TIME_ELAPSED_SDMA_SI
;
163 query_type
= SI_QUERY_TIME_ELAPSED_SDMA
;
166 if (placement
== 0 || placement
== 2 || placement
== 4)
167 dst_usage
= PIPE_USAGE_DEFAULT
;
169 dst_usage
= PIPE_USAGE_STREAM
;
171 if (placement
== 2 || placement
== 3)
172 src_usage
= PIPE_USAGE_DEFAULT
;
174 src_usage
= PIPE_USAGE_STREAM
;
176 dst
= pipe_aligned_buffer_create(screen
, flags
, dst_usage
, size
, 256);
177 src
= is_copy
? pipe_aligned_buffer_create(screen
, flags
, src_usage
, size
, 256) : NULL
;
180 for (unsigned iter
= 0; iter
< NUM_RUNS
; iter
++) {
181 q
[iter
] = ctx
->create_query(ctx
, query_type
, 0);
182 ctx
->begin_query(ctx
, q
[iter
]);
187 si_cp_dma_copy_buffer(sctx
, dst
, src
, 0, 0, size
, 0, SI_COHERENCY_NONE
,
190 si_cp_dma_clear_buffer(sctx
, sctx
->gfx_cs
, dst
, 0, size
, clear_value
, 0,
191 SI_COHERENCY_NONE
, cache_policy
);
193 } else if (test_sdma
) {
196 si_sdma_copy_buffer(sctx
, dst
, src
, 0, 0, size
);
198 si_sdma_clear_buffer(sctx
, dst
, 0, size
, clear_value
);
202 /* The memory accesses are coalesced, meaning that the 1st instruction writes
203 * the 1st contiguous block of data for the whole wave, the 2nd instruction
204 * writes the 2nd contiguous block of data, etc.
206 unsigned instructions_per_thread
= MAX2(1, cs_dwords_per_thread
/ 4);
207 unsigned dwords_per_instruction
= cs_dwords_per_thread
/ instructions_per_thread
;
208 unsigned dwords_per_wave
= cs_dwords_per_thread
* 64;
210 unsigned num_dwords
= size
/ 4;
211 unsigned num_instructions
= DIV_ROUND_UP(num_dwords
, dwords_per_instruction
);
213 void *cs
= si_create_dma_compute_shader(ctx
, cs_dwords_per_thread
,
214 cache_policy
== L2_STREAM
, is_copy
);
216 struct pipe_grid_info info
= {};
217 info
.block
[0] = MIN2(64, num_instructions
);
220 info
.grid
[0] = DIV_ROUND_UP(num_dwords
, dwords_per_wave
);
224 struct pipe_shader_buffer sb
[2] = {};
226 sb
[0].buffer_size
= size
;
230 sb
[1].buffer_size
= size
;
232 for (unsigned i
= 0; i
< 4; i
++)
233 sctx
->cs_user_data
[i
] = clear_value
;
236 sctx
->flags
|= SI_CONTEXT_INV_VCACHE
| SI_CONTEXT_INV_SCACHE
;
238 ctx
->set_shader_buffers(ctx
, PIPE_SHADER_COMPUTE
, 0, is_copy
? 2 : 1, sb
, 0x1);
239 ctx
->bind_compute_state(ctx
, cs
);
240 sctx
->cs_max_waves_per_sh
= cs_waves_per_sh
;
242 ctx
->launch_grid(ctx
, &info
);
244 ctx
->bind_compute_state(ctx
, NULL
);
245 ctx
->delete_compute_state(ctx
, cs
);
246 sctx
->cs_max_waves_per_sh
= 0; /* disable the limit */
248 sctx
->flags
|= SI_CONTEXT_CS_PARTIAL_FLUSH
;
251 /* Flush L2, so that we don't just test L2 cache performance. */
253 sctx
->flags
|= SI_CONTEXT_WB_L2
;
254 sctx
->emit_cache_flush(sctx
);
257 ctx
->end_query(ctx
, q
[iter
]);
258 ctx
->flush(ctx
, NULL
, PIPE_FLUSH_ASYNC
);
260 pipe_resource_reference(&dst
, NULL
);
261 pipe_resource_reference(&src
, NULL
);
264 uint64_t min
= ~0ull, max
= 0, total
= 0;
266 for (unsigned iter
= 0; iter
< NUM_RUNS
; iter
++) {
267 union pipe_query_result result
;
269 ctx
->get_query_result(ctx
, q
[iter
], true, &result
);
270 ctx
->destroy_query(ctx
, q
[iter
]);
272 min
= MIN2(min
, result
.u64
);
273 max
= MAX2(max
, result
.u64
);
277 score
= get_MBps_rate(size
, total
/ (double)NUM_RUNS
);
278 printf("%7.0f ,", score
);
281 struct si_result
*r
= &results
[util_logbase2(size
)][placement
][method
];
284 r
->is_sdma
= test_sdma
;
286 r
->cache_policy
= cache_policy
;
287 r
->dwords_per_thread
= cs_dwords_per_thread
;
288 r
->waves_per_sh
= cs_waves_per_sh
;
297 puts("static struct si_method");
298 printf("get_best_clear_for_%s(enum radeon_bo_domain dst, uint64_t size64, bool async, bool "
300 sctx
->screen
->info
.name
);
302 puts(" unsigned size = MIN2(size64, UINT_MAX);\n");
304 /* Analyze results and find the best methods. */
305 for (unsigned placement
= 0; placement
< ARRAY_SIZE(placement_str
); placement
++) {
307 puts(" if (dst == RADEON_DOMAIN_VRAM) {");
308 else if (placement
== 1)
309 puts(" } else { /* GTT */");
310 else if (placement
== 2) {
313 puts("static struct si_method");
314 printf("get_best_copy_for_%s(enum radeon_bo_domain dst, enum radeon_bo_domain src,\n",
315 sctx
->screen
->info
.name
);
316 printf(" uint64_t size64, bool async, bool cached)\n");
318 puts(" unsigned size = MIN2(size64, UINT_MAX);\n");
319 puts(" if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_VRAM) {");
320 } else if (placement
== 3)
321 puts(" } else if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_GTT) {");
323 puts(" } else { /* GTT -> VRAM */");
325 for (unsigned mode
= 0; mode
< 3; mode
++) {
326 bool async
= mode
== 0;
327 bool cached
= mode
== 1;
330 puts(" if (async) { /* SDMA or async compute */");
332 puts(" if (cached) { /* gfx ring */");
334 puts(" } else { /* gfx ring - uncached */");
336 /* The list of best chosen methods. */
337 struct si_result
*methods
[32];
338 unsigned method_max_size
[32];
339 unsigned num_methods
= 0;
341 for (unsigned size
= MIN_SIZE
; size
<= MAX_SIZE
; size
<<= SIZE_SHIFT
) {
342 /* Find the best method. */
343 struct si_result
*best
= NULL
;
345 for (unsigned i
= 0; i
< NUM_METHODS
; i
++) {
346 struct si_result
*r
= &results
[util_logbase2(size
)][placement
][i
];
351 /* Ban CP DMA clears via MC on <= GFX8. They are super slow
352 * on GTT, which we can get due to BO evictions.
354 if (sctx
->chip_class
<= GFX8
&& placement
== 1 && r
->is_cp
&&
355 r
->cache_policy
== L2_BYPASS
)
359 /* The following constraints for compute IBs try to limit
360 * resource usage so as not to decrease the performance
361 * of gfx IBs too much.
364 /* Don't use CP DMA on asynchronous rings, because
365 * the engine is shared with gfx IBs.
370 /* Don't use L2 caching on asynchronous rings to minimize
373 if (r
->cache_policy
== L2_LRU
)
376 /* Asynchronous compute recommends waves_per_sh != 0
377 * to limit CU usage. */
378 if (r
->is_cs
&& r
->waves_per_sh
== 0)
381 /* SDMA is always asynchronous */
385 if (cached
&& r
->cache_policy
== L2_BYPASS
)
387 if (!cached
&& r
->cache_policy
== L2_LRU
)
396 /* Assume some measurement error. Earlier methods occupy fewer
397 * resources, so the next method is always more greedy, and we
398 * don't want to select it due to a measurement error.
400 double min_improvement
= 1.03;
402 if (best
->score
* min_improvement
< r
->score
)
406 if (num_methods
> 0) {
407 unsigned prev_index
= num_methods
- 1;
408 struct si_result
*prev
= methods
[prev_index
];
409 struct si_result
*prev_this_size
=
410 &results
[util_logbase2(size
)][placement
][prev
->index
];
412 /* If the best one is also the best for the previous size,
413 * just bump the size for the previous one.
415 * If there is no best, it means all methods were too slow
416 * for this size and were not tested. Use the best one for
420 /* If it's the same method as for the previous size: */
421 (prev
->is_cp
== best
->is_cp
&& prev
->is_sdma
== best
->is_sdma
&&
422 prev
->is_cs
== best
->is_cs
&& prev
->cache_policy
== best
->cache_policy
&&
423 prev
->dwords_per_thread
== best
->dwords_per_thread
&&
424 prev
->waves_per_sh
== best
->waves_per_sh
) ||
425 /* If the method for the previous size is also the best
427 (prev_this_size
->is_valid
&& prev_this_size
->score
* 1.03 > best
->score
)) {
428 method_max_size
[prev_index
] = size
;
433 /* Add it to the list. */
434 assert(num_methods
< ARRAY_SIZE(methods
));
435 methods
[num_methods
] = best
;
436 method_max_size
[num_methods
] = size
;
440 for (unsigned i
= 0; i
< num_methods
; i
++) {
441 struct si_result
*best
= methods
[i
];
442 unsigned size
= method_max_size
[i
];
444 /* The size threshold is between the current benchmarked
445 * size and the next benchmarked size. */
446 if (i
< num_methods
- 1)
447 printf(" if (size <= %9u) ", (size
+ (size
<< SIZE_SHIFT
)) / 2);
455 const char *cache_policy_str
=
456 best
->cache_policy
== L2_BYPASS
? "L2_BYPASS" :
457 best
->cache_policy
== L2_LRU
? "L2_LRU " : "L2_STREAM";
460 printf("CP_DMA(%s);\n", cache_policy_str
);
465 printf("COMPUTE(%s, %u, %u);\n", cache_policy_str
,
466 best
->dwords_per_thread
, best
->waves_per_sh
);