radeonsi: rename and re-document cache flush flags
[mesa.git] / src / gallium / drivers / radeonsi / si_test_dma_perf.c
1 /*
2 * Copyright 2018 Advanced Micro Devices, Inc.
3 * All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 *
24 */
25
/* This file benchmarks buffer clear and copy performance across the available
 * engines (CP DMA, SDMA, and compute shaders). */
27
28 #include "si_pipe.h"
29 #include "si_query.h"
30
31 #define MIN_SIZE 512
32 #define MAX_SIZE (128 * 1024 * 1024)
33 #define SIZE_SHIFT 1
34 #define NUM_RUNS 128
35
/* Convert a transfer of "num_bytes" bytes that took "ns" nanoseconds into a
 * MB/s (mebibytes per second) rate.
 *
 * "ns" is a double: the caller passes an average over many runs
 * (total / (double)NUM_RUNS). Taking it as an unsigned would truncate the
 * fractional part and invoke undefined behavior for averages >= 2^32 ns
 * (~4.29 s); accepting a double is backward-compatible and exact.
 */
static double get_MBps_rate(unsigned num_bytes, double ns)
{
   return (num_bytes / (1024.0 * 1024.0)) / (ns / 1000000000.0);
}
40
/**
 * Benchmark buffer clears and copies through every available engine:
 * CP DMA (with each L2 cache policy), SDMA, and compute shaders (varying
 * dwords-per-thread, cache policy, and waves-per-SH limits).
 *
 * Phase 1 prints a CSV throughput table (MB/s per size, per placement,
 * per method). Phase 2 analyzes the results and prints C-like pseudocode
 * selecting the fastest method for each heap/size/mode combination,
 * intended to be pasted into the driver as a heuristic.
 *
 * Creates its own context and calls exit(0) at the end — never returns.
 */
void si_test_dma_perf(struct si_screen *sscreen)
{
   struct pipe_screen *screen = &sscreen->b;
   struct pipe_context *ctx = screen->context_create(screen, NULL, 0);
   struct si_context *sctx = (struct si_context*)ctx;
   const uint32_t clear_value = 0x12345678;
   /* Compute-shader variants: dwords written per thread per iteration,
    * and the WAVES_PER_SH limits to test (0 = no limit). */
   static const unsigned cs_dwords_per_thread_list[] = {64, 32, 16, 8, 4, 2, 1};
   static const unsigned cs_waves_per_sh_list[] = {1, 2, 4, 8, 16, 0};

#define NUM_SHADERS ARRAY_SIZE(cs_dwords_per_thread_list)
/* Methods 0-2: CP DMA with the 3 cache policies; 3: SDMA;
 * 4+: compute shaders (2 cache policies x NUM_SHADERS x wave limits). */
#define NUM_METHODS (4 + 2*NUM_SHADERS * ARRAY_SIZE(cs_waves_per_sh_list))

   /* Labels for methods 0-3; compute methods print their own label. */
   static const char *method_str[] = {
      "CP MC ",
      "CP L2 ",
      "CP L2 ",
      "SDMA ",
   };
   static const char *placement_str[] = {
      /* Clear */
      "fill->VRAM",
      "fill->GTT ",
      /* Copy */
      "VRAM->VRAM",
      "VRAM->GTT ",
      "GTT ->VRAM",
   };

   printf("DMA rate is in MB/s for each size. Slow cases are skipped and print 0.\n");
   printf("Heap ,Method ,L2p,Wa,");
   for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
      if (size >= 1024)
         printf("%6uKB,", size / 1024);
      else
         printf(" %6uB,", size);
   }
   printf("\n");

   /* results[log2(size)][placement][method][] */
   struct si_result {
      bool is_valid;              /* this combination was actually measured */
      bool is_cp;                 /* CP DMA */
      bool is_sdma;               /* SDMA engine */
      bool is_cs;                 /* compute shader */
      unsigned cache_policy;      /* L2_BYPASS / L2_STREAM / L2_LRU */
      unsigned dwords_per_thread; /* compute only */
      unsigned waves_per_sh;      /* compute only; 0 = unlimited */
      unsigned score;             /* measured MB/s (truncated) */
      unsigned index; /* index in results[x][y][index] */
   } results[32][ARRAY_SIZE(placement_str)][NUM_METHODS] = {};

   /* Run benchmarks. */
   for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) {
      /* Placements 0-1 are clears, 2-4 are copies (see placement_str). */
      bool is_copy = placement >= 2;

      printf("-----------,--------,---,--,");
      for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT)
         printf("--------,");
      printf("\n");

      for (unsigned method = 0; method < NUM_METHODS; method++) {
         /* Decode the flat method index into engine + parameters. */
         bool test_cp = method <= 2;
         bool test_sdma = method == 3;
         bool test_cs = method >= 4;
         unsigned cs_method = method - 4;
         /* The cs_method -> cache_policy decode below relies on
          * L2_STREAM and L2_LRU being consecutive enum values. */
         STATIC_ASSERT(L2_STREAM + 1 == L2_LRU);
         unsigned cs_waves_per_sh =
            test_cs ? cs_waves_per_sh_list[cs_method / (2*NUM_SHADERS)] : 0;
         cs_method %= 2*NUM_SHADERS;
         /* For CP, method 0/1/2 maps directly to the 3 cache policies. */
         unsigned cache_policy = test_cp ? method % 3 :
            test_cs ? L2_STREAM + (cs_method / NUM_SHADERS) : 0;
         unsigned cs_dwords_per_thread =
            test_cs ? cs_dwords_per_thread_list[cs_method % NUM_SHADERS] : 0;

         if (test_sdma && !sctx->dma_cs)
            continue;

         if (sctx->chip_class == GFX6) {
            /* GFX6 doesn't support CP DMA operations through L2. */
            if (test_cp && cache_policy != L2_BYPASS)
               continue;
            /* WAVES_PER_SH is in multiples of 16 on GFX6. */
            if (test_cs && cs_waves_per_sh % 16 != 0)
               continue;
         }

         /* Print the row label (heap, method, cache policy, wave limit). */
         printf("%s ,", placement_str[placement]);
         if (test_cs) {
            printf("CS x%-4u,%3s,", cs_dwords_per_thread,
                   cache_policy == L2_LRU ? "LRU" :
                   cache_policy == L2_STREAM ? "Str" : "");
         } else {
            /* NOTE(review): comparing "method" against cache-policy enums
             * only works because cache_policy == method for methods 0-2;
             * "cache_policy" would be the clearer operand — confirm before
             * changing. */
            printf("%s,%3s,", method_str[method],
                   method == L2_LRU ? "LRU" :
                   method == L2_STREAM ? "Str" : "");
         }
         if (test_cs && cs_waves_per_sh)
            printf("%2u,", cs_waves_per_sh);
         else
            printf(" ,");

         double score = 0;
         for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
            /* Don't test bigger sizes if it's too slow. Print 0. */
            if (size >= 512*1024 &&
                score < 400 * (size / (4*1024*1024))) {
               printf("%7.0f ,", 0.0);
               continue;
            }

            enum pipe_resource_usage dst_usage, src_usage;
            struct pipe_resource *dst, *src;
            struct pipe_query *q[NUM_RUNS];
            unsigned query_type = PIPE_QUERY_TIME_ELAPSED;

            /* SDMA runs on its own ring and needs its own timer queries. */
            if (test_sdma) {
               if (sctx->chip_class == GFX6)
                  query_type = SI_QUERY_TIME_ELAPSED_SDMA_SI;
               else
                  query_type = SI_QUERY_TIME_ELAPSED_SDMA;
            }

            /* DEFAULT = VRAM, STREAM = GTT; see placement_str ordering. */
            if (placement == 0 || placement == 2 || placement == 4)
               dst_usage = PIPE_USAGE_DEFAULT;
            else
               dst_usage = PIPE_USAGE_STREAM;

            if (placement == 2 || placement == 3)
               src_usage = PIPE_USAGE_DEFAULT;
            else
               src_usage = PIPE_USAGE_STREAM;

            dst = pipe_buffer_create(screen, 0, dst_usage, size);
            src = is_copy ? pipe_buffer_create(screen, 0, src_usage, size) : NULL;

            /* Run tests. Each iteration is bracketed by a GPU timer query. */
            for (unsigned iter = 0; iter < NUM_RUNS; iter++) {
               q[iter] = ctx->create_query(ctx, query_type, 0);
               ctx->begin_query(ctx, q[iter]);

               if (test_cp) {
                  /* CP DMA */
                  if (is_copy) {
                     si_cp_dma_copy_buffer(sctx, dst, src, 0, 0, size, 0,
                                           SI_COHERENCY_NONE, cache_policy);
                  } else {
                     si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, 0, size,
                                            clear_value, 0,
                                            SI_COHERENCY_NONE, cache_policy);
                  }
               } else if (test_sdma) {
                  /* SDMA */
                  if (is_copy) {
                     struct pipe_box box;
                     u_box_1d(0, size, &box);
                     sctx->dma_copy(ctx, dst, 0, 0, 0, 0, src, 0, &box);
                  } else {
                     si_sdma_clear_buffer(sctx, dst, 0, size, clear_value);
                  }
               } else {
                  /* Compute */
                  /* The memory accesses are coalesced, meaning that the 1st instruction writes
                   * the 1st contiguous block of data for the whole wave, the 2nd instruction
                   * writes the 2nd contiguous block of data, etc.
                   */
                  unsigned instructions_per_thread = MAX2(1, cs_dwords_per_thread / 4);
                  unsigned dwords_per_instruction = cs_dwords_per_thread / instructions_per_thread;
                  unsigned dwords_per_wave = cs_dwords_per_thread * 64;

                  unsigned num_dwords = size / 4;
                  unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);

                  void *cs = si_create_dma_compute_shader(ctx, cs_dwords_per_thread,
                                                          cache_policy == L2_STREAM, is_copy);

                  struct pipe_grid_info info = {};
                  info.block[0] = MIN2(64, num_instructions);
                  info.block[1] = 1;
                  info.block[2] = 1;
                  info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
                  info.grid[1] = 1;
                  info.grid[2] = 1;

                  struct pipe_shader_buffer sb[2] = {};
                  sb[0].buffer = dst;
                  sb[0].buffer_size = size;

                  if (is_copy) {
                     sb[1].buffer = src;
                     sb[1].buffer_size = size;
                  } else {
                     /* The clear value reaches the shader via user data. */
                     for (unsigned i = 0; i < 4; i++)
                        sctx->cs_user_data[i] = clear_value;
                  }

                  sctx->flags |= SI_CONTEXT_INV_VCACHE |
                                 SI_CONTEXT_INV_SCACHE;

                  ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0,
                                          is_copy ? 2 : 1, sb, 0x1);
                  ctx->bind_compute_state(ctx, cs);
                  sctx->cs_max_waves_per_sh = cs_waves_per_sh;

                  ctx->launch_grid(ctx, &info);

                  ctx->bind_compute_state(ctx, NULL);
                  ctx->delete_compute_state(ctx, cs);
                  sctx->cs_max_waves_per_sh = 0; /* disable the limit */

                  sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
               }

               /* Flush L2, so that we don't just test L2 cache performance. */
               if (!test_sdma) {
                  sctx->flags |= SI_CONTEXT_WB_L2;
                  si_emit_cache_flush(sctx);
               }

               ctx->end_query(ctx, q[iter]);
               ctx->flush(ctx, NULL, PIPE_FLUSH_ASYNC);
            }
            pipe_resource_reference(&dst, NULL);
            pipe_resource_reference(&src, NULL);

            /* Get results. */
            uint64_t min = ~0ull, max = 0, total = 0;

            for (unsigned iter = 0; iter < NUM_RUNS; iter++) {
               union pipe_query_result result;

               ctx->get_query_result(ctx, q[iter], true, &result);
               ctx->destroy_query(ctx, q[iter]);

               min = MIN2(min, result.u64);
               max = MAX2(max, result.u64);
               total += result.u64;
            }

            /* Score = throughput at the average elapsed time per run. */
            score = get_MBps_rate(size, total / (double)NUM_RUNS);
            printf("%7.0f ,", score);
            fflush(stdout);

            struct si_result *r = &results[util_logbase2(size)][placement][method];
            r->is_valid = true;
            r->is_cp = test_cp;
            r->is_sdma = test_sdma;
            r->is_cs = test_cs;
            r->cache_policy = cache_policy;
            r->dwords_per_thread = cs_dwords_per_thread;
            r->waves_per_sh = cs_waves_per_sh;
            r->score = score;
            r->index = method;
         }
         puts("");
      }
   }

   /* Phase 2: emit C-like pseudocode choosing the fastest method per
    * heap/size. The clear function is emitted first, then the copy
    * function once "placement" reaches the copy cases. */
   puts("");
   puts("static struct si_method");
   printf("get_best_clear_for_%s(enum radeon_bo_domain dst, uint64_t size64, bool async, bool cached)\n",
          sctx->screen->info.name);
   puts("{");
   puts(" unsigned size = MIN2(size64, UINT_MAX);\n");

   /* Analyze results and find the best methods. */
   for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) {
      if (placement == 0)
         puts(" if (dst == RADEON_DOMAIN_VRAM) {");
      else if (placement == 1)
         puts(" } else { /* GTT */");
      else if (placement == 2) {
         /* Clears are done; close that function and start the copy one. */
         puts("}");
         puts("");
         puts("static struct si_method");
         printf("get_best_copy_for_%s(enum radeon_bo_domain dst, enum radeon_bo_domain src,\n",
                sctx->screen->info.name);
         printf(" uint64_t size64, bool async, bool cached)\n");
         puts("{");
         puts(" unsigned size = MIN2(size64, UINT_MAX);\n");
         puts(" if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_VRAM) {");
      } else if (placement == 3)
         puts(" } else if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_GTT) {");
      else
         puts(" } else { /* GTT -> VRAM */");

      /* mode 0 = async (SDMA/async compute), 1 = cached gfx, 2 = uncached gfx. */
      for (unsigned mode = 0; mode < 3; mode++) {
         bool async = mode == 0;
         bool cached = mode == 1;

         if (async)
            puts(" if (async) { /* SDMA or async compute */");
         else if (cached)
            puts(" if (cached) { /* gfx ring */");
         else
            puts(" } else { /* gfx ring - uncached */");

         /* The list of best chosen methods. */
         struct si_result *methods[32];
         unsigned method_max_size[32];
         unsigned num_methods = 0;

         for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
            /* Find the best method. */
            struct si_result *best = NULL;

            for (unsigned i = 0; i < NUM_METHODS; i++) {
               struct si_result *r = &results[util_logbase2(size)][placement][i];

               if (!r->is_valid)
                  continue;

               /* Ban CP DMA clears via MC on <= GFX8. They are super slow
                * on GTT, which we can get due to BO evictions.
                */
               if (sctx->chip_class <= GFX8 && placement == 1 &&
                   r->is_cp && r->cache_policy == L2_BYPASS)
                  continue;

               if (async) {
                  /* The following constraints for compute IBs try to limit
                   * resource usage so as not to decrease the performance
                   * of gfx IBs too much.
                   */

                  /* Don't use CP DMA on asynchronous rings, because
                   * the engine is shared with gfx IBs.
                   */
                  if (r->is_cp)
                     continue;

                  /* Don't use L2 caching on asynchronous rings to minimize
                   * L2 usage.
                   */
                  if (r->cache_policy == L2_LRU)
                     continue;

                  /* Asynchronous compute recommends waves_per_sh != 0
                   * to limit CU usage. */
                  if (r->is_cs && r->waves_per_sh == 0)
                     continue;
               } else {
                  /* SDMA is always asynchronous */
                  if (r->is_sdma)
                     continue;

                  if (cached && r->cache_policy == L2_BYPASS)
                     continue;
                  if (!cached && r->cache_policy == L2_LRU)
                     continue;
               }

               if (!best) {
                  best = r;
                  continue;
               }

               /* Assume some measurement error. Earlier methods occupy fewer
                * resources, so the next method is always more greedy, and we
                * don't want to select it due to a measurement error.
                */
               double min_improvement = 1.03;

               if (best->score * min_improvement < r->score)
                  best = r;
            }

            if (num_methods > 0) {
               unsigned prev_index = num_methods - 1;
               struct si_result *prev = methods[prev_index];
               struct si_result *prev_this_size = &results[util_logbase2(size)][placement][prev->index];

               /* If the best one is also the best for the previous size,
                * just bump the size for the previous one.
                *
                * If there is no best, it means all methods were too slow
                * for this size and were not tested. Use the best one for
                * the previous size.
                */
               if (!best ||
                   /* If it's the same method as for the previous size: */
                   (prev->is_cp == best->is_cp &&
                    prev->is_sdma == best->is_sdma &&
                    prev->is_cs == best->is_cs &&
                    prev->cache_policy == best->cache_policy &&
                    prev->dwords_per_thread == best->dwords_per_thread &&
                    prev->waves_per_sh == best->waves_per_sh) ||
                   /* If the method for the previous size is also the best
                    * for this size: */
                   (prev_this_size->is_valid &&
                    prev_this_size->score * 1.03 > best->score)) {
                  method_max_size[prev_index] = size;
                  continue;
               }
            }

            /* Add it to the list. */
            assert(num_methods < ARRAY_SIZE(methods));
            methods[num_methods] = best;
            method_max_size[num_methods] = size;
            num_methods++;
         }

         for (unsigned i = 0; i < num_methods; i++) {
            struct si_result *best = methods[i];
            unsigned size = method_max_size[i];

            /* The size threshold is between the current benchmarked
             * size and the next benchmarked size. */
            if (i < num_methods - 1)
               printf(" if (size <= %9u) ", (size + (size << SIZE_SHIFT)) / 2);
            else if (i > 0)
               printf(" else ");
            else
               printf(" ");
            printf("return ");

            assert(best);
            if (best->is_cp) {
               printf("CP_DMA(%s);\n",
                      best->cache_policy == L2_BYPASS ? "L2_BYPASS" :
                      best->cache_policy == L2_LRU ? "L2_LRU " : "L2_STREAM");
            }
            if (best->is_sdma)
               printf("SDMA;\n");
            if (best->is_cs) {
               printf("COMPUTE(%s, %u, %u);\n",
                      best->cache_policy == L2_LRU ? "L2_LRU " : "L2_STREAM",
                      best->dwords_per_thread,
                      best->waves_per_sh);
            }
         }
      }
      puts(" }");
   }
   puts(" }");
   puts("}");

   ctx->destroy(ctx);
   exit(0);
}