radeonsi: rename and re-document cache flush flags
[mesa.git] / src / gallium / drivers / radeonsi / si_test_dma_perf.c
1 /*
2 * Copyright 2018 Advanced Micro Devices, Inc.
3 * All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 *
24 */
25
/* This file benchmarks buffer clear and copy performance across the available
 * engines (CP DMA, SDMA, and compute shaders). */
27
28 #include "si_pipe.h"
29 #include "si_query.h"
30
31 #define MIN_SIZE 512
32 #define MAX_SIZE (128 * 1024 * 1024)
33 #define SIZE_SHIFT 1
34 #define NUM_RUNS 128
35
/* Convert a transfer of "num_bytes" bytes that took "ns" nanoseconds into a
 * MB/s (mebibytes per second) rate.
 *
 * "ns" is a double: the caller passes an average over many runs
 * (total / (double)NUM_RUNS). Taking it as an unsigned would truncate the
 * fractional part and invoke undefined behavior for averages >= 2^32 ns
 * (~4.29 s); accepting a double is backward-compatible and exact.
 */
static double get_MBps_rate(unsigned num_bytes, double ns)
{
   return (num_bytes / (1024.0 * 1024.0)) / (ns / 1000000000.0);
}
40
/**
 * Benchmark buffer clears and copies through every available engine:
 * CP DMA (with each L2 cache policy), SDMA, and compute shaders (varying
 * dwords-per-thread, cache policy, and waves-per-SH limits).
 *
 * Phase 1 prints a CSV throughput table (MB/s per size, per placement,
 * per method). Phase 2 analyzes the results and prints C-like pseudocode
 * selecting the fastest method for each heap/size/mode combination,
 * intended to be pasted into the driver as a heuristic.
 *
 * Creates its own context and calls exit(0) at the end — never returns.
 */
void si_test_dma_perf(struct si_screen *sscreen)
{
   struct pipe_screen *screen = &sscreen->b;
   struct pipe_context *ctx = screen->context_create(screen, NULL, 0);
   struct si_context *sctx = (struct si_context*)ctx;
   const uint32_t clear_value = 0x12345678;
   /* Compute-shader variants: dwords written per thread per iteration,
    * and the WAVES_PER_SH limits to test (0 = no limit). */
   static const unsigned cs_dwords_per_thread_list[] = {64, 32, 16, 8, 4, 2, 1};
   static const unsigned cs_waves_per_sh_list[] = {1, 2, 4, 8, 16, 0};

#define NUM_SHADERS ARRAY_SIZE(cs_dwords_per_thread_list)
/* Methods 0-2: CP DMA with the 3 cache policies; 3: SDMA;
 * 4+: compute shaders (2 cache policies x NUM_SHADERS x wave limits). */
#define NUM_METHODS (4 + 2*NUM_SHADERS * ARRAY_SIZE(cs_waves_per_sh_list))

   /* Labels for methods 0-3; compute methods print their own label. */
   static const char *method_str[] = {
      "CP MC ",
      "CP L2 ",
      "CP L2 ",
      "SDMA ",
   };
   static const char *placement_str[] = {
      /* Clear */
      "fill->VRAM",
      "fill->GTT ",
      /* Copy */
      "VRAM->VRAM",
      "VRAM->GTT ",
      "GTT ->VRAM",
   };

   printf("DMA rate is in MB/s for each size. Slow cases are skipped and print 0.\n");
   printf("Heap ,Method ,L2p,Wa,");
   for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
      if (size >= 1024)
         printf("%6uKB,", size / 1024);
      else
         printf(" %6uB,", size);
   }
   printf("\n");

   /* results[log2(size)][placement][method][] */
   struct si_result {
      bool is_valid;              /* this combination was actually measured */
      bool is_cp;                 /* CP DMA */
      bool is_sdma;               /* SDMA engine */
      bool is_cs;                 /* compute shader */
      unsigned cache_policy;      /* L2_BYPASS / L2_STREAM / L2_LRU */
      unsigned dwords_per_thread; /* compute only */
      unsigned waves_per_sh;      /* compute only; 0 = unlimited */
      unsigned score;             /* measured MB/s (truncated) */
      unsigned index; /* index in results[x][y][index] */
   } results[32][ARRAY_SIZE(placement_str)][NUM_METHODS] = {};

   /* Run benchmarks. */
   for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) {
      /* Placements 0-1 are clears, 2-4 are copies (see placement_str). */
      bool is_copy = placement >= 2;

      printf("-----------,--------,---,--,");
      for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT)
         printf("--------,");
      printf("\n");

      for (unsigned method = 0; method < NUM_METHODS; method++) {
         /* Decode the flat method index into engine + parameters. */
         bool test_cp = method <= 2;
         bool test_sdma = method == 3;
         bool test_cs = method >= 4;
         unsigned cs_method = method - 4;
         /* The cs_method -> cache_policy decode below relies on
          * L2_STREAM and L2_LRU being consecutive enum values. */
         STATIC_ASSERT(L2_STREAM + 1 == L2_LRU);
         unsigned cs_waves_per_sh =
            test_cs ? cs_waves_per_sh_list[cs_method / (2*NUM_SHADERS)] : 0;
         cs_method %= 2*NUM_SHADERS;
         /* For CP, method 0/1/2 maps directly to the 3 cache policies. */
         unsigned cache_policy = test_cp ? method % 3 :
            test_cs ? L2_STREAM + (cs_method / NUM_SHADERS) : 0;
         unsigned cs_dwords_per_thread =
            test_cs ? cs_dwords_per_thread_list[cs_method % NUM_SHADERS] : 0;

         if (test_sdma && !sctx->dma_cs)
            continue;

         if (sctx->chip_class == GFX6) {
            /* GFX6 doesn't support CP DMA operations through L2. */
            if (test_cp && cache_policy != L2_BYPASS)
               continue;
            /* WAVES_PER_SH is in multiples of 16 on GFX6. */
            if (test_cs && cs_waves_per_sh % 16 != 0)
               continue;
         }

         /* Print the row label (heap, method, cache policy, wave limit). */
         printf("%s ,", placement_str[placement]);
         if (test_cs) {
            printf("CS x%-4u,%3s,", cs_dwords_per_thread,
                   cache_policy == L2_LRU ? "LRU" :
                   cache_policy == L2_STREAM ? "Str" : "");
         } else {
            /* NOTE(review): comparing "method" against cache-policy enums
             * only works because cache_policy == method for methods 0-2;
             * "cache_policy" would be the clearer operand — confirm before
             * changing. */
            printf("%s,%3s,", method_str[method],
                   method == L2_LRU ? "LRU" :
                   method == L2_STREAM ? "Str" : "");
         }
         if (test_cs && cs_waves_per_sh)
            printf("%2u,", cs_waves_per_sh);
         else
            printf(" ,");

         double score = 0;
         for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
            /* Don't test bigger sizes if it's too slow. Print 0. */
            if (size >= 512*1024 &&
                score < 400 * (size / (4*1024*1024))) {
               printf("%7.0f ,", 0.0);
               continue;
            }

            enum pipe_resource_usage dst_usage, src_usage;
            struct pipe_resource *dst, *src;
            struct pipe_query *q[NUM_RUNS];
            unsigned query_type = PIPE_QUERY_TIME_ELAPSED;

            /* SDMA runs on its own ring and needs its own timer queries. */
            if (test_sdma) {
               if (sctx->chip_class == GFX6)
                  query_type = SI_QUERY_TIME_ELAPSED_SDMA_SI;
               else
                  query_type = SI_QUERY_TIME_ELAPSED_SDMA;
            }

            /* DEFAULT = VRAM, STREAM = GTT; see placement_str ordering. */
            if (placement == 0 || placement == 2 || placement == 4)
               dst_usage = PIPE_USAGE_DEFAULT;
            else
               dst_usage = PIPE_USAGE_STREAM;

            if (placement == 2 || placement == 3)
               src_usage = PIPE_USAGE_DEFAULT;
            else
               src_usage = PIPE_USAGE_STREAM;

            dst = pipe_buffer_create(screen, 0, dst_usage, size);
            src = is_copy ? pipe_buffer_create(screen, 0, src_usage, size) : NULL;

            /* Run tests. Each iteration is bracketed by a GPU timer query. */
            for (unsigned iter = 0; iter < NUM_RUNS; iter++) {
               q[iter] = ctx->create_query(ctx, query_type, 0);
               ctx->begin_query(ctx, q[iter]);

               if (test_cp) {
                  /* CP DMA */
                  if (is_copy) {
                     si_cp_dma_copy_buffer(sctx, dst, src, 0, 0, size, 0,
                                           SI_COHERENCY_NONE, cache_policy);
                  } else {
                     si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, 0, size,
                                            clear_value, 0,
                                            SI_COHERENCY_NONE, cache_policy);
                  }
               } else if (test_sdma) {
                  /* SDMA */
                  if (is_copy) {
                     struct pipe_box box;
                     u_box_1d(0, size, &box);
                     sctx->dma_copy(ctx, dst, 0, 0, 0, 0, src, 0, &box);
                  } else {
                     si_sdma_clear_buffer(sctx, dst, 0, size, clear_value);
                  }
               } else {
                  /* Compute */
                  /* The memory accesses are coalesced, meaning that the 1st instruction writes
                   * the 1st contiguous block of data for the whole wave, the 2nd instruction
                   * writes the 2nd contiguous block of data, etc.
                   */
                  unsigned instructions_per_thread = MAX2(1, cs_dwords_per_thread / 4);
                  unsigned dwords_per_instruction = cs_dwords_per_thread / instructions_per_thread;
                  unsigned dwords_per_wave = cs_dwords_per_thread * 64;

                  unsigned num_dwords = size / 4;
                  unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);

                  void *cs = si_create_dma_compute_shader(ctx, cs_dwords_per_thread,
                                                          cache_policy == L2_STREAM, is_copy);

                  struct pipe_grid_info info = {};
                  info.block[0] = MIN2(64, num_instructions);
                  info.block[1] = 1;
                  info.block[2] = 1;
                  info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
                  info.grid[1] = 1;
                  info.grid[2] = 1;

                  struct pipe_shader_buffer sb[2] = {};
                  sb[0].buffer = dst;
                  sb[0].buffer_size = size;

                  if (is_copy) {
                     sb[1].buffer = src;
                     sb[1].buffer_size = size;
                  } else {
                     /* The clear value reaches the shader via user data. */
                     for (unsigned i = 0; i < 4; i++)
                        sctx->cs_user_data[i] = clear_value;
                  }

                  sctx->flags |= SI_CONTEXT_INV_VCACHE |
                                 SI_CONTEXT_INV_SCACHE;

                  ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0,
                                          is_copy ? 2 : 1, sb, 0x1);
                  ctx->bind_compute_state(ctx, cs);
                  sctx->cs_max_waves_per_sh = cs_waves_per_sh;

                  ctx->launch_grid(ctx, &info);

                  ctx->bind_compute_state(ctx, NULL);
                  ctx->delete_compute_state(ctx, cs);
                  sctx->cs_max_waves_per_sh = 0; /* disable the limit */

                  sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
               }

               /* Flush L2, so that we don't just test L2 cache performance. */
               if (!test_sdma) {
                  sctx->flags |= SI_CONTEXT_WB_L2;
                  si_emit_cache_flush(sctx);
               }

               ctx->end_query(ctx, q[iter]);
               ctx->flush(ctx, NULL, PIPE_FLUSH_ASYNC);
            }
            pipe_resource_reference(&dst, NULL);
            pipe_resource_reference(&src, NULL);

            /* Get results. */
            uint64_t min = ~0ull, max = 0, total = 0;

            for (unsigned iter = 0; iter < NUM_RUNS; iter++) {
               union pipe_query_result result;

               ctx->get_query_result(ctx, q[iter], true, &result);
               ctx->destroy_query(ctx, q[iter]);

               min = MIN2(min, result.u64);
               max = MAX2(max, result.u64);
               total += result.u64;
            }

            /* Score = throughput at the average elapsed time per run. */
            score = get_MBps_rate(size, total / (double)NUM_RUNS);
            printf("%7.0f ,", score);
            fflush(stdout);

            struct si_result *r = &results[util_logbase2(size)][placement][method];
            r->is_valid = true;
            r->is_cp = test_cp;
            r->is_sdma = test_sdma;
            r->is_cs = test_cs;
            r->cache_policy = cache_policy;
            r->dwords_per_thread = cs_dwords_per_thread;
            r->waves_per_sh = cs_waves_per_sh;
            r->score = score;
            r->index = method;
         }
         puts("");
      }
   }

   /* Phase 2: emit C-like pseudocode choosing the fastest method per
    * heap/size. The clear function is emitted first, then the copy
    * function once "placement" reaches the copy cases. */
   puts("");
   puts("static struct si_method");
   printf("get_best_clear_for_%s(enum radeon_bo_domain dst, uint64_t size64, bool async, bool cached)\n",
          sctx->screen->info.name);
   puts("{");
   puts(" unsigned size = MIN2(size64, UINT_MAX);\n");

   /* Analyze results and find the best methods. */
   for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) {
      if (placement == 0)
         puts(" if (dst == RADEON_DOMAIN_VRAM) {");
      else if (placement == 1)
         puts(" } else { /* GTT */");
      else if (placement == 2) {
         /* Clears are done; close that function and start the copy one. */
         puts("}");
         puts("");
         puts("static struct si_method");
         printf("get_best_copy_for_%s(enum radeon_bo_domain dst, enum radeon_bo_domain src,\n",
                sctx->screen->info.name);
         printf(" uint64_t size64, bool async, bool cached)\n");
         puts("{");
         puts(" unsigned size = MIN2(size64, UINT_MAX);\n");
         puts(" if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_VRAM) {");
      } else if (placement == 3)
         puts(" } else if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_GTT) {");
      else
         puts(" } else { /* GTT -> VRAM */");

      /* mode 0 = async (SDMA/async compute), 1 = cached gfx, 2 = uncached gfx. */
      for (unsigned mode = 0; mode < 3; mode++) {
         bool async = mode == 0;
         bool cached = mode == 1;

         if (async)
            puts(" if (async) { /* SDMA or async compute */");
         else if (cached)
            puts(" if (cached) { /* gfx ring */");
         else
            puts(" } else { /* gfx ring - uncached */");

         /* The list of best chosen methods. */
         struct si_result *methods[32];
         unsigned method_max_size[32];
         unsigned num_methods = 0;

         for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
            /* Find the best method. */
            struct si_result *best = NULL;

            for (unsigned i = 0; i < NUM_METHODS; i++) {
               struct si_result *r = &results[util_logbase2(size)][placement][i];

               if (!r->is_valid)
                  continue;

               /* Ban CP DMA clears via MC on <= GFX8. They are super slow
                * on GTT, which we can get due to BO evictions.
                */
               if (sctx->chip_class <= GFX8 && placement == 1 &&
                   r->is_cp && r->cache_policy == L2_BYPASS)
                  continue;

               if (async) {
                  /* The following constraints for compute IBs try to limit
                   * resource usage so as not to decrease the performance
                   * of gfx IBs too much.
                   */

                  /* Don't use CP DMA on asynchronous rings, because
                   * the engine is shared with gfx IBs.
                   */
                  if (r->is_cp)
                     continue;

                  /* Don't use L2 caching on asynchronous rings to minimize
                   * L2 usage.
                   */
                  if (r->cache_policy == L2_LRU)
                     continue;

                  /* Asynchronous compute recommends waves_per_sh != 0
                   * to limit CU usage. */
                  if (r->is_cs && r->waves_per_sh == 0)
                     continue;
               } else {
                  /* SDMA is always asynchronous */
                  if (r->is_sdma)
                     continue;

                  if (cached && r->cache_policy == L2_BYPASS)
                     continue;
                  if (!cached && r->cache_policy == L2_LRU)
                     continue;
               }

               if (!best) {
                  best = r;
                  continue;
               }

               /* Assume some measurement error. Earlier methods occupy fewer
                * resources, so the next method is always more greedy, and we
                * don't want to select it due to a measurement error.
                */
               double min_improvement = 1.03;

               if (best->score * min_improvement < r->score)
                  best = r;
            }

            if (num_methods > 0) {
               unsigned prev_index = num_methods - 1;
               struct si_result *prev = methods[prev_index];
               struct si_result *prev_this_size = &results[util_logbase2(size)][placement][prev->index];

               /* If the best one is also the best for the previous size,
                * just bump the size for the previous one.
                *
                * If there is no best, it means all methods were too slow
                * for this size and were not tested. Use the best one for
                * the previous size.
                */
               if (!best ||
                   /* If it's the same method as for the previous size: */
                   (prev->is_cp == best->is_cp &&
                    prev->is_sdma == best->is_sdma &&
                    prev->is_cs == best->is_cs &&
                    prev->cache_policy == best->cache_policy &&
                    prev->dwords_per_thread == best->dwords_per_thread &&
                    prev->waves_per_sh == best->waves_per_sh) ||
                   /* If the method for the previous size is also the best
                    * for this size: */
                   (prev_this_size->is_valid &&
                    prev_this_size->score * 1.03 > best->score)) {
                  method_max_size[prev_index] = size;
                  continue;
               }
            }

            /* Add it to the list. */
            assert(num_methods < ARRAY_SIZE(methods));
            methods[num_methods] = best;
            method_max_size[num_methods] = size;
            num_methods++;
         }

         for (unsigned i = 0; i < num_methods; i++) {
            struct si_result *best = methods[i];
            unsigned size = method_max_size[i];

            /* The size threshold is between the current benchmarked
             * size and the next benchmarked size. */
            if (i < num_methods - 1)
               printf(" if (size <= %9u) ", (size + (size << SIZE_SHIFT)) / 2);
            else if (i > 0)
               printf(" else ");
            else
               printf(" ");
            printf("return ");

            assert(best);
            if (best->is_cp) {
               printf("CP_DMA(%s);\n",
                      best->cache_policy == L2_BYPASS ? "L2_BYPASS" :
                      best->cache_policy == L2_LRU ? "L2_LRU " : "L2_STREAM");
            }
            if (best->is_sdma)
               printf("SDMA;\n");
            if (best->is_cs) {
               printf("COMPUTE(%s, %u, %u);\n",
                      best->cache_policy == L2_LRU ? "L2_LRU " : "L2_STREAM",
                      best->dwords_per_thread,
                      best->waves_per_sh);
            }
         }
      }
      puts(" }");
   }
   puts(" }");
   puts("}");

   ctx->destroy(ctx);
   exit(0);
}