2 * Copyright 2018 Advanced Micro Devices, Inc.
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26 /* This file implements tests on the si_clearbuffer function. */
32 #define MAX_SIZE (128 * 1024 * 1024)
/* Convert a transfer of num_bytes completed in ns nanoseconds into a
 * throughput figure in MB/s (mebibytes per second).
 */
static double get_MBps_rate(unsigned num_bytes, unsigned ns)
{
   const double megabytes = num_bytes / (1024.0 * 1024.0);
   const double seconds = ns / 1000000000.0;
   return megabytes / seconds;
}
41 void si_test_dma_perf(struct si_screen
*sscreen
)
43 struct pipe_screen
*screen
= &sscreen
->b
;
44 struct pipe_context
*ctx
= screen
->context_create(screen
, NULL
, 0);
45 struct si_context
*sctx
= (struct si_context
*)ctx
;
46 const uint32_t clear_value
= 0x12345678;
47 static const unsigned cs_dwords_per_thread_list
[] = {64, 32, 16, 8, 4, 2, 1};
48 static const unsigned cs_waves_per_sh_list
[] = {0, 2, 4, 8, 16};
50 #define NUM_SHADERS ARRAY_SIZE(cs_dwords_per_thread_list)
51 #define NUM_METHODS (4 + 3 * NUM_SHADERS * ARRAY_SIZE(cs_waves_per_sh_list))
53 static const char *method_str
[] = {
59 static const char *placement_str
[] = {
69 printf("DMA rate is in MB/s for each size. Slow cases are skipped and print 0.\n");
70 printf("Heap ,Method ,L2p,Wa,");
71 for (unsigned size
= MIN_SIZE
; size
<= MAX_SIZE
; size
<<= SIZE_SHIFT
) {
73 printf("%6uKB,", size
/ 1024);
75 printf(" %6uB,", size
);
79 /* results[log2(size)][placement][method][] */
85 unsigned cache_policy
;
86 unsigned dwords_per_thread
;
87 unsigned waves_per_sh
;
89 unsigned index
; /* index in results[x][y][index] */
90 } results
[32][ARRAY_SIZE(placement_str
)][NUM_METHODS
] = {};
93 for (unsigned placement
= 0; placement
< ARRAY_SIZE(placement_str
); placement
++) {
94 bool is_copy
= placement
>= 2;
96 printf("-----------,--------,---,--,");
97 for (unsigned size
= MIN_SIZE
; size
<= MAX_SIZE
; size
<<= SIZE_SHIFT
)
101 for (unsigned method
= 0; method
< NUM_METHODS
; method
++) {
102 bool test_cp
= method
<= 2;
103 bool test_sdma
= method
== 3;
104 bool test_cs
= method
>= 4;
105 unsigned cs_method
= method
- 4;
106 unsigned cs_waves_per_sh
=
107 test_cs
? cs_waves_per_sh_list
[cs_method
/ (3 * NUM_SHADERS
)] : 0;
108 cs_method
%= 3 * NUM_SHADERS
;
109 unsigned cache_policy
=
110 test_cp
? method
% 3 : test_cs
? (cs_method
/ NUM_SHADERS
) : 0;
111 unsigned cs_dwords_per_thread
=
112 test_cs
? cs_dwords_per_thread_list
[cs_method
% NUM_SHADERS
] : 0;
114 if (test_sdma
&& !sctx
->sdma_cs
)
117 if (sctx
->chip_class
== GFX6
) {
118 /* GFX6 doesn't support CP DMA operations through L2. */
119 if (test_cp
&& cache_policy
!= L2_BYPASS
)
121 /* WAVES_PER_SH is in multiples of 16 on GFX6. */
122 if (test_cs
&& cs_waves_per_sh
% 16 != 0)
126 /* SI_RESOURCE_FLAG_UNCACHED setting RADEON_FLAG_UNCACHED doesn't affect
129 if (test_cs
&& cache_policy
&& sctx
->chip_class
< GFX9
)
132 printf("%s ,", placement_str
[placement
]);
134 printf("CS x%-4u,%3s,", cs_dwords_per_thread
,
135 cache_policy
== L2_LRU
? "LRU" : cache_policy
== L2_STREAM
? "Str" : "");
137 printf("%s,%3s,", method_str
[method
],
138 method
== L2_LRU
? "LRU" : method
== L2_STREAM
? "Str" : "");
140 if (test_cs
&& cs_waves_per_sh
)
141 printf("%2u,", cs_waves_per_sh
);
146 for (unsigned size
= MIN_SIZE
; size
<= MAX_SIZE
; size
<<= SIZE_SHIFT
) {
147 /* Don't test bigger sizes if it's too slow. Print 0. */
148 if (size
>= 512 * 1024 && score
< 400 * (size
/ (4 * 1024 * 1024))) {
149 printf("%7.0f ,", 0.0);
153 enum pipe_resource_usage dst_usage
, src_usage
;
154 struct pipe_resource
*dst
, *src
;
155 struct pipe_query
*q
[NUM_RUNS
];
156 unsigned query_type
= PIPE_QUERY_TIME_ELAPSED
;
157 unsigned flags
= cache_policy
== L2_BYPASS
? SI_RESOURCE_FLAG_UNCACHED
: 0;
160 if (sctx
->chip_class
== GFX6
)
161 query_type
= SI_QUERY_TIME_ELAPSED_SDMA_SI
;
163 query_type
= SI_QUERY_TIME_ELAPSED_SDMA
;
166 if (placement
== 0 || placement
== 2 || placement
== 4)
167 dst_usage
= PIPE_USAGE_DEFAULT
;
169 dst_usage
= PIPE_USAGE_STREAM
;
171 if (placement
== 2 || placement
== 3)
172 src_usage
= PIPE_USAGE_DEFAULT
;
174 src_usage
= PIPE_USAGE_STREAM
;
176 dst
= pipe_aligned_buffer_create(screen
, flags
, dst_usage
, size
, 256);
177 src
= is_copy
? pipe_aligned_buffer_create(screen
, flags
, src_usage
, size
, 256) : NULL
;
180 for (unsigned iter
= 0; iter
< NUM_RUNS
; iter
++) {
181 q
[iter
] = ctx
->create_query(ctx
, query_type
, 0);
182 ctx
->begin_query(ctx
, q
[iter
]);
187 si_cp_dma_copy_buffer(sctx
, dst
, src
, 0, 0, size
, 0, SI_COHERENCY_NONE
,
190 si_cp_dma_clear_buffer(sctx
, sctx
->gfx_cs
, dst
, 0, size
, clear_value
, 0,
191 SI_COHERENCY_NONE
, cache_policy
);
193 } else if (test_sdma
) {
196 si_sdma_copy_buffer(sctx
, dst
, src
, 0, 0, size
);
198 si_sdma_clear_buffer(sctx
, dst
, 0, size
, clear_value
);
202 /* The memory accesses are coalesced, meaning that the 1st instruction writes
203 * the 1st contiguous block of data for the whole wave, the 2nd instruction
204 * writes the 2nd contiguous block of data, etc.
206 unsigned instructions_per_thread
= MAX2(1, cs_dwords_per_thread
/ 4);
207 unsigned dwords_per_instruction
= cs_dwords_per_thread
/ instructions_per_thread
;
208 unsigned dwords_per_wave
= cs_dwords_per_thread
* 64;
210 unsigned num_dwords
= size
/ 4;
211 unsigned num_instructions
= DIV_ROUND_UP(num_dwords
, dwords_per_instruction
);
213 void *cs
= si_create_dma_compute_shader(ctx
, cs_dwords_per_thread
,
214 cache_policy
== L2_STREAM
, is_copy
);
216 struct pipe_grid_info info
= {};
217 info
.block
[0] = MIN2(64, num_instructions
);
220 info
.grid
[0] = DIV_ROUND_UP(num_dwords
, dwords_per_wave
);
224 struct pipe_shader_buffer sb
[2] = {};
226 sb
[0].buffer_size
= size
;
230 sb
[1].buffer_size
= size
;
232 for (unsigned i
= 0; i
< 4; i
++)
233 sctx
->cs_user_data
[i
] = clear_value
;
236 sctx
->flags
|= SI_CONTEXT_INV_VCACHE
| SI_CONTEXT_INV_SCACHE
;
238 ctx
->set_shader_buffers(ctx
, PIPE_SHADER_COMPUTE
, 0, is_copy
? 2 : 1, sb
, 0x1);
239 ctx
->bind_compute_state(ctx
, cs
);
240 sctx
->cs_max_waves_per_sh
= cs_waves_per_sh
;
242 ctx
->launch_grid(ctx
, &info
);
244 ctx
->bind_compute_state(ctx
, NULL
);
245 ctx
->delete_compute_state(ctx
, cs
);
246 sctx
->cs_max_waves_per_sh
= 0; /* disable the limit */
248 sctx
->flags
|= SI_CONTEXT_CS_PARTIAL_FLUSH
;
251 /* Flush L2, so that we don't just test L2 cache performance. */
253 sctx
->flags
|= SI_CONTEXT_WB_L2
;
254 sctx
->emit_cache_flush(sctx
);
257 ctx
->end_query(ctx
, q
[iter
]);
258 ctx
->flush(ctx
, NULL
, PIPE_FLUSH_ASYNC
);
260 pipe_resource_reference(&dst
, NULL
);
261 pipe_resource_reference(&src
, NULL
);
264 uint64_t min
= ~0ull, max
= 0, total
= 0;
266 for (unsigned iter
= 0; iter
< NUM_RUNS
; iter
++) {
267 union pipe_query_result result
;
269 ctx
->get_query_result(ctx
, q
[iter
], true, &result
);
270 ctx
->destroy_query(ctx
, q
[iter
]);
272 min
= MIN2(min
, result
.u64
);
273 max
= MAX2(max
, result
.u64
);
277 score
= get_MBps_rate(size
, total
/ (double)NUM_RUNS
);
278 printf("%7.0f ,", score
);
281 struct si_result
*r
= &results
[util_logbase2(size
)][placement
][method
];
284 r
->is_sdma
= test_sdma
;
286 r
->cache_policy
= cache_policy
;
287 r
->dwords_per_thread
= cs_dwords_per_thread
;
288 r
->waves_per_sh
= cs_waves_per_sh
;
297 puts("static struct si_method");
298 printf("get_best_clear_for_%s(enum radeon_bo_domain dst, uint64_t size64, bool async, bool "
300 sctx
->screen
->info
.name
);
302 puts(" unsigned size = MIN2(size64, UINT_MAX);\n");
304 /* Analyze results and find the best methods. */
305 for (unsigned placement
= 0; placement
< ARRAY_SIZE(placement_str
); placement
++) {
307 puts(" if (dst == RADEON_DOMAIN_VRAM) {");
308 else if (placement
== 1)
309 puts(" } else { /* GTT */");
310 else if (placement
== 2) {
313 puts("static struct si_method");
314 printf("get_best_copy_for_%s(enum radeon_bo_domain dst, enum radeon_bo_domain src,\n",
315 sctx
->screen
->info
.name
);
316 printf(" uint64_t size64, bool async, bool cached)\n");
318 puts(" unsigned size = MIN2(size64, UINT_MAX);\n");
319 puts(" if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_VRAM) {");
320 } else if (placement
== 3)
321 puts(" } else if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_GTT) {");
323 puts(" } else { /* GTT -> VRAM */");
325 for (unsigned mode
= 0; mode
< 3; mode
++) {
326 bool async
= mode
== 0;
327 bool cached
= mode
== 1;
330 puts(" if (async) { /* SDMA or async compute */");
332 puts(" if (cached) { /* gfx ring */");
334 puts(" } else { /* gfx ring - uncached */");
336 /* The list of best chosen methods. */
337 struct si_result
*methods
[32];
338 unsigned method_max_size
[32];
339 unsigned num_methods
= 0;
341 for (unsigned size
= MIN_SIZE
; size
<= MAX_SIZE
; size
<<= SIZE_SHIFT
) {
342 /* Find the best method. */
343 struct si_result
*best
= NULL
;
345 for (unsigned i
= 0; i
< NUM_METHODS
; i
++) {
346 struct si_result
*r
= &results
[util_logbase2(size
)][placement
][i
];
351 /* Ban CP DMA clears via MC on <= GFX8. They are super slow
352 * on GTT, which we can get due to BO evictions.
354 if (sctx
->chip_class
<= GFX8
&& placement
== 1 && r
->is_cp
&&
355 r
->cache_policy
== L2_BYPASS
)
359 /* The following constraints for compute IBs try to limit
360 * resource usage so as not to decrease the performance
361 * of gfx IBs too much.
364 /* Don't use CP DMA on asynchronous rings, because
365 * the engine is shared with gfx IBs.
370 /* Don't use L2 caching on asynchronous rings to minimize
373 if (r
->cache_policy
== L2_LRU
)
376 /* Asynchronous compute recommends waves_per_sh != 0
377 * to limit CU usage. */
378 if (r
->is_cs
&& r
->waves_per_sh
== 0)
381 /* SDMA is always asynchronous */
385 if (cached
&& r
->cache_policy
== L2_BYPASS
)
387 if (!cached
&& r
->cache_policy
== L2_LRU
)
396 /* Assume some measurement error. Earlier methods occupy fewer
397 * resources, so the next method is always more greedy, and we
398 * don't want to select it due to a measurement error.
400 double min_improvement
= 1.03;
402 if (best
->score
* min_improvement
< r
->score
)
406 if (num_methods
> 0) {
407 unsigned prev_index
= num_methods
- 1;
408 struct si_result
*prev
= methods
[prev_index
];
409 struct si_result
*prev_this_size
=
410 &results
[util_logbase2(size
)][placement
][prev
->index
];
412 /* If the best one is also the best for the previous size,
413 * just bump the size for the previous one.
415 * If there is no best, it means all methods were too slow
416 * for this size and were not tested. Use the best one for
420 /* If it's the same method as for the previous size: */
421 (prev
->is_cp
== best
->is_cp
&& prev
->is_sdma
== best
->is_sdma
&&
422 prev
->is_cs
== best
->is_cs
&& prev
->cache_policy
== best
->cache_policy
&&
423 prev
->dwords_per_thread
== best
->dwords_per_thread
&&
424 prev
->waves_per_sh
== best
->waves_per_sh
) ||
425 /* If the method for the previous size is also the best
427 (prev_this_size
->is_valid
&& prev_this_size
->score
* 1.03 > best
->score
)) {
428 method_max_size
[prev_index
] = size
;
433 /* Add it to the list. */
434 assert(num_methods
< ARRAY_SIZE(methods
));
435 methods
[num_methods
] = best
;
436 method_max_size
[num_methods
] = size
;
440 for (unsigned i
= 0; i
< num_methods
; i
++) {
441 struct si_result
*best
= methods
[i
];
442 unsigned size
= method_max_size
[i
];
444 /* The size threshold is between the current benchmarked
445 * size and the next benchmarked size. */
446 if (i
< num_methods
- 1)
447 printf(" if (size <= %9u) ", (size
+ (size
<< SIZE_SHIFT
)) / 2);
455 const char *cache_policy_str
=
456 best
->cache_policy
== L2_BYPASS
? "L2_BYPASS" :
457 best
->cache_policy
== L2_LRU
? "L2_LRU " : "L2_STREAM";
460 printf("CP_DMA(%s);\n", cache_policy_str
);
465 printf("COMPUTE(%s, %u, %u);\n", cache_policy_str
,
466 best
->dwords_per_thread
, best
->waves_per_sh
);