/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
/* This file implements a performance test of the buffer clear and copy (DMA) methods. */
/* Largest buffer size that is benchmarked, in bytes (128 MiB); used as the
 * inclusive upper bound of the per-size loops. */
#define MAX_SIZE (128 * 1024 * 1024)
/**
 * Convert a transfer of \p num_bytes bytes that took \p ns nanoseconds into
 * a rate in MB/s, where 1 MB = 1024 * 1024 bytes.
 *
 * \param num_bytes  number of bytes transferred
 * \param ns         elapsed time in nanoseconds
 * \return the rate in MB/s, or 0 if \p ns is 0 (no usable measurement)
 */
static double get_MBps_rate(unsigned num_bytes, unsigned ns)
{
	/* A zero elapsed time would divide by zero and produce inf/NaN,
	 * which would then poison every later score comparison. */
	if (ns == 0)
		return 0.0;

	const double mbytes = num_bytes / (1024.0 * 1024.0);
	const double seconds = ns / 1000000000.0;

	return mbytes / seconds;
}
41 void si_test_dma_perf(struct si_screen
*sscreen
)
43 struct pipe_screen
*screen
= &sscreen
->b
;
44 struct pipe_context
*ctx
= screen
->context_create(screen
, NULL
, 0);
45 struct si_context
*sctx
= (struct si_context
*)ctx
;
46 const uint32_t clear_value
= 0x12345678;
47 static const unsigned cs_dwords_per_thread_list
[] = {64, 32, 16, 8, 4, 2, 1};
48 static const unsigned cs_waves_per_sh_list
[] = {1, 2, 4, 8, 16, 0};
50 #define NUM_SHADERS ARRAY_SIZE(cs_dwords_per_thread_list)
51 #define NUM_METHODS (4 + 2*NUM_SHADERS * ARRAY_SIZE(cs_waves_per_sh_list))
53 static const char *method_str
[] = {
59 static const char *placement_str
[] = {
69 printf("DMA rate is in MB/s for each size. Slow cases are skipped and print 0.\n");
70 printf("Heap ,Method ,L2p,Wa,");
71 for (unsigned size
= MIN_SIZE
; size
<= MAX_SIZE
; size
<<= SIZE_SHIFT
) {
73 printf("%6uKB,", size
/ 1024);
75 printf(" %6uB,", size
);
79 /* results[log2(size)][placement][method][] */
85 unsigned cache_policy
;
86 unsigned dwords_per_thread
;
87 unsigned waves_per_sh
;
89 unsigned index
; /* index in results[x][y][index] */
90 } results
[32][ARRAY_SIZE(placement_str
)][NUM_METHODS
] = {};
93 for (unsigned placement
= 0; placement
< ARRAY_SIZE(placement_str
); placement
++) {
94 bool is_copy
= placement
>= 2;
96 printf("-----------,--------,---,--,");
97 for (unsigned size
= MIN_SIZE
; size
<= MAX_SIZE
; size
<<= SIZE_SHIFT
)
101 for (unsigned method
= 0; method
< NUM_METHODS
; method
++) {
102 bool test_cp
= method
<= 2;
103 bool test_sdma
= method
== 3;
104 bool test_cs
= method
>= 4;
105 unsigned cs_method
= method
- 4;
106 STATIC_ASSERT(L2_STREAM
+ 1 == L2_LRU
);
107 unsigned cs_waves_per_sh
=
108 test_cs
? cs_waves_per_sh_list
[cs_method
/ (2*NUM_SHADERS
)] : 0;
109 cs_method
%= 2*NUM_SHADERS
;
110 unsigned cache_policy
= test_cp
? method
% 3 :
111 test_cs
? L2_STREAM
+ (cs_method
/ NUM_SHADERS
) : 0;
112 unsigned cs_dwords_per_thread
=
113 test_cs
? cs_dwords_per_thread_list
[cs_method
% NUM_SHADERS
] : 0;
115 if (test_sdma
&& !sctx
->dma_cs
)
118 if (sctx
->chip_class
== GFX6
) {
119 /* GFX6 doesn't support CP DMA operations through L2. */
120 if (test_cp
&& cache_policy
!= L2_BYPASS
)
122 /* WAVES_PER_SH is in multiples of 16 on GFX6. */
123 if (test_cs
&& cs_waves_per_sh
% 16 != 0)
127 printf("%s ,", placement_str
[placement
]);
129 printf("CS x%-4u,%3s,", cs_dwords_per_thread
,
130 cache_policy
== L2_LRU
? "LRU" :
131 cache_policy
== L2_STREAM
? "Str" : "");
133 printf("%s,%3s,", method_str
[method
],
134 method
== L2_LRU
? "LRU" :
135 method
== L2_STREAM
? "Str" : "");
137 if (test_cs
&& cs_waves_per_sh
)
138 printf("%2u,", cs_waves_per_sh
);
143 for (unsigned size
= MIN_SIZE
; size
<= MAX_SIZE
; size
<<= SIZE_SHIFT
) {
144 /* Don't test bigger sizes if it's too slow. Print 0. */
145 if (size
>= 512*1024 &&
146 score
< 400 * (size
/ (4*1024*1024))) {
147 printf("%7.0f ,", 0.0);
151 enum pipe_resource_usage dst_usage
, src_usage
;
152 struct pipe_resource
*dst
, *src
;
153 struct pipe_query
*q
[NUM_RUNS
];
154 unsigned query_type
= PIPE_QUERY_TIME_ELAPSED
;
157 if (sctx
->chip_class
== GFX6
)
158 query_type
= SI_QUERY_TIME_ELAPSED_SDMA_SI
;
160 query_type
= SI_QUERY_TIME_ELAPSED_SDMA
;
163 if (placement
== 0 || placement
== 2 || placement
== 4)
164 dst_usage
= PIPE_USAGE_DEFAULT
;
166 dst_usage
= PIPE_USAGE_STREAM
;
168 if (placement
== 2 || placement
== 3)
169 src_usage
= PIPE_USAGE_DEFAULT
;
171 src_usage
= PIPE_USAGE_STREAM
;
173 dst
= pipe_buffer_create(screen
, 0, dst_usage
, size
);
174 src
= is_copy
? pipe_buffer_create(screen
, 0, src_usage
, size
) : NULL
;
177 for (unsigned iter
= 0; iter
< NUM_RUNS
; iter
++) {
178 q
[iter
] = ctx
->create_query(ctx
, query_type
, 0);
179 ctx
->begin_query(ctx
, q
[iter
]);
184 si_cp_dma_copy_buffer(sctx
, dst
, src
, 0, 0, size
, 0,
185 SI_COHERENCY_NONE
, cache_policy
);
187 si_cp_dma_clear_buffer(sctx
, sctx
->gfx_cs
, dst
, 0, size
,
189 SI_COHERENCY_NONE
, cache_policy
);
191 } else if (test_sdma
) {
195 u_box_1d(0, size
, &box
);
196 sctx
->dma_copy(ctx
, dst
, 0, 0, 0, 0, src
, 0, &box
);
198 si_sdma_clear_buffer(sctx
, dst
, 0, size
, clear_value
);
202 /* The memory accesses are coalesced, meaning that the 1st instruction writes
203 * the 1st contiguous block of data for the whole wave, the 2nd instruction
204 * writes the 2nd contiguous block of data, etc.
206 unsigned instructions_per_thread
= MAX2(1, cs_dwords_per_thread
/ 4);
207 unsigned dwords_per_instruction
= cs_dwords_per_thread
/ instructions_per_thread
;
208 unsigned dwords_per_wave
= cs_dwords_per_thread
* 64;
210 unsigned num_dwords
= size
/ 4;
211 unsigned num_instructions
= DIV_ROUND_UP(num_dwords
, dwords_per_instruction
);
213 void *cs
= si_create_dma_compute_shader(ctx
, cs_dwords_per_thread
,
214 cache_policy
== L2_STREAM
, is_copy
);
216 struct pipe_grid_info info
= {};
217 info
.block
[0] = MIN2(64, num_instructions
);
220 info
.grid
[0] = DIV_ROUND_UP(num_dwords
, dwords_per_wave
);
224 struct pipe_shader_buffer sb
[2] = {};
226 sb
[0].buffer_size
= size
;
230 sb
[1].buffer_size
= size
;
232 for (unsigned i
= 0; i
< 4; i
++)
233 sctx
->cs_user_data
[i
] = clear_value
;
236 sctx
->flags
|= SI_CONTEXT_INV_VCACHE
|
237 SI_CONTEXT_INV_SCACHE
;
239 ctx
->set_shader_buffers(ctx
, PIPE_SHADER_COMPUTE
, 0,
240 is_copy
? 2 : 1, sb
, 0x1);
241 ctx
->bind_compute_state(ctx
, cs
);
242 sctx
->cs_max_waves_per_sh
= cs_waves_per_sh
;
244 ctx
->launch_grid(ctx
, &info
);
246 ctx
->bind_compute_state(ctx
, NULL
);
247 ctx
->delete_compute_state(ctx
, cs
);
248 sctx
->cs_max_waves_per_sh
= 0; /* disable the limit */
250 sctx
->flags
|= SI_CONTEXT_CS_PARTIAL_FLUSH
;
253 /* Flush L2, so that we don't just test L2 cache performance. */
255 sctx
->flags
|= SI_CONTEXT_WB_L2
;
256 sctx
->emit_cache_flush(sctx
);
259 ctx
->end_query(ctx
, q
[iter
]);
260 ctx
->flush(ctx
, NULL
, PIPE_FLUSH_ASYNC
);
262 pipe_resource_reference(&dst
, NULL
);
263 pipe_resource_reference(&src
, NULL
);
266 uint64_t min
= ~0ull, max
= 0, total
= 0;
268 for (unsigned iter
= 0; iter
< NUM_RUNS
; iter
++) {
269 union pipe_query_result result
;
271 ctx
->get_query_result(ctx
, q
[iter
], true, &result
);
272 ctx
->destroy_query(ctx
, q
[iter
]);
274 min
= MIN2(min
, result
.u64
);
275 max
= MAX2(max
, result
.u64
);
279 score
= get_MBps_rate(size
, total
/ (double)NUM_RUNS
);
280 printf("%7.0f ,", score
);
283 struct si_result
*r
= &results
[util_logbase2(size
)][placement
][method
];
286 r
->is_sdma
= test_sdma
;
288 r
->cache_policy
= cache_policy
;
289 r
->dwords_per_thread
= cs_dwords_per_thread
;
290 r
->waves_per_sh
= cs_waves_per_sh
;
299 puts("static struct si_method");
300 printf("get_best_clear_for_%s(enum radeon_bo_domain dst, uint64_t size64, bool async, bool cached)\n",
301 sctx
->screen
->info
.name
);
303 puts(" unsigned size = MIN2(size64, UINT_MAX);\n");
305 /* Analyze results and find the best methods. */
306 for (unsigned placement
= 0; placement
< ARRAY_SIZE(placement_str
); placement
++) {
308 puts(" if (dst == RADEON_DOMAIN_VRAM) {");
309 else if (placement
== 1)
310 puts(" } else { /* GTT */");
311 else if (placement
== 2) {
314 puts("static struct si_method");
315 printf("get_best_copy_for_%s(enum radeon_bo_domain dst, enum radeon_bo_domain src,\n",
316 sctx
->screen
->info
.name
);
317 printf(" uint64_t size64, bool async, bool cached)\n");
319 puts(" unsigned size = MIN2(size64, UINT_MAX);\n");
320 puts(" if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_VRAM) {");
321 } else if (placement
== 3)
322 puts(" } else if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_GTT) {");
324 puts(" } else { /* GTT -> VRAM */");
326 for (unsigned mode
= 0; mode
< 3; mode
++) {
327 bool async
= mode
== 0;
328 bool cached
= mode
== 1;
331 puts(" if (async) { /* SDMA or async compute */");
333 puts(" if (cached) { /* gfx ring */");
335 puts(" } else { /* gfx ring - uncached */");
337 /* The list of best chosen methods. */
338 struct si_result
*methods
[32];
339 unsigned method_max_size
[32];
340 unsigned num_methods
= 0;
342 for (unsigned size
= MIN_SIZE
; size
<= MAX_SIZE
; size
<<= SIZE_SHIFT
) {
343 /* Find the best method. */
344 struct si_result
*best
= NULL
;
346 for (unsigned i
= 0; i
< NUM_METHODS
; i
++) {
347 struct si_result
*r
= &results
[util_logbase2(size
)][placement
][i
];
352 /* Ban CP DMA clears via MC on <= GFX8. They are super slow
353 * on GTT, which we can get due to BO evictions.
355 if (sctx
->chip_class
<= GFX8
&& placement
== 1 &&
356 r
->is_cp
&& r
->cache_policy
== L2_BYPASS
)
360 /* The following constraints for compute IBs try to limit
361 * resource usage so as not to decrease the performance
362 * of gfx IBs too much.
365 /* Don't use CP DMA on asynchronous rings, because
366 * the engine is shared with gfx IBs.
371 /* Don't use L2 caching on asynchronous rings to minimize
374 if (r
->cache_policy
== L2_LRU
)
377 /* Asynchronous compute recommends waves_per_sh != 0
378 * to limit CU usage. */
379 if (r
->is_cs
&& r
->waves_per_sh
== 0)
382 /* SDMA is always asynchronous */
386 if (cached
&& r
->cache_policy
== L2_BYPASS
)
388 if (!cached
&& r
->cache_policy
== L2_LRU
)
397 /* Assume some measurement error. Earlier methods occupy fewer
398 * resources, so the next method is always more greedy, and we
399 * don't want to select it due to a measurement error.
401 double min_improvement
= 1.03;
403 if (best
->score
* min_improvement
< r
->score
)
407 if (num_methods
> 0) {
408 unsigned prev_index
= num_methods
- 1;
409 struct si_result
*prev
= methods
[prev_index
];
410 struct si_result
*prev_this_size
= &results
[util_logbase2(size
)][placement
][prev
->index
];
412 /* If the best one is also the best for the previous size,
413 * just bump the size for the previous one.
415 * If there is no best, it means all methods were too slow
416 * for this size and were not tested. Use the best one for
420 /* If it's the same method as for the previous size: */
421 (prev
->is_cp
== best
->is_cp
&&
422 prev
->is_sdma
== best
->is_sdma
&&
423 prev
->is_cs
== best
->is_cs
&&
424 prev
->cache_policy
== best
->cache_policy
&&
425 prev
->dwords_per_thread
== best
->dwords_per_thread
&&
426 prev
->waves_per_sh
== best
->waves_per_sh
) ||
427 /* If the method for the previous size is also the best
429 (prev_this_size
->is_valid
&&
430 prev_this_size
->score
* 1.03 > best
->score
)) {
431 method_max_size
[prev_index
] = size
;
436 /* Add it to the list. */
437 assert(num_methods
< ARRAY_SIZE(methods
));
438 methods
[num_methods
] = best
;
439 method_max_size
[num_methods
] = size
;
443 for (unsigned i
= 0; i
< num_methods
; i
++) {
444 struct si_result
*best
= methods
[i
];
445 unsigned size
= method_max_size
[i
];
447 /* The size threshold is between the current benchmarked
448 * size and the next benchmarked size. */
449 if (i
< num_methods
- 1)
450 printf(" if (size <= %9u) ", (size
+ (size
<< SIZE_SHIFT
)) / 2);
459 printf("CP_DMA(%s);\n",
460 best
->cache_policy
== L2_BYPASS
? "L2_BYPASS" :
461 best
->cache_policy
== L2_LRU
? "L2_LRU " : "L2_STREAM");
466 printf("COMPUTE(%s, %u, %u);\n",
467 best
->cache_policy
== L2_LRU
? "L2_LRU " : "L2_STREAM",
468 best
->dwords_per_thread
,