uint64_t aligned_size = size & ~3ull;
if (aligned_size >= 4) {
- /* Before GFX9, CP DMA was very slow when clearing GTT, so never
- * use CP DMA clears on those chips, because we can't be certain
- * about buffer placements.
+ uint64_t compute_min_size;
+
+ /* CP DMA clears are terribly slow with GTT on GFX6-8, which can always
+ * happen due to BO evictions.
*/
+ if (sctx->chip_class <= GFX8) {
+ compute_min_size = 0;
+ } else if (sctx->chip_class >= GFX10 &&
+ sctx->screen->info.has_dedicated_vram &&
+ si_resource(dst)->domains & RADEON_DOMAIN_VRAM) {
+ /* VRAM clears on gfx10 dGPUs */
+ if (sctx->screen->info.vram_bit_width >= 192)
+ compute_min_size = 128 * 1024;
+ else
+ compute_min_size = 1024 * 1024;
+ } else if (sctx->screen->info.has_dedicated_vram &&
+ si_resource(dst)->domains & RADEON_DOMAIN_GTT) {
+ /* GTT clears on gfx9 and gfx10 dGPUs */
+ compute_min_size = UINT64_MAX; /* CP DMA is the best due to slow PCIe */
+ } else {
+ compute_min_size = 32 * 1024;
+ }
+
if (clear_value_size > 4 || (!force_cpdma && clear_value_size == 4 && offset % 4 == 0 &&
- (size > 32 * 1024 || sctx->chip_class <= GFX8))) {
+ size > compute_min_size)) {
si_compute_do_clear_or_copy(sctx, dst, offset, NULL, 0, aligned_size, clear_value,
clear_value_size, coher);
} else {
enum si_coherency coher = SI_COHERENCY_SHADER;
enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size);
+ uint64_t compute_min_size;
+
+ if (sctx->chip_class >= GFX10 &&
+ sctx->screen->info.has_dedicated_vram &&
+ si_resource(dst)->domains & RADEON_DOMAIN_VRAM &&
+ si_resource(src)->domains & RADEON_DOMAIN_VRAM) {
+ /* VRAM copies on gfx10 dGPUs */
+ if (sctx->screen->info.vram_bit_width >= 192)
+ compute_min_size = 128 * 1024;
+ else
+ compute_min_size = 1024 * 1024;
+ } else if (sctx->chip_class >= GFX10 &&
+ sctx->screen->info.has_dedicated_vram &&
+ (si_resource(dst)->domains | si_resource(src)->domains) & RADEON_DOMAIN_GTT) {
+ compute_min_size = UINT64_MAX; /* CP DMA is the best due to slow PCIe */
+ } else {
+ compute_min_size = 32 * 1024;
+ }
/* Only use compute for VRAM copies on dGPUs. */
if (sctx->screen->info.has_dedicated_vram && si_resource(dst)->domains & RADEON_DOMAIN_VRAM &&
- si_resource(src)->domains & RADEON_DOMAIN_VRAM && size > 32 * 1024 && dst_offset % 4 == 0 &&
- src_offset % 4 == 0 && size % 4 == 0) {
+ si_resource(src)->domains & RADEON_DOMAIN_VRAM && size > compute_min_size &&
+ dst_offset % 4 == 0 && src_offset % 4 == 0 && size % 4 == 0) {
si_compute_do_clear_or_copy(sctx, dst, dst_offset, src, src_offset, size, NULL, 0, coher);
} else {
si_cp_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset, size, 0, coher, cache_policy);