From: Marek Olšák Date: Thu, 7 May 2020 03:22:23 +0000 (-0400) Subject: radeonsi: tweak clear/copy_buffer limits when to use compute X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=f57276309b14be8a61e1913ab00dd6e4b1e5e8ce;p=mesa.git radeonsi: tweak clear/copy_buffer limits when to use compute Acked-by: Pierre-Eric Pelloux-Prayer Part-of: --- diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c index 97ca936f810..ce19c428f1c 100644 --- a/src/gallium/drivers/radeonsi/si_compute_blit.c +++ b/src/gallium/drivers/radeonsi/si_compute_blit.c @@ -327,12 +327,31 @@ void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, uint64_ uint64_t aligned_size = size & ~3ull; if (aligned_size >= 4) { - /* Before GFX9, CP DMA was very slow when clearing GTT, so never - * use CP DMA clears on those chips, because we can't be certain - * about buffer placements. + uint64_t compute_min_size; + + /* CP DMA clears are terribly slow with GTT on GFX6-8, which can always + * happen due to BO evictions. */ + if (sctx->chip_class <= GFX8) { + compute_min_size = 0; + } else if (sctx->chip_class >= GFX10 && + sctx->screen->info.has_dedicated_vram && + si_resource(dst)->domains & RADEON_DOMAIN_VRAM) { + /* VRAM clears on gfx10 dGPUs */ + if (sctx->screen->info.vram_bit_width >= 192) + compute_min_size = 128 * 1024; + else + compute_min_size = 1024 * 1024; + } else if (sctx->screen->info.has_dedicated_vram && + si_resource(dst)->domains & RADEON_DOMAIN_GTT) { + /* GTT clears on gfx9 and gfx10 dGPUs */ + compute_min_size = UINT64_MAX; /* CP DMA is the best due to slow PCIe */ + } else { + compute_min_size = 32 * 1024; + } + if (clear_value_size > 4 || (!force_cpdma && clear_value_size == 4 && offset % 4 == 0 && - (size > 32 * 1024 || sctx->chip_class <= GFX8))) { + size > compute_min_size)) { si_compute_do_clear_or_copy(sctx, dst, offset, NULL, 0, aligned_size, clear_value, clear_value_size, coher); } else { @@ -371,11 +390,29 @@ void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct p enum si_coherency coher = SI_COHERENCY_SHADER; enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size); + uint64_t compute_min_size; + + if (sctx->chip_class >= GFX10 && + sctx->screen->info.has_dedicated_vram && + si_resource(dst)->domains & RADEON_DOMAIN_VRAM && + si_resource(src)->domains & RADEON_DOMAIN_VRAM) { + /* VRAM copies on gfx10 dGPUs */ + if (sctx->screen->info.vram_bit_width >= 192) + compute_min_size = 128 * 1024; + else + compute_min_size = 1024 * 1024; + } else if (sctx->chip_class >= GFX10 && + sctx->screen->info.has_dedicated_vram && + (si_resource(dst)->domains | si_resource(src)->domains) & RADEON_DOMAIN_GTT) { + compute_min_size = UINT64_MAX; /* CP DMA is the best due to slow PCIe */ + } else { + compute_min_size = 32 * 1024; + } /* Only use compute for VRAM copies on dGPUs. */ if (sctx->screen->info.has_dedicated_vram && si_resource(dst)->domains & RADEON_DOMAIN_VRAM && - si_resource(src)->domains & RADEON_DOMAIN_VRAM && size > 32 * 1024 && dst_offset % 4 == 0 && - src_offset % 4 == 0 && size % 4 == 0) { + si_resource(src)->domains & RADEON_DOMAIN_VRAM && size > compute_min_size && + dst_offset % 4 == 0 && src_offset % 4 == 0 && size % 4 == 0) { si_compute_do_clear_or_copy(sctx, dst, dst_offset, src, src_offset, size, NULL, 0, coher); } else { si_cp_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset, size, 0, coher, cache_policy);