From 4b0dc098b2561c07c59f7dab2813640a25789bf1 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Thu, 2 Nov 2017 00:00:53 +0100 Subject: [PATCH] gallium/u_threaded: don't map big VRAM buffers for the first upload directly MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit This improves Paraview "many spheres" performance 4x along with the radeonsi commit. Reviewed-by: Nicolai Hähnle --- src/gallium/auxiliary/util/u_threaded_context.c | 14 ++++++++++++++ src/gallium/auxiliary/util/u_threaded_context.h | 6 ++++++ src/gallium/drivers/radeon/r600_buffer_common.c | 10 ++++++++-- 3 files changed, 28 insertions(+), 2 deletions(-) diff --git a/src/gallium/auxiliary/util/u_threaded_context.c b/src/gallium/auxiliary/util/u_threaded_context.c index 0f232580532..ccce12b00ce 100644 --- a/src/gallium/auxiliary/util/u_threaded_context.c +++ b/src/gallium/auxiliary/util/u_threaded_context.c @@ -1284,6 +1284,20 @@ tc_improve_map_buffer_flags(struct threaded_context *tc, if (usage & tc_flags) return usage; + /* Use the staging upload if it's preferred. */ + if (usage & (PIPE_TRANSFER_DISCARD_RANGE | + PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE) && + !(usage & PIPE_TRANSFER_PERSISTENT) && + /* Try not to decrement the counter if it's not positive. Still racy, + * but it makes it harder to wrap the counter from INT_MIN to INT_MAX. */ + tres->max_forced_staging_uploads > 0 && + p_atomic_dec_return(&tres->max_forced_staging_uploads) >= 0) { + usage &= ~(PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE | + PIPE_TRANSFER_UNSYNCHRONIZED); + + return usage | tc_flags | PIPE_TRANSFER_DISCARD_RANGE; + } + /* Sparse buffers can't be mapped directly and can't be reallocated * (fully invalidated). That may just be a radeonsi limitation, but * the threaded context must obey it with radeonsi. diff --git a/src/gallium/auxiliary/util/u_threaded_context.h b/src/gallium/auxiliary/util/u_threaded_context.h index 8977b03cd20..ac7bc3dec73 100644 --- a/src/gallium/auxiliary/util/u_threaded_context.h +++ b/src/gallium/auxiliary/util/u_threaded_context.h @@ -241,6 +241,12 @@ struct threaded_resource { * pointers. */ bool is_shared; bool is_user_ptr; + + /* If positive, prefer DISCARD_RANGE with a staging buffer over any other + * method of CPU access when map flags allow it. Useful for buffers that + * are too large for the visible VRAM window. + */ + int max_forced_staging_uploads; }; struct threaded_transfer { diff --git a/src/gallium/drivers/radeon/r600_buffer_common.c b/src/gallium/drivers/radeon/r600_buffer_common.c index 67daaa40053..92521f47792 100644 --- a/src/gallium/drivers/radeon/r600_buffer_common.c +++ b/src/gallium/drivers/radeon/r600_buffer_common.c @@ -191,10 +191,15 @@ void si_init_resource_fields(struct r600_common_screen *rscreen, res->vram_usage = 0; res->gart_usage = 0; - if (res->domains & RADEON_DOMAIN_VRAM) + if (res->domains & RADEON_DOMAIN_VRAM) { res->vram_usage = size; - else if (res->domains & RADEON_DOMAIN_GTT) + + res->b.max_forced_staging_uploads = + rscreen->info.has_dedicated_vram && + size >= rscreen->info.vram_vis_size / 4 ? 1 : 0; + } else if (res->domains & RADEON_DOMAIN_GTT) { res->gart_usage = size; + } } bool si_alloc_resource(struct r600_common_screen *rscreen, @@ -289,6 +294,7 @@ void si_replace_buffer_storage(struct pipe_context *ctx, pb_reference(&rdst->buf, rsrc->buf); rdst->gpu_address = rsrc->gpu_address; rdst->b.b.bind = rsrc->b.b.bind; + rdst->b.max_forced_staging_uploads = rsrc->b.max_forced_staging_uploads; rdst->flags = rsrc->flags; assert(rdst->vram_usage == rsrc->vram_usage); -- 2.30.2