From d2147787534de87cd11015266293211b5188442f Mon Sep 17 00:00:00 2001
From: Vasily Khoruzhick
Date: Sat, 7 Sep 2019 19:33:07 -0700
Subject: [PATCH] lima: implement BO cache

Allocating BOs is expensive, so we should avoid doing that by caching
freed BOs.

BO cache is modelled after one in v3d driver and works as follows:

- in lima_bo_create() check if we have matching BO in cache and return
  it if there's one, allocate new BO otherwise.
- in lima_bo_unreference() (renamed from lima_bo_free()): put BO in cache
  instead of freeing it and remove all stale BOs from cache

Reviewed-by: Qiang Yu
Signed-off-by: Vasily Khoruzhick
---
Review notes (not part of the commit message):
- lima_bo_cache_get(): return NULL instead of false from a function
  that returns a pointer.
- lima_bo_create(): page-align the size before the cache lookup, so the
  bucket is picked from the size the BO really has (an 8000 byte request
  used to probe the 4KB bucket and could never reuse an 8KB BO).
- lima_bo_cache_fini(): destroy bo_cache_lock only after the cached BOs
  have been freed.
- lima_bo_cache_free_stale_bos(): comment now agrees with the threshold
  used by the code.
All of the above keep the hunk line counts and the diffstat unchanged.

 src/gallium/drivers/lima/lima_bo.c       | 184 +++++++++++++++++++++--
 src/gallium/drivers/lima/lima_bo.h       |  10 +-
 src/gallium/drivers/lima/lima_context.c  |   6 +-
 src/gallium/drivers/lima/lima_program.c  |   4 +-
 src/gallium/drivers/lima/lima_resource.c |   4 +-
 src/gallium/drivers/lima/lima_screen.c   |  23 ++-
 src/gallium/drivers/lima/lima_screen.h   |   9 ++
 src/gallium/drivers/lima/lima_submit.c   |   2 +-
 8 files changed, 212 insertions(+), 30 deletions(-)

diff --git a/src/gallium/drivers/lima/lima_bo.c b/src/gallium/drivers/lima/lima_bo.c
index b082d32afd8..22263ea0054 100644
--- a/src/gallium/drivers/lima/lima_bo.c
+++ b/src/gallium/drivers/lima/lima_bo.c
@@ -30,6 +30,7 @@
 #include "drm-uapi/lima_drm.h"
 
 #include "util/u_hash_table.h"
+#include "util/u_math.h"
 #include "util/os_time.h"
 #include "os/os_mman.h"
 
@@ -37,6 +38,7 @@
 
 #include "lima_screen.h"
 #include "lima_bo.h"
+#include "lima_util.h"
 
 #define PTR_TO_UINT(x) ((unsigned)((intptr_t)(x)))
 
@@ -68,6 +70,16 @@ err_out0:
    return false;
 }
 
+bool lima_bo_cache_init(struct lima_screen *screen)
+{
+   mtx_init(&screen->bo_cache_lock, mtx_plain);
+   list_inithead(&screen->bo_cache_time);
+   for (int i = 0; i < NR_BO_CACHE_BUCKETS; i++)
+      list_inithead(&screen->bo_cache_buckets[i]);
+
+   return true;
+}
+
 void lima_bo_table_fini(struct lima_screen *screen)
 {
    mtx_destroy(&screen->bo_table_lock);
@@ -75,6 +87,13 @@ void lima_bo_table_fini(struct lima_screen *screen)
    util_hash_table_destroy(screen->bo_flink_names);
 }
 
+static void
+lima_bo_cache_remove(struct lima_bo *bo)
+{
+   list_del(&bo->size_list);
+   list_del(&bo->time_list);
+}
+
 static void lima_close_kms_handle(struct lima_screen *screen, uint32_t handle)
 {
    struct drm_gem_close args = {
@@ -84,6 +103,36 @@ static void lima_close_kms_handle(struct lima_screen *screen, uint32_t handle)
    drmIoctl(screen->fd, DRM_IOCTL_GEM_CLOSE, &args);
 }
 
+static void
+lima_bo_free(struct lima_bo *bo)
+{
+   struct lima_screen *screen = bo->screen;
+   mtx_lock(&screen->bo_table_lock);
+   util_hash_table_remove(screen->bo_handles,
+                          (void *)(uintptr_t)bo->handle);
+   if (bo->flink_name)
+      util_hash_table_remove(screen->bo_flink_names,
+                             (void *)(uintptr_t)bo->flink_name);
+   mtx_unlock(&screen->bo_table_lock);
+
+   if (bo->map)
+      lima_bo_unmap(bo);
+
+   lima_close_kms_handle(screen, bo->handle);
+   free(bo);
+}
+
+void lima_bo_cache_fini(struct lima_screen *screen)
+{
+   list_for_each_entry_safe(struct lima_bo, entry,
+                            &screen->bo_cache_time, time_list) {
+      lima_bo_cache_remove(entry);
+      lima_bo_free(entry);
+   }
+
+   mtx_destroy(&screen->bo_cache_lock);
+}
+
 static bool lima_bo_get_info(struct lima_bo *bo)
 {
    struct drm_lima_gem_info req = {
@@ -98,10 +147,112 @@ static bool lima_bo_get_info(struct lima_bo *bo)
    return true;
 }
 
+static unsigned
+lima_bucket_index(unsigned size)
+{
+   /* Round down to POT to compute a bucket index */
+
+   unsigned bucket_index = util_logbase2(size);
+
+   /* Clamp the bucket index; all huge allocations will be
+    * sorted into the largest bucket */
+   bucket_index = CLAMP(bucket_index, MIN_BO_CACHE_BUCKET,
+                        MAX_BO_CACHE_BUCKET);
+
+   /* Reindex from 0 */
+   return (bucket_index - MIN_BO_CACHE_BUCKET);
+}
+
+static struct list_head *
+lima_bo_cache_get_bucket(struct lima_screen *screen, unsigned size)
+{
+   return &screen->bo_cache_buckets[lima_bucket_index(size)];
+}
+
+static void
+lima_bo_cache_free_stale_bos(struct lima_screen *screen, time_t time)
+{
+   list_for_each_entry_safe(struct lima_bo, entry,
+                            &screen->bo_cache_time, time_list) {
+      /* Free BOs that have been idle for more than 6 seconds */
+      if (time - entry->free_time > 6) {
+         lima_bo_cache_remove(entry);
+         lima_bo_free(entry);
+      } else
+         break;
+   }
+}
+
+static bool
+lima_bo_cache_put(struct lima_bo *bo)
+{
+   if (!bo->cacheable)
+      return false;
+
+   struct lima_screen *screen = bo->screen;
+
+   mtx_lock(&screen->bo_cache_lock);
+   struct list_head *bucket = lima_bo_cache_get_bucket(screen, bo->size);
+
+   if (!bucket) {
+      mtx_unlock(&screen->bo_cache_lock);
+      return false;
+   }
+
+   struct timespec time;
+   clock_gettime(CLOCK_MONOTONIC, &time);
+   bo->free_time = time.tv_sec;
+   list_addtail(&bo->size_list, bucket);
+   list_addtail(&bo->time_list, &screen->bo_cache_time);
+   lima_bo_cache_free_stale_bos(screen, time.tv_sec);
+   mtx_unlock(&screen->bo_cache_lock);
+
+   return true;
+}
+
+static struct lima_bo *
+lima_bo_cache_get(struct lima_screen *screen, uint32_t size, uint32_t flags)
+{
+   struct lima_bo *bo = NULL;
+   mtx_lock(&screen->bo_cache_lock);
+   struct list_head *bucket = lima_bo_cache_get_bucket(screen, size);
+
+   if (!bucket) {
+      mtx_unlock(&screen->bo_cache_lock);
+      return NULL;
+   }
+
+   list_for_each_entry_safe(struct lima_bo, entry, bucket, size_list) {
+      if (entry->size >= size &&
+          entry->flags == flags) {
+         /* Check if BO is idle. If it's not it's better to allocate new one */
+         if (!lima_bo_wait(entry, LIMA_GEM_WAIT_WRITE, 0))
+            break;
+
+         lima_bo_cache_remove(entry);
+         p_atomic_set(&entry->refcnt, 1);
+         bo = entry;
+         break;
+      }
+   }
+
+   mtx_unlock(&screen->bo_cache_lock);
+
+   return bo;
+}
+
 struct lima_bo *lima_bo_create(struct lima_screen *screen,
                                uint32_t size, uint32_t flags)
 {
    struct lima_bo *bo;
+
+   size = align(size, LIMA_PAGE_SIZE);
+
+   /* Try to get bo from cache first */
+   bo = lima_bo_cache_get(screen, size, flags);
+   if (bo)
+      return bo;
+
    struct drm_lima_gem_create req = {
       .size = size,
       .flags = flags,
@@ -110,12 +261,17 @@ struct lima_bo *lima_bo_create(struct lima_screen *screen,
    if (!(bo = calloc(1, sizeof(*bo))))
       return NULL;
 
+   list_inithead(&bo->time_list);
+   list_inithead(&bo->size_list);
+
    if (drmIoctl(screen->fd, DRM_IOCTL_LIMA_GEM_CREATE, &req))
       goto err_out0;
 
    bo->screen = screen;
    bo->size = req.size;
+   bo->flags = req.flags;
    bo->handle = req.handle;
+   bo->cacheable = !(lima_debug & LIMA_DEBUG_NO_BO_CACHE);
    p_atomic_set(&bo->refcnt, 1);
 
    if (!lima_bo_get_info(bo))
@@ -130,25 +286,16 @@ err_out0:
    return NULL;
 }
 
-void lima_bo_free(struct lima_bo *bo)
+void lima_bo_unreference(struct lima_bo *bo)
 {
    if (!p_atomic_dec_zero(&bo->refcnt))
       return;
 
-   struct lima_screen *screen = bo->screen;
-   mtx_lock(&screen->bo_table_lock);
-   util_hash_table_remove(screen->bo_handles,
-                          (void *)(uintptr_t)bo->handle);
-   if (bo->flink_name)
-      util_hash_table_remove(screen->bo_flink_names,
-                             (void *)(uintptr_t)bo->flink_name);
-   mtx_unlock(&screen->bo_table_lock);
-
-   if (bo->map)
-      lima_bo_unmap(bo);
+   /* Try to put it into cache */
+   if (lima_bo_cache_put(bo))
+      return;
 
-   lima_close_kms_handle(screen, bo->handle);
-   free(bo);
+   lima_bo_free(bo);
 }
 
 void *lima_bo_map(struct lima_bo *bo)
@@ -175,6 +322,9 @@ bool lima_bo_export(struct lima_bo *bo, struct winsys_handle *handle)
 {
    struct lima_screen *screen = bo->screen;
 
+   /* Don't cache exported BOs */
+   bo->cacheable = false;
+
    switch (handle->type) {
    case WINSYS_HANDLE_TYPE_SHARED:
       if (!bo->flink_name) {
@@ -271,6 +421,8 @@ struct lima_bo *lima_bo_import(struct lima_screen *screen,
 
    if (bo) {
       p_atomic_inc(&bo->refcnt);
+      /* Don't cache imported BOs */
+      bo->cacheable = false;
       mtx_unlock(&screen->bo_table_lock);
       return bo;
    }
@@ -282,6 +434,10 @@ struct lima_bo *lima_bo_import(struct lima_screen *screen,
       return NULL;
    }
 
+   /* Don't cache imported BOs */
+   bo->cacheable = false;
+   list_inithead(&bo->time_list);
+   list_inithead(&bo->size_list);
    bo->screen = screen;
    p_atomic_set(&bo->refcnt, 1);
 
diff --git a/src/gallium/drivers/lima/lima_bo.h b/src/gallium/drivers/lima/lima_bo.h
index 3f440b3b014..7bbd0063602 100644
--- a/src/gallium/drivers/lima/lima_bo.h
+++ b/src/gallium/drivers/lima/lima_bo.h
@@ -28,12 +28,18 @@
 #include <stdint.h>
 
 #include "util/u_atomic.h"
+#include "util/list.h"
 
 struct lima_bo {
    struct lima_screen *screen;
+   struct list_head time_list;
+   struct list_head size_list;
    int refcnt;
+   bool cacheable;
+   time_t free_time;
 
    uint32_t size;
+   uint32_t flags;
    uint32_t handle;
    uint64_t offset;
    uint32_t flink_name;
@@ -44,10 +50,12 @@ struct lima_bo {
 
 bool lima_bo_table_init(struct lima_screen *screen);
 void lima_bo_table_fini(struct lima_screen *screen);
+bool lima_bo_cache_init(struct lima_screen *screen);
+void lima_bo_cache_fini(struct lima_screen *screen);
 
 struct lima_bo *lima_bo_create(struct lima_screen *screen, uint32_t size,
                                uint32_t flags);
-void lima_bo_free(struct lima_bo *bo);
+void lima_bo_unreference(struct lima_bo *bo);
 
 static inline void lima_bo_reference(struct lima_bo *bo)
 {
diff --git a/src/gallium/drivers/lima/lima_context.c b/src/gallium/drivers/lima/lima_context.c
index 3c3887001bf..813e87361bb 100644
--- a/src/gallium/drivers/lima/lima_context.c
+++ b/src/gallium/drivers/lima/lima_context.c
@@ -138,13 +138,13 @@ lima_context_destroy(struct pipe_context *pctx)
 
    for (int i = 0; i < LIMA_CTX_PLB_MAX_NUM; i++) {
       if (ctx->plb[i])
-         lima_bo_free(ctx->plb[i]);
+         lima_bo_unreference(ctx->plb[i]);
       if (ctx->gp_tile_heap[i])
-         lima_bo_free(ctx->gp_tile_heap[i]);
+         lima_bo_unreference(ctx->gp_tile_heap[i]);
    }
 
    if (ctx->plb_gp_stream)
-      lima_bo_free(ctx->plb_gp_stream);
+      lima_bo_unreference(ctx->plb_gp_stream);
 
    if (ctx->plb_pp_stream)
       assert(!_mesa_hash_table_num_entries(ctx->plb_pp_stream));
diff --git a/src/gallium/drivers/lima/lima_program.c b/src/gallium/drivers/lima/lima_program.c
index e3863feb80e..5e639748bfc 100644
--- a/src/gallium/drivers/lima/lima_program.c
+++ b/src/gallium/drivers/lima/lima_program.c
@@ -302,7 +302,7 @@ lima_delete_fs_state(struct pipe_context *pctx, void *hwcso)
    struct lima_fs_shader_state *so = hwcso;
 
    if (so->bo)
-      lima_bo_free(so->bo);
+      lima_bo_unreference(so->bo);
 
    ralloc_free(so);
 }
@@ -396,7 +396,7 @@ lima_delete_vs_state(struct pipe_context *pctx, void *hwcso)
    struct lima_vs_shader_state *so = hwcso;
 
    if (so->bo)
-      lima_bo_free(so->bo);
+      lima_bo_unreference(so->bo);
 
    ralloc_free(so);
 }
diff --git a/src/gallium/drivers/lima/lima_resource.c b/src/gallium/drivers/lima/lima_resource.c
index faa129998c3..e01e60c0465 100644
--- a/src/gallium/drivers/lima/lima_resource.c
+++ b/src/gallium/drivers/lima/lima_resource.c
@@ -259,7 +259,7 @@ lima_resource_destroy(struct pipe_screen *pscreen, struct pipe_resource *pres)
    struct lima_resource *res = lima_resource(pres);
 
    if (res->bo)
-      lima_bo_free(res->bo);
+      lima_bo_unreference(res->bo);
 
    if (res->scanout)
       renderonly_scanout_destroy(res->scanout, screen->ro);
@@ -528,7 +528,7 @@ lima_surface_destroy(struct pipe_context *pctx, struct pipe_surface *psurf)
          struct lima_ctx_plb_pp_stream *s = entry->data;
          if (--s->refcnt == 0) {
             if (s->bo)
-               lima_bo_free(s->bo);
+               lima_bo_unreference(s->bo);
             _mesa_hash_table_remove(ctx->plb_pp_stream, entry);
             ralloc_free(s);
          }
diff --git a/src/gallium/drivers/lima/lima_screen.c b/src/gallium/drivers/lima/lima_screen.c
index 5e6ac1ffb08..3c1288c897f 100644
--- a/src/gallium/drivers/lima/lima_screen.c
+++ b/src/gallium/drivers/lima/lima_screen.c
@@ -61,8 +61,9 @@ lima_screen_destroy(struct pipe_screen *pscreen)
       free(screen->ro);
 
    if (screen->pp_buffer)
-      lima_bo_free(screen->pp_buffer);
+      lima_bo_unreference(screen->pp_buffer);
 
+   lima_bo_cache_fini(screen);
    lima_bo_table_fini(screen);
    ralloc_free(screen);
 }
@@ -418,6 +419,8 @@ static const struct debug_named_value debug_options[] = {
           "dump GPU command stream to $PWD/lima.dump" },
         { "shaderdb", LIMA_DEBUG_SHADERDB,
           "print shader information for shaderdb" },
+        { "nobocache", LIMA_DEBUG_NO_BO_CACHE,
+          "disable BO cache" },
         { NULL }
 };
 
@@ -478,16 +481,20 @@ lima_screen_create(int fd, struct renderonly *ro)
    if (!lima_screen_query_info(screen))
       goto err_out0;
 
-   if (!lima_bo_table_init(screen))
+   if (!lima_bo_cache_init(screen))
       goto err_out0;
 
+   if (!lima_bo_table_init(screen))
+      goto err_out1;
+
    screen->pp_ra = ppir_regalloc_init(screen);
    if (!screen->pp_ra)
-      goto err_out1;
+      goto err_out2;
 
    screen->pp_buffer = lima_bo_create(screen, pp_buffer_size, 0);
    if (!screen->pp_buffer)
-      goto err_out1;
+      goto err_out2;
+   screen->pp_buffer->cacheable = false;
 
    /* fs program for clear buffer?
     * const0 1 0 0 -1.67773, mov.v0 $0 ^const0.xxxx, stop
@@ -534,7 +541,7 @@ lima_screen_create(int fd, struct renderonly *ro)
       screen->ro = renderonly_dup(ro);
       if (!screen->ro) {
          fprintf(stderr, "Failed to dup renderonly object\n");
-         goto err_out2;
+         goto err_out3;
       }
    }
 
@@ -559,10 +566,12 @@ lima_screen_create(int fd, struct renderonly *ro)
 
    return &screen->base;
 
+err_out3:
+   lima_bo_unreference(screen->pp_buffer);
 err_out2:
-   lima_bo_free(screen->pp_buffer);
-err_out1:
    lima_bo_table_fini(screen);
+err_out1:
+   lima_bo_cache_fini(screen);
 err_out0:
    ralloc_free(screen);
    return NULL;
diff --git a/src/gallium/drivers/lima/lima_screen.h b/src/gallium/drivers/lima/lima_screen.h
index 547d083ecd0..62fa480738c 100644
--- a/src/gallium/drivers/lima/lima_screen.h
+++ b/src/gallium/drivers/lima/lima_screen.h
@@ -37,6 +37,7 @@
 #define LIMA_DEBUG_PP          (1 << 1)
 #define LIMA_DEBUG_DUMP        (1 << 2)
 #define LIMA_DEBUG_SHADERDB    (1 << 3)
+#define LIMA_DEBUG_NO_BO_CACHE (1 << 4)
 
 extern uint32_t lima_debug;
 extern FILE *lima_dump_command_stream;
@@ -46,6 +47,11 @@ extern int lima_ppir_force_spilling;
 
 struct ra_regs;
 
+#define MIN_BO_CACHE_BUCKET (12) /* 2^12 = 4KB */
+#define MAX_BO_CACHE_BUCKET (22) /* 2^22 = 4MB */
+
+#define NR_BO_CACHE_BUCKETS (MAX_BO_CACHE_BUCKET - MIN_BO_CACHE_BUCKET + 1)
+
 struct lima_screen {
    struct pipe_screen base;
    struct renderonly *ro;
@@ -60,8 +66,11 @@ struct lima_screen {
 
    /* bo table */
    mtx_t bo_table_lock;
+   mtx_t bo_cache_lock;
    struct util_hash_table *bo_handles;
    struct util_hash_table *bo_flink_names;
+   struct list_head bo_cache_buckets[NR_BO_CACHE_BUCKETS];
+   struct list_head bo_cache_time;
 
    struct slab_parent_pool transfer_pool;
 
diff --git a/src/gallium/drivers/lima/lima_submit.c b/src/gallium/drivers/lima/lima_submit.c
index 3977af8078e..acc73d08054 100644
--- a/src/gallium/drivers/lima/lima_submit.c
+++ b/src/gallium/drivers/lima/lima_submit.c
@@ -145,7 +145,7 @@ bool lima_submit_start(struct lima_submit *submit, void *frame, uint32_t size)
    bool ret = drmIoctl(submit->screen->fd, DRM_IOCTL_LIMA_GEM_SUBMIT, &req) == 0;
 
    util_dynarray_foreach(&submit->bos, struct lima_bo *, bo) {
-      lima_bo_free(*bo);
+      lima_bo_unreference(*bo);
    }
 
    util_dynarray_clear(&submit->gem_bos);
-- 
2.30.2