From 06890c444a893a96b6ec1cfb36f77915ec9acda8 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Sat, 13 Dec 2014 15:27:39 -0800 Subject: [PATCH] vc4: Add a userspace BO cache. Since our kernel BOs require CMA allocation, and the use of them requires new mmaps, it's pretty expensive and we should avoid it if possible. Copying my original design for Intel, make a userspace cache that reuses BOs that haven't been shared to other processes but frees BOs that have sat in the cache for over a second. Improves glxgears framerate on RPi by around 30%. --- src/gallium/drivers/vc4/vc4_bufmgr.c | 131 ++++++++++++++++++++++++++- src/gallium/drivers/vc4/vc4_bufmgr.h | 34 ++++++- src/gallium/drivers/vc4/vc4_screen.c | 2 + src/gallium/drivers/vc4/vc4_screen.h | 12 +++ 4 files changed, 175 insertions(+), 4 deletions(-) diff --git a/src/gallium/drivers/vc4/vc4_bufmgr.c b/src/gallium/drivers/vc4/vc4_bufmgr.c index 64fe2e40e42..34596be537d 100644 --- a/src/gallium/drivers/vc4/vc4_bufmgr.c +++ b/src/gallium/drivers/vc4/vc4_bufmgr.c @@ -29,14 +29,49 @@ #include #include "util/u_memory.h" +#include "util/ralloc.h" #include "vc4_context.h" #include "vc4_screen.h" +#define container_of(ptr, type, field) \ + (type*)((char*)ptr - offsetof(type, field)) + +static struct vc4_bo * +vc4_bo_from_cache(struct vc4_screen *screen, uint32_t size, const char *name) +{ + struct vc4_bo_cache *cache = &screen->bo_cache; + uint32_t page_index = size / 4096 - 1; + + if (cache->size_list_size <= page_index) + return NULL; + + struct vc4_bo *bo = NULL; + pipe_mutex_lock(cache->lock); + if (!is_empty_list(&cache->size_list[page_index])) { + struct simple_node *node = last_elem(&cache->size_list[page_index]); + bo = container_of(node, struct vc4_bo, size_list); + pipe_reference_init(&bo->reference, 1); + remove_from_list(&bo->time_list); + remove_from_list(&bo->size_list); + + bo->name = name; + } + pipe_mutex_unlock(cache->lock); + return bo; +} + struct vc4_bo * vc4_bo_alloc(struct vc4_screen *screen, 
uint32_t size, const char *name) { - struct vc4_bo *bo = CALLOC_STRUCT(vc4_bo); + struct vc4_bo *bo; + size = align(size, 4096); + + bo = vc4_bo_from_cache(screen, size, name); + if (bo) + return bo; + + bo = CALLOC_STRUCT(vc4_bo); if (!bo) return NULL; @@ -44,6 +79,7 @@ vc4_bo_alloc(struct vc4_screen *screen, uint32_t size, const char *name) bo->screen = screen; bo->size = size; bo->name = name; + bo->private = true; struct drm_mode_create_dumb create; memset(&create, 0, sizeof(create)); @@ -65,6 +101,18 @@ vc4_bo_alloc(struct vc4_screen *screen, uint32_t size, const char *name) } void +vc4_bo_last_unreference(struct vc4_bo *bo) +{ + struct vc4_screen *screen = bo->screen; + + struct timespec time; + clock_gettime(CLOCK_MONOTONIC, &time); + pipe_mutex_lock(screen->bo_cache.lock); + vc4_bo_last_unreference_locked_timed(bo, time.tv_sec); + pipe_mutex_unlock(screen->bo_cache.lock); +} + +static void vc4_bo_free(struct vc4_bo *bo) { struct vc4_screen *screen = bo->screen; @@ -89,6 +137,69 @@ vc4_bo_free(struct vc4_bo *bo) free(bo); } +static void +free_stale_bos(struct vc4_screen *screen, time_t time) +{ + while (!is_empty_list(&screen->bo_cache.time_list)) { + struct simple_node *node = + first_elem(&screen->bo_cache.time_list); + struct vc4_bo *bo = container_of(node, struct vc4_bo, time_list); + + /* If it has sat in the cache for more than 2 seconds, free it. 
*/ + if (time - bo->free_time > 2) { + remove_from_list(&bo->time_list); + remove_from_list(&bo->size_list); + vc4_bo_free(bo); + } else { + break; + } + } +} + +void +vc4_bo_last_unreference_locked_timed(struct vc4_bo *bo, time_t time) +{ + struct vc4_screen *screen = bo->screen; + struct vc4_bo_cache *cache = &screen->bo_cache; + uint32_t page_index = bo->size / 4096 - 1; + + if (!bo->private) { + vc4_bo_free(bo); + return; + } + + if (cache->size_list_size <= page_index) { + struct simple_node *new_list = + ralloc_array(screen, struct simple_node, page_index + 1); + + /* Move old list contents over (since the array has moved, and + * therefore the pointers to the list heads have to change). + */ + for (int i = 0; i < cache->size_list_size; i++) { + struct simple_node *old_head = &cache->size_list[i]; + if (is_empty_list(old_head)) + make_empty_list(&new_list[i]); + else { + new_list[i].next = old_head->next; + new_list[i].prev = old_head->prev; + new_list[i].next->prev = &new_list[i]; + new_list[i].prev->next = &new_list[i]; + } + } + for (int i = cache->size_list_size; i < page_index + 1; i++) + make_empty_list(&new_list[i]); + + cache->size_list = new_list; + cache->size_list_size = page_index + 1; + } + + bo->free_time = time; + insert_at_tail(&cache->size_list[page_index], &bo->size_list); + insert_at_tail(&cache->time_list, &bo->time_list); + + free_stale_bos(screen, time); +} + static struct vc4_bo * vc4_bo_open_handle(struct vc4_screen *screen, uint32_t winsys_stride, @@ -103,6 +214,7 @@ vc4_bo_open_handle(struct vc4_screen *screen, bo->handle = handle; bo->size = size; bo->name = "winsys"; + bo->private = false; #ifdef USE_VC4_SIMULATOR vc4_bo_map(bo); @@ -194,6 +306,7 @@ vc4_bo_flink(struct vc4_bo *bo, uint32_t *name) return false; } + bo->private = false; *name = flink.name; return true; @@ -289,3 +402,19 @@ vc4_bo_map(struct vc4_bo *bo) return map; } + +void +vc4_bufmgr_destroy(struct pipe_screen *pscreen) +{ + struct vc4_screen *screen = 
vc4_screen(pscreen); + struct vc4_bo_cache *cache = &screen->bo_cache; + + while (!is_empty_list(&cache->time_list)) { + struct simple_node *node = first_elem(&cache->time_list); + struct vc4_bo *bo = container_of(node, struct vc4_bo, time_list); + + remove_from_list(&bo->time_list); + remove_from_list(&bo->size_list); + vc4_bo_free(bo); + } +} diff --git a/src/gallium/drivers/vc4/vc4_bufmgr.h b/src/gallium/drivers/vc4/vc4_bufmgr.h index baaecfdfd3f..f9559e999a1 100644 --- a/src/gallium/drivers/vc4/vc4_bufmgr.h +++ b/src/gallium/drivers/vc4/vc4_bufmgr.h @@ -26,6 +26,7 @@ #include #include "util/u_inlines.h" +#include "vc4_qir.h" struct vc4_context; @@ -41,13 +42,26 @@ struct vc4_bo { void *simulator_winsys_map; uint32_t simulator_winsys_stride; #endif + + /** Entry in the linked list of buffers freed, by age. */ + struct simple_node time_list; + /** Entry in the per-page-count linked list of buffers freed (by age). */ + struct simple_node size_list; + /** Approximate second when the bo was freed. */ + time_t free_time; + /** + * Whether only our process has a reference to the BO (meaning that + * it's safe to reuse it in the BO cache). 
+ */ + bool private; }; struct vc4_bo *vc4_bo_alloc(struct vc4_screen *screen, uint32_t size, const char *name); struct vc4_bo *vc4_bo_alloc_mem(struct vc4_screen *screen, const void *data, uint32_t size, const char *name); -void vc4_bo_free(struct vc4_bo *bo); +void vc4_bo_last_unreference(struct vc4_bo *bo); +void vc4_bo_last_unreference_locked_timed(struct vc4_bo *bo, time_t time); struct vc4_bo *vc4_bo_open_name(struct vc4_screen *screen, uint32_t name, uint32_t winsys_stride); struct vc4_bo *vc4_bo_open_dmabuf(struct vc4_screen *screen, int fd, @@ -59,7 +73,7 @@ static inline void vc4_bo_set_reference(struct vc4_bo **old_bo, struct vc4_bo *new_bo) { if (pipe_reference(&(*old_bo)->reference, &new_bo->reference)) - vc4_bo_free(*old_bo); + vc4_bo_last_unreference(*old_bo); *old_bo = new_bo; } @@ -77,7 +91,18 @@ vc4_bo_unreference(struct vc4_bo **bo) return; if (pipe_reference(&(*bo)->reference, NULL)) - vc4_bo_free(*bo); + vc4_bo_last_unreference(*bo); + *bo = NULL; +} + +static inline void +vc4_bo_unreference_locked_timed(struct vc4_bo **bo, time_t time) +{ + if (!*bo) + return; + + if (pipe_reference(&(*bo)->reference, NULL)) + vc4_bo_last_unreference_locked_timed(*bo, time); *bo = NULL; } @@ -93,5 +118,8 @@ vc4_bo_wait(struct vc4_bo *bo, uint64_t timeout_ns); bool vc4_wait_seqno(struct vc4_screen *screen, uint64_t seqno, uint64_t timeout_ns); +void +vc4_bufmgr_destroy(struct pipe_screen *pscreen); + #endif /* VC4_BUFMGR_H */ diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c index b532cc6782f..8d216338bf7 100644 --- a/src/gallium/drivers/vc4/vc4_screen.c +++ b/src/gallium/drivers/vc4/vc4_screen.c @@ -76,6 +76,7 @@ vc4_screen_get_vendor(struct pipe_screen *pscreen) static void vc4_screen_destroy(struct pipe_screen *pscreen) { + vc4_bufmgr_destroy(pscreen); ralloc_free(pscreen); } @@ -449,6 +450,7 @@ vc4_screen_create(int fd) pscreen->is_format_supported = vc4_screen_is_format_supported; screen->fd = fd; + 
make_empty_list(&screen->bo_cache.time_list); vc4_fence_init(screen); diff --git a/src/gallium/drivers/vc4/vc4_screen.h b/src/gallium/drivers/vc4/vc4_screen.h index 4a8b1f4577d..50a763f9a5e 100644 --- a/src/gallium/drivers/vc4/vc4_screen.h +++ b/src/gallium/drivers/vc4/vc4_screen.h @@ -25,7 +25,9 @@ #define VC4_SCREEN_H #include "pipe/p_screen.h" +#include "os/os_thread.h" #include "state_tracker/drm_driver.h" +#include "vc4_qir.h" struct vc4_bo; @@ -55,6 +57,16 @@ struct vc4_screen { * if we know the job's already done. */ uint64_t finished_seqno; + + struct vc4_bo_cache { + /** List of struct vc4_bo freed, by age. */ + struct simple_node time_list; + /** List of struct vc4_bo freed, per size, by age. */ + struct simple_node *size_list; + uint32_t size_list_size; + + pipe_mutex lock; + } bo_cache; }; static inline struct vc4_screen * -- 2.30.2