vc4: Add a userspace BO cache.
author Eric Anholt <eric@anholt.net>
Sat, 13 Dec 2014 23:27:39 +0000 (15:27 -0800)
committer Eric Anholt <eric@anholt.net>
Thu, 18 Dec 2014 00:07:01 +0000 (16:07 -0800)
Since our kernel BOs require CMA allocation, and the use of them requires
new mmaps, it's pretty expensive and we should avoid it if possible.
Copying my original design for Intel, make a userspace cache that reuses
BOs that haven't been shared to other processes but frees BOs that have
sat in the cache for over a second.

Improves glxgears framerate on RPi by around 30%.

src/gallium/drivers/vc4/vc4_bufmgr.c
src/gallium/drivers/vc4/vc4_bufmgr.h
src/gallium/drivers/vc4/vc4_screen.c
src/gallium/drivers/vc4/vc4_screen.h

index 64fe2e40e42cf96d53391e38464c51b91f6b2373..34596be537d0b1d38a7d81551db8db210935ff8e 100644 (file)
 #include <xf86drmMode.h>
 
 #include "util/u_memory.h"
+#include "util/ralloc.h"
 
 #include "vc4_context.h"
 #include "vc4_screen.h"
 
+/* Recovers a pointer to the enclosing "type" from a pointer to its "field"
+ * member.  The argument and the whole expansion are parenthesized so that
+ * expression arguments and postfix use (container_of(...)->x) parse as
+ * intended.
+ */
+#define container_of(ptr, type, field) \
+   ((type *)((char *)(ptr) - offsetof(type, field)))
+
+/**
+ * Tries to satisfy an allocation of "size" bytes from the screen's BO cache.
+ *
+ * vc4_bo_alloc() aligns "size" to 4096 before calling, so size / 4096 - 1
+ * selects the per-page-count list.  On a hit, the BO at the tail of that list
+ * is unlinked from both the size and time lists, its reference count is reset
+ * to 1 and it is relabelled with "name".
+ *
+ * Returns the reused BO, or NULL if no BO of that size is cached.
+ */
+static struct vc4_bo *
+vc4_bo_from_cache(struct vc4_screen *screen, uint32_t size, const char *name)
+{
+        struct vc4_bo_cache *cache = &screen->bo_cache;
+        /* A size of 0 wraps to UINT32_MAX and fails the bounds check below. */
+        uint32_t page_index = size / 4096 - 1;
+        struct vc4_bo *bo = NULL;
+
+        /* size_list and size_list_size are rewritten under the lock when the
+         * cache grows, so the bounds check has to happen under it too.
+         */
+        pipe_mutex_lock(cache->lock);
+        if (page_index < cache->size_list_size &&
+            !is_empty_list(&cache->size_list[page_index])) {
+                struct simple_node *node = last_elem(&cache->size_list[page_index]);
+                bo = container_of(node, struct vc4_bo, size_list);
+                pipe_reference_init(&bo->reference, 1);
+                remove_from_list(&bo->time_list);
+                remove_from_list(&bo->size_list);
+
+                bo->name = name;
+        }
+        pipe_mutex_unlock(cache->lock);
+        return bo;
+}
+
 struct vc4_bo *
 vc4_bo_alloc(struct vc4_screen *screen, uint32_t size, const char *name)
 {
-        struct vc4_bo *bo = CALLOC_STRUCT(vc4_bo);
+        struct vc4_bo *bo;
+        size = align(size, 4096);
+
+        bo = vc4_bo_from_cache(screen, size, name);
+        if (bo)
+                return bo;
+
+        bo = CALLOC_STRUCT(vc4_bo);
         if (!bo)
                 return NULL;
 
@@ -44,6 +79,7 @@ vc4_bo_alloc(struct vc4_screen *screen, uint32_t size, const char *name)
         bo->screen = screen;
         bo->size = size;
         bo->name = name;
+        bo->private = true;
 
         struct drm_mode_create_dumb create;
         memset(&create, 0, sizeof(create));
@@ -65,6 +101,18 @@ vc4_bo_alloc(struct vc4_screen *screen, uint32_t size, const char *name)
 }
 
+/**
+ * Called when the last reference to "bo" has been dropped.
+ *
+ * Samples CLOCK_MONOTONIC and, with the BO cache lock held, hands the BO and
+ * the whole-second part of that timestamp to
+ * vc4_bo_last_unreference_locked_timed().
+ */
 void
+vc4_bo_last_unreference(struct vc4_bo *bo)
+{
+        struct vc4_screen *screen = bo->screen;
+
+        struct timespec time;
+        clock_gettime(CLOCK_MONOTONIC, &time);
+        pipe_mutex_lock(screen->bo_cache.lock);
+        vc4_bo_last_unreference_locked_timed(bo, time.tv_sec);
+        pipe_mutex_unlock(screen->bo_cache.lock);
+}
+
+static void
 vc4_bo_free(struct vc4_bo *bo)
 {
         struct vc4_screen *screen = bo->screen;
@@ -89,6 +137,69 @@ vc4_bo_free(struct vc4_bo *bo)
         free(bo);
 }
 
+/**
+ * Frees cached BOs, starting from the head of the cache's time list, until
+ * it reaches one whose free_time is within 2 seconds of "time" or the list
+ * is empty.
+ *
+ * Takes no lock itself; it is called from
+ * vc4_bo_last_unreference_locked_timed().
+ */
+static void
+free_stale_bos(struct vc4_screen *screen, time_t time)
+{
+        while (!is_empty_list(&screen->bo_cache.time_list)) {
+                struct simple_node *node =
+                        first_elem(&screen->bo_cache.time_list);
+                struct vc4_bo *bo = container_of(node, struct vc4_bo, time_list);
+
+                /* Timestamps are whole seconds: free the BO once the
+                 * difference exceeds 2 (i.e. it has been cached for more
+                 * than two seconds), otherwise stop at this BO.
+                 */
+                if (time - bo->free_time > 2) {
+                        remove_from_list(&bo->time_list);
+                        remove_from_list(&bo->size_list);
+                        vc4_bo_free(bo);
+                } else {
+                        break;
+                }
+        }
+}
+
+/**
+ * Disposes of a BO whose last reference has been dropped.
+ *
+ * vc4_bo_last_unreference() calls this with the BO cache lock held; other
+ * callers are presumably expected to hold it as well.
+ *
+ * BOs that are not private (opened from a handle, or flinked) are freed
+ * immediately.  Private BOs are stamped with "time" and appended to both the
+ * per-size list and the global time list of the cache, growing the per-size
+ * list array if needed; BOs that cannot be cached are freed instead.
+ * Finally, cached BOs that have become stale relative to "time" are freed.
+ *
+ * \param bo    the BO being released
+ * \param time  current time in whole seconds
+ */
+void
+vc4_bo_last_unreference_locked_timed(struct vc4_bo *bo, time_t time)
+{
+        struct vc4_screen *screen = bo->screen;
+        struct vc4_bo_cache *cache = &screen->bo_cache;
+
+        /* Shared BOs can't be reused, and a BO smaller than one page has no
+         * valid size-list index (bo->size / 4096 - 1 would wrap around).
+         */
+        if (!bo->private || bo->size < 4096) {
+                vc4_bo_free(bo);
+                return;
+        }
+
+        uint32_t page_index = bo->size / 4096 - 1;
+
+        if (cache->size_list_size <= page_index) {
+                struct simple_node *new_list =
+                        ralloc_array(screen, struct simple_node, page_index + 1);
+
+                /* Without a list head to hang it on, the BO can't be cached. */
+                if (!new_list) {
+                        vc4_bo_free(bo);
+                        return;
+                }
+
+                /* Move old list contents over (since the array has moved, and
+                 * therefore the pointers to the list heads have to change).
+                 */
+                for (uint32_t i = 0; i < cache->size_list_size; i++) {
+                        struct simple_node *old_head = &cache->size_list[i];
+                        if (is_empty_list(old_head))
+                                make_empty_list(&new_list[i]);
+                        else {
+                                new_list[i].next = old_head->next;
+                                new_list[i].prev = old_head->prev;
+                                new_list[i].next->prev = &new_list[i];
+                                new_list[i].prev->next = &new_list[i];
+                        }
+                }
+                for (uint32_t i = cache->size_list_size; i < page_index + 1; i++)
+                        make_empty_list(&new_list[i]);
+
+                /* Nothing links to the old heads any more, so release the
+                 * old array instead of keeping it until the screen is freed.
+                 */
+                ralloc_free(cache->size_list);
+                cache->size_list = new_list;
+                cache->size_list_size = page_index + 1;
+        }
+
+        bo->free_time = time;
+        insert_at_tail(&cache->size_list[page_index], &bo->size_list);
+        insert_at_tail(&cache->time_list, &bo->time_list);
+
+        free_stale_bos(screen, time);
+}
+
 static struct vc4_bo *
 vc4_bo_open_handle(struct vc4_screen *screen,
                    uint32_t winsys_stride,
@@ -103,6 +214,7 @@ vc4_bo_open_handle(struct vc4_screen *screen,
         bo->handle = handle;
         bo->size = size;
         bo->name = "winsys";
+        bo->private = false;
 
 #ifdef USE_VC4_SIMULATOR
         vc4_bo_map(bo);
@@ -194,6 +306,7 @@ vc4_bo_flink(struct vc4_bo *bo, uint32_t *name)
                 return false;
         }
 
+        bo->private = false;
         *name = flink.name;
 
         return true;
@@ -289,3 +402,19 @@ vc4_bo_map(struct vc4_bo *bo)
 
         return map;
 }
+
+/**
+ * Empties the screen's BO cache: every BO still on the cache's time list is
+ * unlinked from both cache lists and freed.
+ *
+ * Does not take the cache lock; vc4_screen_destroy() calls it right before
+ * freeing the screen.
+ */
+void
+vc4_bufmgr_destroy(struct pipe_screen *pscreen)
+{
+        struct vc4_screen *screen = vc4_screen(pscreen);
+        struct vc4_bo_cache *cache = &screen->bo_cache;
+
+        while (!is_empty_list(&cache->time_list)) {
+                struct simple_node *node = first_elem(&cache->time_list);
+                struct vc4_bo *bo = container_of(node, struct vc4_bo, time_list);
+
+                remove_from_list(&bo->time_list);
+                remove_from_list(&bo->size_list);
+                vc4_bo_free(bo);
+        }
+}
index baaecfdfd3fe9fffa2e99a15303a0c593397649f..f9559e999a131b35f79d353b52c31c11ffd010dd 100644 (file)
@@ -26,6 +26,7 @@
 
 #include <stdint.h>
 #include "util/u_inlines.h"
+#include "vc4_qir.h"
 
 struct vc4_context;
 
@@ -41,13 +42,26 @@ struct vc4_bo {
         void *simulator_winsys_map;
         uint32_t simulator_winsys_stride;
 #endif
+
+        /** Entry in the linked list of buffers freed, by age. */
+        struct simple_node time_list;
+        /** Entry in the per-page-count linked list of buffers freed (by age). */
+        struct simple_node size_list;
+        /** Approximate second when the bo was freed. */
+        time_t free_time;
+        /**
+         * Whether only our process has a reference to the BO (meaning that
+         * it's safe to reuse it in the BO cache).
+         */
+        bool private;
 };
 
 struct vc4_bo *vc4_bo_alloc(struct vc4_screen *screen, uint32_t size,
                             const char *name);
 struct vc4_bo *vc4_bo_alloc_mem(struct vc4_screen *screen, const void *data,
                                 uint32_t size, const char *name);
-void vc4_bo_free(struct vc4_bo *bo);
+void vc4_bo_last_unreference(struct vc4_bo *bo);
+void vc4_bo_last_unreference_locked_timed(struct vc4_bo *bo, time_t time);
 struct vc4_bo *vc4_bo_open_name(struct vc4_screen *screen, uint32_t name,
                                 uint32_t winsys_stride);
 struct vc4_bo *vc4_bo_open_dmabuf(struct vc4_screen *screen, int fd,
@@ -59,7 +73,7 @@ static inline void
 vc4_bo_set_reference(struct vc4_bo **old_bo, struct vc4_bo *new_bo)
 {
         if (pipe_reference(&(*old_bo)->reference, &new_bo->reference))
-                vc4_bo_free(*old_bo);
+                vc4_bo_last_unreference(*old_bo);
         *old_bo = new_bo;
 }
 
@@ -77,7 +91,18 @@ vc4_bo_unreference(struct vc4_bo **bo)
                 return;
 
         if (pipe_reference(&(*bo)->reference, NULL))
-                vc4_bo_free(*bo);
+                vc4_bo_last_unreference(*bo);
+        *bo = NULL;
+}
+
+/**
+ * Drops one reference to *bo and clears the pointer; a NULL *bo is a no-op.
+ *
+ * If that was the last reference, the BO is passed to
+ * vc4_bo_last_unreference_locked_timed() together with "time".  Unlike
+ * vc4_bo_unreference(), no lock is taken and no clock is read here, so the
+ * caller presumably must already hold the BO cache lock -- TODO confirm
+ * against callers.
+ */
+static inline void
+vc4_bo_unreference_locked_timed(struct vc4_bo **bo, time_t time)
+{
+        if (!*bo)
+                return;
+
+        if (pipe_reference(&(*bo)->reference, NULL))
+                vc4_bo_last_unreference_locked_timed(*bo, time);
         *bo = NULL;
 }
 
@@ -93,5 +118,8 @@ vc4_bo_wait(struct vc4_bo *bo, uint64_t timeout_ns);
 bool
 vc4_wait_seqno(struct vc4_screen *screen, uint64_t seqno, uint64_t timeout_ns);
 
+void
+vc4_bufmgr_destroy(struct pipe_screen *pscreen);
+
 #endif /* VC4_BUFMGR_H */
 
index b532cc6782fc3c79ab6d370144b4876dffe06933..8d216338bf7fdd84a4865c372b896938ca4d786f 100644 (file)
@@ -76,6 +76,7 @@ vc4_screen_get_vendor(struct pipe_screen *pscreen)
 static void
 vc4_screen_destroy(struct pipe_screen *pscreen)
 {
+        /* Drain the BO cache first: its list heads live in (or are ralloc'd
+         * against) the screen that is freed on the next line.
+         */
+        vc4_bufmgr_destroy(pscreen);
         ralloc_free(pscreen);
 }
 
@@ -449,6 +450,7 @@ vc4_screen_create(int fd)
         pscreen->is_format_supported = vc4_screen_is_format_supported;
 
         screen->fd = fd;
+        make_empty_list(&screen->bo_cache.time_list);
 
         vc4_fence_init(screen);
 
index 4a8b1f4577d6cdf31f55c6a95fe1b90175bd6200..50a763f9a5ed969bf2b06d4cf96bfa2021c21dfd 100644 (file)
@@ -25,7 +25,9 @@
 #define VC4_SCREEN_H
 
 #include "pipe/p_screen.h"
+#include "os/os_thread.h"
 #include "state_tracker/drm_driver.h"
+#include "vc4_qir.h"
 
 struct vc4_bo;
 
@@ -55,6 +57,16 @@ struct vc4_screen {
          * if we know the job's already done.
          */
         uint64_t finished_seqno;
+
+        struct vc4_bo_cache {
+                /** List of struct vc4_bo freed, by age. */
+                struct simple_node time_list;
+                /** List of struct vc4_bo freed, per size, by age. */
+                struct simple_node *size_list;
+                uint32_t size_list_size;
+
+                pipe_mutex lock;
+        } bo_cache;
 };
 
 static inline struct vc4_screen *