From e4aa8338c30de2f99de86bcfb1a3a39a21cea5e9 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Tue, 3 Apr 2018 01:40:23 -0700
Subject: [PATCH] iris: Soft-pin the universe

Breaks everything, woo!
---
 src/gallium/drivers/iris/iris_batch.c    |   4 +-
 src/gallium/drivers/iris/iris_bufmgr.c   | 237 +++++++++++++++++++++--
 src/gallium/drivers/iris/iris_bufmgr.h   |  43 ++--
 src/gallium/drivers/iris/iris_resource.c |  17 +-
 src/gallium/drivers/iris/iris_screen.c   |   3 -
 src/gallium/drivers/iris/iris_screen.h   |   2 -
 6 files changed, 251 insertions(+), 55 deletions(-)

diff --git a/src/gallium/drivers/iris/iris_batch.c b/src/gallium/drivers/iris/iris_batch.c
index 7a5c3df3a2e..72ede9f53da 100644
--- a/src/gallium/drivers/iris/iris_batch.c
+++ b/src/gallium/drivers/iris/iris_batch.c
@@ -95,7 +95,7 @@ create_batch_buffer(struct iris_bufmgr *bufmgr,
                     struct iris_batch_buffer *buf,
                     const char *name, unsigned size)
 {
-   buf->bo = iris_bo_alloc(bufmgr, name, size);
+   buf->bo = iris_bo_alloc(bufmgr, name, size, IRIS_MEMZONE_OTHER);
    buf->bo->kflags |= EXEC_OBJECT_CAPTURE;
    buf->map = iris_bo_map(NULL, buf->bo, MAP_READ | MAP_WRITE);
    buf->map_next = buf->map;
@@ -299,7 +299,7 @@ grow_buffer(struct iris_batch *batch,
    const unsigned existing_bytes = buffer_bytes_used(buf);
 
    struct iris_bo *new_bo =
-      iris_bo_alloc(bufmgr, bo->name, new_size);
+      iris_bo_alloc(bufmgr, bo->name, new_size, IRIS_MEMZONE_OTHER);
 
    buf->map = iris_bo_map(NULL, new_bo, MAP_READ | MAP_WRITE);
    buf->map_next = buf->map + existing_bytes;
diff --git a/src/gallium/drivers/iris/iris_bufmgr.c b/src/gallium/drivers/iris/iris_bufmgr.c
index 9bc101ee5ab..80e620b19e2 100644
--- a/src/gallium/drivers/iris/iris_bufmgr.c
+++ b/src/gallium/drivers/iris/iris_bufmgr.c
@@ -51,6 +51,8 @@
 #include "util/macros.h"
 #include "util/hash_table.h"
 #include "util/list.h"
+#include "util/u_dynarray.h"
+#include "util/vma.h"
 #include "iris_bufmgr.h"
 #include "iris_context.h"
 #include "string.h"
@@ -93,8 +95,6 @@ drm_ioctl(int fd, unsigned long request, void *arg)
    return ret;
 }
 
-
-
 static inline int
 atomic_add_unless(int *v, int add, int unless)
 {
@@ -105,9 +105,37 @@ atomic_add_unless(int *v, int add, int unless)
    return c == unless;
 }
 
+/*
+ * Idea:
+ *
+ * Have a bitmap-allocator for each BO cache bucket size.  Because bo_alloc
+ * rounds up allocations to the bucket size anyway, we can make 1 bit in
+ * the bitmap represent N pages of memory, where N = bucket size / PAGE_SIZE.
+ * Allocations and frees always set/unset a single bit.  Because ffsll only
+ * works on uint64_t, use a tree(?) of those.
+ *
+ * Nodes contain a starting address and a uint64_t bitmap.  (pair-of-uint64_t)
+ * Bitmap uses 1 for a free block, 0 for in-use.
+ *
+ * Bucket contains...
+ *
+ * Dynamic array of nodes.  (pointer, two ints)
+ */
+
+struct vma_bucket_node {
+   uint64_t start_address;
+   uint64_t bitmap;
+};
+
 struct bo_cache_bucket {
+   /** List of cached BOs. */
    struct list_head head;
+
+   /** Size of this bucket, in bytes. */
    uint64_t size;
+
+   /** List of vma_bucket_nodes */
+   struct util_dynarray vma_list[IRIS_MEMZONE_COUNT];
 };
 
 struct iris_bufmgr {
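An aside on the "Idea" comment in the hunk above: one bitmap node is easy to
see in miniature. The following is a minimal, self-contained sketch, not code
from this patch; block_size is a stand-in for bucket->size, and ffsll() is
assumed to be available as on glibc.

#include <assert.h>
#include <stdint.h>
#include <string.h>   /* ffsll() lives here on glibc; <strings.h> elsewhere */

/* One 64-block node: bit i covers
 *   [start_address + i * block_size, start_address + (i + 1) * block_size)
 * A set bit means the block is free; a clear bit means it is in use. */
struct node_sketch {
   uint64_t start_address;
   uint64_t bitmap;
};

static uint64_t
sketch_take_block(struct node_sketch *n, uint64_t block_size)
{
   int bit = ffsll(n->bitmap) - 1;   /* index of any free block */
   assert(bit != -1);                /* caller ensures the node isn't full */
   n->bitmap &= ~(1ull << bit);      /* clear = mark in-use */
   return n->start_address + (uint64_t)bit * block_size;
}

static void
sketch_return_block(struct node_sketch *n, uint64_t addr, uint64_t block_size)
{
   int bit = (addr - n->start_address) / block_size;
   n->bitmap |= 1ull << bit;         /* set = mark free again */
}

bucket_vma_alloc()/bucket_vma_free() below are essentially this, plus a
dynamic array of nodes per (bucket, memory zone) and the bookkeeping to
create and retire nodes.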
@@ -123,6 +151,8 @@ struct iris_bufmgr {
    struct hash_table *name_table;
    struct hash_table *handle_table;
 
+   struct util_vma_heap vma_allocator[IRIS_MEMZONE_COUNT];
+
    bool has_llc:1;
    bool bo_reuse:1;
 };
@@ -132,6 +162,10 @@ static int bo_set_tiling_internal(struct iris_bo *bo, uint32_t tiling_mode,
                                   uint32_t stride);
 
 static void bo_free(struct iris_bo *bo);
 
+static uint64_t vma_alloc(struct iris_bufmgr *bufmgr,
+                          enum iris_memory_zone memzone,
+                          uint64_t size, uint64_t alignment);
+
 static uint32_t
 key_hash_uint(const void *key)
 {
@@ -191,6 +225,141 @@ bucket_for_size(struct iris_bufmgr *bufmgr, uint64_t size)
           &bufmgr->cache_bucket[index] : NULL;
 }
 
+static enum iris_memory_zone
+memzone_for_address(uint64_t address)
+{
+   const uint64_t _4GB = 1ull << 32;
+
+   if (address >= 3 * _4GB)
+      return IRIS_MEMZONE_OTHER;
+
+   if (address >= 2 * _4GB)
+      return IRIS_MEMZONE_DYNAMIC;
+
+   if (address >= 1 * _4GB)
+      return IRIS_MEMZONE_SURFACE;
+
+   return IRIS_MEMZONE_SHADER;
+}
+
+static uint64_t
+bucket_vma_alloc(struct iris_bufmgr *bufmgr,
+                 struct bo_cache_bucket *bucket,
+                 enum iris_memory_zone memzone)
+{
+   struct util_dynarray *vma_list = &bucket->vma_list[memzone];
+   struct vma_bucket_node *node;
+
+   if (vma_list->size == 0) {
+      /* This bucket allocator is out of space - allocate a new block of
+       * memory from a larger allocator (either another bucket or util_vma),
+       * aligned to the node size so the free path can recover the node's
+       * start address by rounding down.  Set the first bit used, and
+       * return the start address.
+       */
+      node = util_dynarray_grow(vma_list, sizeof(struct vma_bucket_node));
+      node->start_address =
+         vma_alloc(bufmgr, memzone, 64ull * bucket->size,
+                   64ull * bucket->size);
+      node->bitmap = ~1ull;
+      return node->start_address;
+   }
+
+   /* Pick any bit from any node - they're all the right size and free. */
+   node = util_dynarray_top_ptr(vma_list, struct vma_bucket_node);
+   int bit = ffsll(node->bitmap) - 1;
+   assert(bit != -1);
+
+   /* Reserve the memory by clearing the bit. */
+   node->bitmap &= ~(1ull << bit);
+
+   /* If this node is now completely full, remove it from the free list. */
+   if (node->bitmap == 0ull) {
+      (void) util_dynarray_pop(vma_list, struct vma_bucket_node);
+   }
+
+   return node->start_address + bit * bucket->size;
+}
+
+static void
+bucket_vma_free(struct bo_cache_bucket *bucket,
+                uint64_t address,
+                uint64_t size)
+{
+   enum iris_memory_zone memzone = memzone_for_address(address);
+   struct util_dynarray *vma_list = &bucket->vma_list[memzone];
+   const uint64_t node_bytes = 64ull * bucket->size;
+   struct vma_bucket_node *node = NULL;
+
+   /* Nodes are aligned to node_bytes, so rounding down recovers the
+    * start address of the node containing this block.
+    */
+   uint64_t start = (address / node_bytes) * node_bytes;
+   int bit = (address - start) / bucket->size;
+
+   util_dynarray_foreach(vma_list, struct vma_bucket_node, cur) {
+      if (cur->start_address == start) {
+         node = cur;
+         break;
+      }
+   }
+
+   if (!node) {
+      node = util_dynarray_grow(vma_list, sizeof(struct vma_bucket_node));
+      node->start_address = start;
+      node->bitmap = 0ull;
+   }
+
+   node->bitmap |= 1ull << bit;
+
+   /* The block might be entirely free now, and if so, we could return it
+    * to the larger allocator.  But we may as well hang on to it, in case
+    * we get more allocations at this block size.
+    */
+}
+
+static struct bo_cache_bucket *
+get_bucket_allocator(struct iris_bufmgr *bufmgr, uint64_t size)
+{
+   /* Skip using the bucket allocator for very large sizes, as it grabs
+    * 64 blocks of that size at once, and this can balloon rather quickly.
+    */
+   if (size > 1024 * PAGE_SIZE)
+      return NULL;
+
+   struct bo_cache_bucket *bucket = bucket_for_size(bufmgr, size);
+
+   if (bucket && bucket->size == size)
+      return bucket;
+
+   return NULL;
+}
+
+static uint64_t
+vma_alloc(struct iris_bufmgr *bufmgr,
+          enum iris_memory_zone memzone,
+          uint64_t size,
+          uint64_t alignment)
+{
+   struct bo_cache_bucket *bucket = get_bucket_allocator(bufmgr, size);
+
+   if (bucket)
+      return bucket_vma_alloc(bufmgr, bucket, memzone);
+
+   return util_vma_heap_alloc(&bufmgr->vma_allocator[memzone], size,
+                              alignment);
+}
+
+static void
+vma_free(struct iris_bufmgr *bufmgr,
+         uint64_t address,
+         uint64_t size)
+{
+   struct bo_cache_bucket *bucket = get_bucket_allocator(bufmgr, size);
+
+   if (bucket) {
+      bucket_vma_free(bucket, address, size);
+   } else {
+      enum iris_memory_zone memzone = memzone_for_address(address);
+      util_vma_heap_free(&bufmgr->vma_allocator[memzone], address, size);
+   }
+}
+
 int
 iris_bo_busy(struct iris_bo *bo)
 {
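To make the dispatch concrete, here is a hypothetical caller, given some
struct iris_bufmgr *bufmgr (sizes assume PAGE_SIZE is 4096; this is an
illustration, not code from the patch):

/* Three pages match the 12288-byte cache bucket exactly, so the address
 * is carved out of a 64-block bitmap node spanning 64 * 12288 = 786432
 * bytes. */
uint64_t small = vma_alloc(bufmgr, IRIS_MEMZONE_OTHER, 3 * PAGE_SIZE, 1);

/* Eight megabytes exceed the 1024-page cutoff in get_bucket_allocator(),
 * so the request falls through to util_vma_heap_alloc(). */
uint64_t large = vma_alloc(bufmgr, IRIS_MEMZONE_OTHER, 8 * 1024 * 1024,
                           PAGE_SIZE);

/* Frees retrace the same decision from (address, size) alone: for 'small',
 * bucket_vma_free() rounds the address down to a multiple of 786432 to
 * find its node, which is why nodes are allocated with node-size
 * alignment above. */
vma_free(bufmgr, small, 3 * PAGE_SIZE);
vma_free(bufmgr, large, 8 * 1024 * 1024);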
@@ -237,6 +406,7 @@ static struct iris_bo *
 bo_alloc_internal(struct iris_bufmgr *bufmgr,
                   const char *name,
                   uint64_t size,
+                  enum iris_memory_zone memzone,
                   unsigned flags,
                   uint32_t tiling_mode,
                   uint32_t stride)
@@ -303,7 +473,15 @@ retry:
       }
    }
 
-   if (!alloc_from_cache) {
+   if (alloc_from_cache) {
+      /* If the cached BO isn't in the right memory zone, free the old
+       * memory and assign it a new address.
+       */
+      if (memzone != memzone_for_address(bo->gtt_offset)) {
+         vma_free(bufmgr, bo->gtt_offset, bo->size);
+         bo->gtt_offset = 0ull;
+      }
+   } else {
       bo = calloc(1, sizeof(*bo));
       if (!bo)
          goto err;
@@ -325,6 +503,7 @@ retry:
 
       bo->gem_handle = create.handle;
       bo->bufmgr = bufmgr;
+      bo->kflags = EXEC_OBJECT_PINNED;
 
       bo->tiling_mode = I915_TILING_NONE;
       bo->swizzle_mode = I915_BIT_6_SWIZZLE_NONE;
@@ -347,6 +526,13 @@ retry:
          goto err_free;
    }
 
+   if (bo->gtt_offset == 0ull) {
+      bo->gtt_offset = vma_alloc(bufmgr, memzone, bo->size, 1);
+
+      if (bo->gtt_offset == 0ull)
+         goto err_free;
+   }
+
    bo->name = name;
    p_atomic_set(&bo->refcount, 1);
    bo->reusable = true;
@@ -370,17 +556,20 @@ err:
 
 struct iris_bo *
 iris_bo_alloc(struct iris_bufmgr *bufmgr,
               const char *name,
-              uint64_t size)
+              uint64_t size,
+              enum iris_memory_zone memzone)
 {
-   return bo_alloc_internal(bufmgr, name, size, 0, I915_TILING_NONE, 0);
+   return bo_alloc_internal(bufmgr, name, size, memzone,
+                            0, I915_TILING_NONE, 0);
 }
 
 struct iris_bo *
 iris_bo_alloc_tiled(struct iris_bufmgr *bufmgr, const char *name,
-                    uint64_t size, uint32_t tiling_mode, uint32_t pitch,
-                    unsigned flags)
+                    uint64_t size, enum iris_memory_zone memzone,
+                    uint32_t tiling_mode, uint32_t pitch, unsigned flags)
 {
-   return bo_alloc_internal(bufmgr, name, size, flags, tiling_mode, pitch);
+   return bo_alloc_internal(bufmgr, name, size, memzone,
+                            flags, tiling_mode, pitch);
 }
 
 /**
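A caller's-eye view of the new parameter (hypothetical names and sizes): the
zone is chosen once, at allocation time, and address assignment, reuse, and
freeing all key off it.

/* Shader kernels go in the first 4 GB, the usual rationale being that
 * kernel start pointers are programmed as 32-bit offsets from the
 * instruction base address in STATE_BASE_ADDRESS. */
struct iris_bo *kernels =
   iris_bo_alloc(bufmgr, "shader kernels", 64 * 1024, IRIS_MEMZONE_SHADER);

/* Buffers with no special addressing needs take the catch-all zone. */
struct iris_bo *scratch =
   iris_bo_alloc(bufmgr, "scratch", 16 * 4096, IRIS_MEMZONE_OTHER);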
@@ -435,11 +624,13 @@ iris_bo_gem_create_from_name(struct iris_bufmgr *bufmgr,
    bo->size = open_arg.size;
    bo->gtt_offset = 0;
    bo->bufmgr = bufmgr;
+   bo->kflags = EXEC_OBJECT_PINNED;
 
    bo->gem_handle = open_arg.handle;
    bo->name = name;
    bo->global_name = handle;
    bo->reusable = false;
    bo->external = true;
+   bo->gtt_offset = vma_alloc(bufmgr, IRIS_MEMZONE_OTHER, bo->size, 1);
 
    _mesa_hash_table_insert(bufmgr->handle_table, &bo->gem_handle, bo);
    _mesa_hash_table_insert(bufmgr->name_table, &bo->global_name, bo);
@@ -494,6 +685,8 @@ bo_free(struct iris_bo *bo)
       _mesa_hash_table_remove(bufmgr->handle_table, entry);
    }
 
+   vma_free(bo->bufmgr, bo->gtt_offset, bo->size);
+
    /* Close this object */
    struct drm_gem_close close = { .handle = bo->gem_handle };
    int ret = drm_ioctl(bufmgr->fd, DRM_IOCTL_GEM_CLOSE, &close);
@@ -542,9 +735,7 @@ bo_unreference_final(struct iris_bo *bo, time_t time)
    if (bufmgr->bo_reuse && bo->reusable && bucket != NULL &&
        iris_bo_madvise(bo, I915_MADV_DONTNEED)) {
       bo->free_time = time;
 
-      bo->name = NULL;
-      bo->kflags = 0;
 
       list_addtail(&bo->head, &bucket->head);
    } else {
@@ -960,6 +1151,9 @@ iris_bufmgr_destroy(struct iris_bufmgr *bufmgr)
          bo_free(bo);
       }
+
+      for (int z = 0; z < IRIS_MEMZONE_COUNT; z++)
+         util_dynarray_fini(&bucket->vma_list[z]);
    }
 
    _mesa_hash_table_destroy(bufmgr->name_table, NULL);
@@ -1052,6 +1246,7 @@ iris_bo_import_dmabuf(struct iris_bufmgr *bufmgr, int prime_fd)
    bo->size = ret;
    bo->bufmgr = bufmgr;
+   bo->kflags = EXEC_OBJECT_PINNED;
 
    bo->gem_handle = handle;
    _mesa_hash_table_insert(bufmgr->handle_table, &bo->gem_handle, bo);
@@ -1059,6 +1254,7 @@ iris_bo_import_dmabuf(struct iris_bufmgr *bufmgr, int prime_fd)
    bo->name = "prime";
    bo->reusable = false;
    bo->external = true;
+   bo->gtt_offset = vma_alloc(bufmgr, IRIS_MEMZONE_OTHER, bo->size, 1);
 
    struct drm_i915_gem_get_tiling get_tiling = { .handle = bo->gem_handle };
    if (drm_ioctl(bufmgr->fd, DRM_IOCTL_I915_GEM_GET_TILING, &get_tiling))
@@ -1164,6 +1360,8 @@ add_bucket(struct iris_bufmgr *bufmgr, int size)
    assert(i < ARRAY_SIZE(bufmgr->cache_bucket));
 
    list_inithead(&bufmgr->cache_bucket[i].head);
+   for (int z = 0; z < IRIS_MEMZONE_COUNT; z++)
+      util_dynarray_init(&bufmgr->cache_bucket[i].vma_list[z], NULL);
    bufmgr->cache_bucket[i].size = size;
    bufmgr->num_buckets++;
 }
@@ -1185,12 +1383,12 @@ init_cache_buckets(struct iris_bufmgr *bufmgr)
     * width/height alignment and rounding of sizes to pages will
     * get us useful cache hit rates anyway)
     */
-   add_bucket(bufmgr, 4096);
-   add_bucket(bufmgr, 4096 * 2);
-   add_bucket(bufmgr, 4096 * 3);
+   add_bucket(bufmgr, PAGE_SIZE);
+   add_bucket(bufmgr, PAGE_SIZE * 2);
+   add_bucket(bufmgr, PAGE_SIZE * 3);
 
    /* Initialize the linked lists for BO reuse cache. */
-   for (size = 4 * 4096; size <= cache_max_size; size *= 2) {
+   for (size = 4 * PAGE_SIZE; size <= cache_max_size; size *= 2) {
       add_bucket(bufmgr, size);
 
       add_bucket(bufmgr, size + size * 1 / 4);
@@ -1284,6 +1482,17 @@ iris_bufmgr_init(struct gen_device_info *devinfo, int fd)
 
    bufmgr->has_llc = devinfo->has_llc;
 
+   const uint64_t _4GB = 1ull << 32;
+
+   util_vma_heap_init(&bufmgr->vma_allocator[IRIS_MEMZONE_SHADER],
+                      PAGE_SIZE, _4GB - PAGE_SIZE);
+   util_vma_heap_init(&bufmgr->vma_allocator[IRIS_MEMZONE_SURFACE],
+                      1 * _4GB, _4GB);
+   util_vma_heap_init(&bufmgr->vma_allocator[IRIS_MEMZONE_DYNAMIC],
+                      2 * _4GB, _4GB);
+   util_vma_heap_init(&bufmgr->vma_allocator[IRIS_MEMZONE_OTHER],
+                      3 * _4GB, (1ull << 48) - 3 * _4GB);
+
    init_cache_buckets(bufmgr);
 
    bufmgr->name_table =
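Written out, the heap ranges above carve the 48-bit PPGTT like so (this is a
reading of the init code, with each zone a [start, start + size) range):

/*   IRIS_MEMZONE_SHADER  : [PAGE_SIZE, 4 GB)  - page 0 is left unused
 *   IRIS_MEMZONE_SURFACE : [4 GB,  8 GB)
 *   IRIS_MEMZONE_DYNAMIC : [8 GB, 12 GB)
 *   IRIS_MEMZONE_OTHER   : [12 GB, 1ull << 48)
 *
 * memzone_for_address() recovers the zone purely by comparing an address
 * against the same 4 GB boundaries, which is what lets vma_free() work
 * without being told which zone an address came from, e.g.: */
assert(memzone_for_address(0xf000) == IRIS_MEMZONE_SHADER);
assert(memzone_for_address(2ull << 32) == IRIS_MEMZONE_DYNAMIC);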
diff --git a/src/gallium/drivers/iris/iris_bufmgr.h b/src/gallium/drivers/iris/iris_bufmgr.h
index fa4df2a53df..3c52c2d8722 100644
--- a/src/gallium/drivers/iris/iris_bufmgr.h
+++ b/src/gallium/drivers/iris/iris_bufmgr.h
@@ -35,6 +35,15 @@
 struct gen_device_info;
 struct pipe_debug_callback;
 
+enum iris_memory_zone {
+   IRIS_MEMZONE_DYNAMIC,
+   IRIS_MEMZONE_SURFACE,
+   IRIS_MEMZONE_SHADER,
+   IRIS_MEMZONE_OTHER,
+};
+
+#define IRIS_MEMZONE_COUNT (IRIS_MEMZONE_OTHER + 1)
+
 struct iris_bo {
    /**
     * Size in bytes of the buffer object.
     *
@@ -51,31 +60,11 @@ struct iris_bo {
    uint32_t gem_handle;
 
    /**
-    * Offset of the buffer inside the Graphics Translation Table.
-    *
-    * This is effectively our GPU address for the buffer and we use it
-    * as our base for all state pointers into the buffer.  However, since the
-    * kernel may be forced to move it around during the course of the
-    * buffer's lifetime, we can only know where the buffer was on the last
-    * execbuf.  We presume, and are usually right, that the buffer will not
-    * move and so we use that last offset for the next batch and by doing
-    * so we can avoid having the kernel perform a relocation fixup pass as
-    * our pointers inside the batch will be using the correct base offset.
-    *
-    * Since we do use it as a base address for the next batch of pointers,
-    * the kernel treats our offset as a request, and if possible will
-    * arrange the buffer to placed at that address (trying to balance
-    * the cost of buffer migration versus the cost of performing
-    * relocations).  Furthermore, we can force the kernel to place the buffer,
-    * or report a failure if we specified a conflicting offset, at our chosen
-    * offset by specifying EXEC_OBJECT_PINNED.
-    *
-    * Note the GTT may be either per context, or shared globally across the
-    * system.  On a shared system, our buffers have to contend for address
-    * space with both aperture mappings and framebuffers and so are more
-    * likely to be moved.  On a full ppGTT system, each batch exists in its
-    * own GTT, and so each buffer may have their own offset within each
-    * context.
+    * Virtual address of the buffer inside the PPGTT (Per-Process Graphics
+    * Translation Table).
+    *
+    * Although each hardware context has its own VMA, we assign BOs to the
+    * same address in all contexts, for simplicity.
     */
    uint64_t gtt_offset;
@@ -156,7 +145,8 @@
  */
 struct iris_bo *iris_bo_alloc(struct iris_bufmgr *bufmgr,
                               const char *name,
-                              uint64_t size);
+                              uint64_t size,
+                              enum iris_memory_zone memzone);
 
 /**
  * Allocate a tiled buffer object.
@@ -174,7 +164,8 @@
 struct iris_bo *iris_bo_alloc_tiled(struct iris_bufmgr *bufmgr,
                                     const char *name,
                                     uint64_t size,
+                                    enum iris_memory_zone memzone,
                                     uint32_t tiling_mode,
                                     uint32_t pitch,
                                     unsigned flags);
 
 /** Takes a reference on a buffer object */
 static inline void
diff --git a/src/gallium/drivers/iris/iris_resource.c b/src/gallium/drivers/iris/iris_resource.c
index 93855741f84..92c9b038a3d 100644
--- a/src/gallium/drivers/iris/iris_resource.c
+++ b/src/gallium/drivers/iris/iris_resource.c
@@ -256,19 +256,20 @@ iris_resource_create_with_modifiers(struct pipe_screen *pscreen,
                  .usage = usage,
                  .tiling_flags = 1 << mod_info->tiling);
 
-   res->bo = iris_bo_alloc_tiled(screen->bufmgr, "resource", res->surf.size_B,
+   enum iris_memory_zone memzone = IRIS_MEMZONE_OTHER;
+   const char *name = "resource";
+   if (templ->flags & IRIS_RESOURCE_FLAG_INSTRUCTION_CACHE) {
+      memzone = IRIS_MEMZONE_SHADER;
+      name = "shader kernels";
+   }
+
+   res->bo = iris_bo_alloc_tiled(screen->bufmgr, name, res->surf.size_B,
+                                 memzone,
                                  isl_tiling_to_i915_tiling(res->surf.tiling),
                                  res->surf.row_pitch_B, 0);
    if (!res->bo)
       goto fail;
 
-   if (templ->flags & IRIS_RESOURCE_FLAG_INSTRUCTION_CACHE) {
-      res->bo->kflags = EXEC_OBJECT_PINNED;
-      res->bo->name = "instruction cache";
-      // XXX: p_atomic_add is backwards :(
-      res->bo->gtt_offset = __atomic_fetch_add(&screen->next_instruction_address, res->bo->size, __ATOMIC_ACQ_REL);
-   }
-
    return &res->base;
 
 fail:
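From the caller's side, shader placement is now a side effect of ordinary
resource creation rather than the hand-rolled pinning deleted above. A
hypothetical usage (IRIS_RESOURCE_FLAG_INSTRUCTION_CACHE is driver-internal,
and the field values here are invented):

struct pipe_resource templ = {
   .target = PIPE_BUFFER,
   .format = PIPE_FORMAT_R8_UNORM,
   .width0 = 64 * 1024,
   .height0 = 1,
   .depth0 = 1,
   .array_size = 1,
   .flags = IRIS_RESOURCE_FLAG_INSTRUCTION_CACHE,
};
struct pipe_resource *pres = pscreen->resource_create(pscreen, &templ);
/* ((struct iris_resource *)pres)->bo->gtt_offset should now fall below
 * 4 GB, courtesy of IRIS_MEMZONE_SHADER. */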
diff --git a/src/gallium/drivers/iris/iris_screen.c b/src/gallium/drivers/iris/iris_screen.c
index d6d125bb95b..79dba4c507c 100644
--- a/src/gallium/drivers/iris/iris_screen.c
+++ b/src/gallium/drivers/iris/iris_screen.c
@@ -526,8 +526,5 @@ iris_screen_create(int fd)
    pscreen->fence_finish = iris_fence_finish;
    pscreen->query_memory_info = iris_query_memory_info;
 
-   /* Put them somewhere non-zero */
-   screen->next_instruction_address = 128 * 1024;
-
    return pscreen;
 }
diff --git a/src/gallium/drivers/iris/iris_screen.h b/src/gallium/drivers/iris/iris_screen.h
index f24f567532f..5484e535ac9 100644
--- a/src/gallium/drivers/iris/iris_screen.h
+++ b/src/gallium/drivers/iris/iris_screen.h
@@ -47,8 +47,6 @@ struct iris_screen {
    struct isl_device isl_dev;
    struct iris_bufmgr *bufmgr;
    struct brw_compiler *compiler;
-
-   uint32_t next_instruction_address;
 };
 
 struct pipe_screen *iris_screen_create(int fd);
-- 
2.30.2
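A closing note on EXEC_OBJECT_PINNED (a sketch of the i915 uAPI side, not
code from this patch): since every BO now carries the flag in kflags, the
validation-list entry a batch submits can simply echo the address that
vma_alloc() picked, along the lines of

struct drm_i915_gem_exec_object2 validation_entry = {
   .handle = bo->gem_handle,
   .offset = bo->gtt_offset,  /* soft-pinned address chosen by vma_alloc() */
   .flags  = bo->kflags,      /* EXEC_OBJECT_PINNED, plus EXEC_OBJECT_CAPTURE
                               * on batch buffers */
};

and the kernel binds the object at exactly that offset, or fails the execbuf
on a genuine conflict, instead of treating the offset as a movable hint and
rewriting relocations. One caveat: pinning at or above 4 GB (everything
outside IRIS_MEMZONE_SHADER here) additionally requires
EXEC_OBJECT_SUPPORTS_48B_ADDRESS on the entry; setting that flag is not
visible in this diff.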