From 715df30a4e27a76917d6172e68fd7f4c3ff1fb9d Mon Sep 17 00:00:00 2001 From: Bas Nieuwenhuizen Date: Sat, 4 Feb 2017 01:18:05 +0100 Subject: [PATCH] radv/amdgpu: Add winsys implementation of virtual buffers. v2: - Added comments. - Fixed a double unmap bug. - Actually unmap the non-edge old ranges. Signed-off-by: Bas Nieuwenhuizen Reviewed-by: Dave Airlie --- src/amd/vulkan/radv_radeon_winsys.h | 5 + src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.c | 237 ++++++++++++++++-- src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.h | 35 ++- src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c | 98 +++++++- 4 files changed, 349 insertions(+), 26 deletions(-) diff --git a/src/amd/vulkan/radv_radeon_winsys.h b/src/amd/vulkan/radv_radeon_winsys.h index a8a1d2e369f..9f2430f75de 100644 --- a/src/amd/vulkan/radv_radeon_winsys.h +++ b/src/amd/vulkan/radv_radeon_winsys.h @@ -47,6 +47,7 @@ enum radeon_bo_flag { /* bitfield */ RADEON_FLAG_GTT_WC = (1 << 0), RADEON_FLAG_CPU_ACCESS = (1 << 1), RADEON_FLAG_NO_CPU_ACCESS = (1 << 2), + RADEON_FLAG_VIRTUAL = (1 << 3) }; enum radeon_bo_usage { /* bitfield */ @@ -289,6 +290,10 @@ struct radeon_winsys { void (*buffer_set_metadata)(struct radeon_winsys_bo *bo, struct radeon_bo_metadata *md); + + void (*buffer_virtual_bind)(struct radeon_winsys_bo *parent, + uint64_t offset, uint64_t size, + struct radeon_winsys_bo *bo, uint64_t bo_offset); struct radeon_winsys_ctx *(*ctx_create)(struct radeon_winsys *ws); void (*ctx_destroy)(struct radeon_winsys_ctx *ctx); diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.c b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.c index 7f7fc7ac1ad..8e78ef75233 100644 --- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.c +++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.c @@ -34,19 +34,201 @@ #include #include +static void +radv_amdgpu_winsys_virtual_map(struct radv_amdgpu_winsys_bo *bo, + const struct radv_amdgpu_map_range *range) +{ + assert(range->size); + + if (!range->bo) + return; /* TODO: PRT mapping */ + + int r = amdgpu_bo_va_op(range->bo->bo, range->bo_offset, range->size, + range->offset + bo->va, 0, AMDGPU_VA_OP_MAP); + if (r) + abort(); +} + +static void +radv_amdgpu_winsys_virtual_unmap(struct radv_amdgpu_winsys_bo *bo, + const struct radv_amdgpu_map_range *range) +{ + assert(range->size); + + if (!range->bo) + return; /* TODO: PRT mapping */ + + int r = amdgpu_bo_va_op(range->bo->bo, range->bo_offset, range->size, + range->offset + bo->va, 0, AMDGPU_VA_OP_UNMAP); + if (r) + abort(); +} + +static void +radv_amdgpu_winsys_rebuild_bo_list(struct radv_amdgpu_winsys_bo *bo) +{ + bo->bo_count = 0; + for (uint32_t i = 0; i < bo->range_count; ++i) { + bool found = false; + if (!bo->ranges[i].bo) + continue; + + for(uint32_t j = 0; j < bo->bo_count; ++j) { + if (bo->bos[j] == bo->ranges[i].bo) { + found = true; + break; + } + } + + if (!found) { + if (bo->bo_capacity == bo->bo_count) { + bo->bos = realloc(bo->bos, + (bo->bo_capacity + 1) * sizeof(struct radv_amdgpu_bo *)); + ++bo->bo_capacity; + } + bo->bos[bo->bo_count++] = bo->ranges[i].bo; + } + } +} + +static void +radv_amdgpu_winsys_bo_virtual_bind(struct radeon_winsys_bo *_parent, + uint64_t offset, uint64_t size, + struct radeon_winsys_bo *_bo, uint64_t bo_offset) +{ + struct radv_amdgpu_winsys_bo *parent = (struct radv_amdgpu_winsys_bo *)_parent; + struct radv_amdgpu_winsys_bo *bo = (struct radv_amdgpu_winsys_bo*)_bo; + int range_count_delta, new_idx; + int first = 0, last; + struct radv_amdgpu_map_range new_first, new_last; + + assert(parent->is_virtual); + assert(!bo || !bo->is_virtual); + + if (!size) + return; + + /* We have at most 2 new ranges (1 by the bind, and another one by splitting a range that contains the newly bound range). */ + if (parent->range_capacity - parent->range_count < 2) { + parent->range_capacity += 2; + parent->ranges = realloc(parent->ranges, + parent->range_capacity * sizeof(struct radv_amdgpu_map_range)); + } + + /* + * [first, last] is exactly the range of ranges that either overlap the + * new parent, or are adjacent to it. This corresponds to the bind ranges + * that may change. + */ + while(first + 1 < parent->range_count && parent->ranges[first].offset + parent->ranges[first].size < offset) + ++first; + + last = first; + while(last + 1 < parent->range_count && parent->ranges[last].offset <= offset + size) + ++last; + + /* Whether the first or last range are going to be totally removed or just + * resized/left alone. Note that in the case of first == last, we will split + * this into a part before and after the new range. The remove flag is then + * whether to not create the corresponding split part. */ + bool remove_first = parent->ranges[first].offset == offset; + bool remove_last = parent->ranges[last].offset + parent->ranges[last].size == offset + size; + bool unmapped_first = false; + + assert(parent->ranges[first].offset <= offset); + assert(parent->ranges[last].offset + parent->ranges[last].size >= offset + size); + + /* Try to merge the new range with the first range. */ + if (parent->ranges[first].bo == bo && (!bo || offset - bo_offset == parent->ranges[first].offset - parent->ranges[first].bo_offset)) { + size += offset - parent->ranges[first].offset; + offset = parent->ranges[first].offset; + remove_first = true; + } + + /* Try to merge the new range with the last range. */ + if (parent->ranges[last].bo == bo && (!bo || offset - bo_offset == parent->ranges[last].offset - parent->ranges[last].bo_offset)) { + size = parent->ranges[last].offset + parent->ranges[last].size - offset; + remove_last = true; + } + + range_count_delta = 1 - (last - first + 1) + !remove_first + !remove_last; + new_idx = first + !remove_first; + + /* Any range between first and last is going to be entirely covered by the new range so just unmap them. */ + for (int i = first + 1; i < last; ++i) + radv_amdgpu_winsys_virtual_unmap(parent, parent->ranges + i); + + /* If the first/last range are not left alone we unmap then and optionally map + * them again after modifications. Not that this implicitly can do the splitting + * if first == last. */ + new_first = parent->ranges[first]; + new_last = parent->ranges[last]; + + if (parent->ranges[first].offset + parent->ranges[first].size > offset || remove_first) { + radv_amdgpu_winsys_virtual_unmap(parent, parent->ranges + first); + unmapped_first = true; + + if (!remove_first) { + new_first.size = offset - new_first.offset; + radv_amdgpu_winsys_virtual_map(parent, &new_first); + } + } + + if (parent->ranges[last].offset < offset + size || remove_last) { + if (first != last || !unmapped_first) + radv_amdgpu_winsys_virtual_unmap(parent, parent->ranges + last); + + if (!remove_last) { + new_last.size -= offset + size - new_last.offset; + new_last.offset = offset + size; + radv_amdgpu_winsys_virtual_map(parent, &new_last); + } + } + + /* Moves the range list after last to account for the changed number of ranges. */ + memmove(parent->ranges + last + 1 + range_count_delta, parent->ranges + last + 1, + sizeof(struct radv_amdgpu_map_range) * (parent->range_count - last - 1)); + + if (!remove_first) + parent->ranges[first] = new_first; + + if (!remove_last) + parent->ranges[new_idx + 1] = new_last; + + /* Actually set up the new range. */ + parent->ranges[new_idx].offset = offset; + parent->ranges[new_idx].size = size; + parent->ranges[new_idx].bo = bo; + parent->ranges[new_idx].bo_offset = bo_offset; + + radv_amdgpu_winsys_virtual_map(parent, parent->ranges + new_idx); + + parent->range_count += range_count_delta; + + radv_amdgpu_winsys_rebuild_bo_list(parent); +} + static void radv_amdgpu_winsys_bo_destroy(struct radeon_winsys_bo *_bo) { struct radv_amdgpu_winsys_bo *bo = radv_amdgpu_winsys_bo(_bo); - if (bo->ws->debug_all_bos) { - pthread_mutex_lock(&bo->ws->global_bo_list_lock); - LIST_DEL(&bo->global_list_item); - bo->ws->num_buffers--; - pthread_mutex_unlock(&bo->ws->global_bo_list_lock); + if (bo->is_virtual) { + for (uint32_t i = 0; i < bo->range_count; ++i) { + radv_amdgpu_winsys_virtual_unmap(bo, bo->ranges + i); + } + free(bo->bos); + free(bo->ranges); + } else { + if (bo->ws->debug_all_bos) { + pthread_mutex_lock(&bo->ws->global_bo_list_lock); + LIST_DEL(&bo->global_list_item); + bo->ws->num_buffers--; + pthread_mutex_unlock(&bo->ws->global_bo_list_lock); + } + amdgpu_bo_va_op(bo->bo, 0, bo->size, bo->va, 0, AMDGPU_VA_OP_UNMAP); + amdgpu_bo_free(bo->bo); } - amdgpu_bo_va_op(bo->bo, 0, bo->size, bo->va, 0, AMDGPU_VA_OP_UNMAP); amdgpu_va_range_free(bo->va_handle); - amdgpu_bo_free(bo->bo); FREE(bo); } @@ -81,6 +263,31 @@ radv_amdgpu_winsys_bo_create(struct radeon_winsys *_ws, return NULL; } + r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general, + size, alignment, 0, &va, &va_handle, 0); + if (r) + goto error_va_alloc; + + bo->va = va; + bo->va_handle = va_handle; + bo->size = size; + bo->ws = ws; + bo->is_virtual = !!(flags & RADEON_FLAG_VIRTUAL); + + if (flags & RADEON_FLAG_VIRTUAL) { + bo->ranges = realloc(NULL, sizeof(struct radv_amdgpu_map_range)); + bo->range_count = 1; + bo->range_capacity = 1; + + bo->ranges[0].offset = 0; + bo->ranges[0].size = size; + bo->ranges[0].bo = NULL; + bo->ranges[0].bo_offset = 0; + + radv_amdgpu_winsys_virtual_map(bo, bo->ranges); + return (struct radeon_winsys_bo *)bo; + } + request.alloc_size = size; request.phys_alignment = alignment; @@ -105,31 +312,22 @@ radv_amdgpu_winsys_bo_create(struct radeon_winsys *_ws, goto error_bo_alloc; } - r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general, - size, alignment, 0, &va, &va_handle, 0); - if (r) - goto error_va_alloc; - r = amdgpu_bo_va_op(buf_handle, 0, size, va, 0, AMDGPU_VA_OP_MAP); if (r) goto error_va_map; bo->bo = buf_handle; - bo->va = va; - bo->va_handle = va_handle; bo->initial_domain = initial_domain; - bo->size = size; bo->is_shared = false; - bo->ws = ws; radv_amdgpu_add_buffer_to_global_list(bo); return (struct radeon_winsys_bo *)bo; error_va_map: - amdgpu_va_range_free(va_handle); - -error_va_alloc: amdgpu_bo_free(buf_handle); error_bo_alloc: + amdgpu_va_range_free(va_handle); + +error_va_alloc: FREE(bo); return NULL; } @@ -296,4 +494,5 @@ void radv_amdgpu_bo_init_functions(struct radv_amdgpu_winsys *ws) ws->base.buffer_from_fd = radv_amdgpu_winsys_bo_from_fd; ws->base.buffer_get_fd = radv_amdgpu_winsys_get_fd; ws->base.buffer_set_metadata = radv_amdgpu_winsys_bo_set_metadata; + ws->base.buffer_virtual_bind = radv_amdgpu_winsys_bo_virtual_bind; } diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.h b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.h index 499b063d56f..c7d484bc8dd 100644 --- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.h +++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.h @@ -31,17 +31,40 @@ #include "radv_amdgpu_winsys.h" + +struct radv_amdgpu_map_range { + uint64_t offset; + uint64_t size; + struct radv_amdgpu_winsys_bo *bo; + uint64_t bo_offset; +}; + struct radv_amdgpu_winsys_bo { - amdgpu_bo_handle bo; amdgpu_va_handle va_handle; - uint64_t va; - enum radeon_bo_domain initial_domain; uint64_t size; - bool is_shared; - struct radv_amdgpu_winsys *ws; - struct list_head global_list_item; + bool is_virtual; + + union { + /* physical bo */ + struct { + amdgpu_bo_handle bo; + enum radeon_bo_domain initial_domain; + bool is_shared; + struct list_head global_list_item; + }; + /* virtual bo */ + struct { + struct radv_amdgpu_map_range *ranges; + uint32_t range_count; + uint32_t range_capacity; + + struct radv_amdgpu_winsys_bo **bos; + uint32_t bo_count; + uint32_t bo_capacity; + }; + }; }; static inline diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c index 80100e6eabb..8430df4d0fe 100644 --- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c +++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c @@ -34,6 +34,11 @@ #include "radv_amdgpu_bo.h" #include "sid.h" + +enum { + VIRTUAL_BUFFER_HASH_TABLE_SIZE = 1024 +}; + struct radv_amdgpu_cs { struct radeon_winsys_cs base; struct radv_amdgpu_winsys *ws; @@ -56,6 +61,12 @@ struct radv_amdgpu_cs { int buffer_hash_table[1024]; unsigned hw_ip; + + unsigned num_virtual_buffers; + unsigned max_num_virtual_buffers; + struct radeon_winsys_bo **virtual_buffers; + uint8_t *virtual_buffer_priorities; + int *virtual_buffer_hash_table; }; static inline struct radv_amdgpu_cs * @@ -141,6 +152,9 @@ static void radv_amdgpu_cs_destroy(struct radeon_winsys_cs *rcs) cs->ws->base.buffer_destroy(cs->old_ib_buffers[i]); free(cs->old_ib_buffers); + free(cs->virtual_buffers); + free(cs->virtual_buffer_priorities); + free(cs->virtual_buffer_hash_table); free(cs->handles); free(cs->priorities); free(cs); @@ -315,7 +329,13 @@ static void radv_amdgpu_cs_reset(struct radeon_winsys_cs *_cs) cs->buffer_hash_table[hash] = -1; } + for (unsigned i = 0; i < cs->num_virtual_buffers; ++i) { + unsigned hash = ((uintptr_t)cs->virtual_buffers[i] >> 6) & (VIRTUAL_BUFFER_HASH_TABLE_SIZE - 1); + cs->virtual_buffer_hash_table[hash] = -1; + } + cs->num_buffers = 0; + cs->num_virtual_buffers = 0; if (cs->ws->use_ib_bos) { cs->ws->base.cs_add_buffer(&cs->base, cs->ib_buffer, 8); @@ -380,6 +400,49 @@ static void radv_amdgpu_cs_add_buffer_internal(struct radv_amdgpu_cs *cs, ++cs->num_buffers; } +static void radv_amdgpu_cs_add_virtual_buffer(struct radeon_winsys_cs *_cs, + struct radeon_winsys_bo *bo, + uint8_t priority) +{ + struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs); + unsigned hash = ((uintptr_t)bo >> 6) & (VIRTUAL_BUFFER_HASH_TABLE_SIZE - 1); + + + if (!cs->virtual_buffer_hash_table) { + cs->virtual_buffer_hash_table = malloc(VIRTUAL_BUFFER_HASH_TABLE_SIZE * sizeof(int)); + for (int i = 0; i < VIRTUAL_BUFFER_HASH_TABLE_SIZE; ++i) + cs->virtual_buffer_hash_table[i] = -1; + } + + if (cs->virtual_buffer_hash_table[hash] >= 0) { + int idx = cs->virtual_buffer_hash_table[hash]; + if (cs->virtual_buffers[idx] == bo) { + cs->virtual_buffer_priorities[idx] = MAX2(cs->virtual_buffer_priorities[idx], priority); + return; + } + for (unsigned i = 0; i < cs->num_virtual_buffers; ++i) { + if (cs->virtual_buffers[i] == bo) { + cs->virtual_buffer_priorities[i] = MAX2(cs->virtual_buffer_priorities[i], priority); + cs->virtual_buffer_hash_table[hash] = i; + return; + } + } + } + + if(cs->max_num_virtual_buffers <= cs->num_virtual_buffers) { + cs->max_num_virtual_buffers = MAX2(2, cs->max_num_virtual_buffers * 2); + cs->virtual_buffers = realloc(cs->virtual_buffers, sizeof(struct radv_amdgpu_virtual_virtual_buffer*) * cs->max_num_virtual_buffers); + cs->virtual_buffer_priorities = realloc(cs->virtual_buffer_priorities, sizeof(uint8_t) * cs->max_num_virtual_buffers); + } + + cs->virtual_buffers[cs->num_virtual_buffers] = bo; + cs->virtual_buffer_priorities[cs->num_virtual_buffers] = priority; + + cs->virtual_buffer_hash_table[hash] = cs->num_virtual_buffers; + ++cs->num_virtual_buffers; + +} + static void radv_amdgpu_cs_add_buffer(struct radeon_winsys_cs *_cs, struct radeon_winsys_bo *_bo, uint8_t priority) @@ -387,6 +450,11 @@ static void radv_amdgpu_cs_add_buffer(struct radeon_winsys_cs *_cs, struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs); struct radv_amdgpu_winsys_bo *bo = radv_amdgpu_winsys_bo(_bo); + if (bo->is_virtual) { + radv_amdgpu_cs_add_virtual_buffer(_cs, _bo, priority); + return; + } + radv_amdgpu_cs_add_buffer_internal(cs, bo->bo, priority); } @@ -401,6 +469,11 @@ static void radv_amdgpu_cs_execute_secondary(struct radeon_winsys_cs *_parent, child->priorities[i]); } + for (unsigned i = 0; i < child->num_virtual_buffers; ++i) { + radv_amdgpu_cs_add_buffer(&parent->base, child->virtual_buffers[i], + child->virtual_buffer_priorities[i]); + } + if (parent->ws->use_ib_bos) { if (parent->base.cdw + 4 > parent->base.max_dw) radv_amdgpu_cs_grow(&parent->base, 4); @@ -449,7 +522,8 @@ static int radv_amdgpu_create_bo_list(struct radv_amdgpu_winsys *ws, bo_list); free(handles); pthread_mutex_unlock(&ws->global_bo_list_lock); - } else if (count == 1 && !extra_bo && !extra_cs) { + } else if (count == 1 && !extra_bo && !extra_cs && + !radv_amdgpu_cs(cs_array[0])->num_virtual_buffers) { struct radv_amdgpu_cs *cs = (struct radv_amdgpu_cs*)cs_array[0]; r = amdgpu_bo_list_create(ws->dev, cs->num_buffers, cs->handles, cs->priorities, bo_list); @@ -459,6 +533,8 @@ static int radv_amdgpu_create_bo_list(struct radv_amdgpu_winsys *ws, for (unsigned i = 0; i < count; ++i) { struct radv_amdgpu_cs *cs = (struct radv_amdgpu_cs*)cs_array[i]; total_buffer_count += cs->num_buffers; + for (unsigned j = 0; j < cs->num_virtual_buffers; ++j) + total_buffer_count += radv_amdgpu_winsys_bo(cs->virtual_buffers[j])->bo_count; } if (extra_cs) { @@ -512,6 +588,26 @@ static int radv_amdgpu_create_bo_list(struct radv_amdgpu_winsys *ws, ++unique_bo_count; } } + for (unsigned j = 0; j < cs->num_virtual_buffers; ++j) { + struct radv_amdgpu_winsys_bo *virtual_bo = radv_amdgpu_winsys_bo(cs->virtual_buffers[j]); + for(unsigned k = 0; k < virtual_bo->bo_count; ++k) { + struct radv_amdgpu_winsys_bo *bo = virtual_bo->bos[k]; + bool found = false; + for (unsigned m = 0; m < unique_bo_count; ++m) { + if (handles[m] == bo->bo) { + found = true; + priorities[m] = MAX2(priorities[m], + cs->virtual_buffer_priorities[j]); + break; + } + } + if (!found) { + handles[unique_bo_count] = bo->bo; + priorities[unique_bo_count] = cs->virtual_buffer_priorities[j]; + ++unique_bo_count; + } + } + } } r = amdgpu_bo_list_create(ws->dev, unique_bo_count, handles, priorities, bo_list); -- 2.30.2