From aaea46242d651a1b03f7292ac89a68f8a9086692 Mon Sep 17 00:00:00 2001 From: Scott D Phillips Date: Wed, 7 Mar 2018 09:18:37 -0800 Subject: [PATCH] anv: Add vma_heap allocators in anv_device These will be used to assign virtual addresses to soft pinned buffers in a later patch. Two allocators are added for separate 'low' and 'high' virtual memory areas. Another alternative would have been to add a double-sided allocator, which wasn't done here just because it didn't appear to give any code complexity advantages. v2 (Scott Phillips): - rename has_exec_softpin to use_softpin (Jason) - Only remove bottom one page and top 4 GiB from virt (Jason) - refer to comment in anv_allocator about state address + size overflowing 48 bits (Jason) - Mention hi/lo allocators vs double-sided allocator in commit message (Chris) - assign state pool memory ranges statically (Jason) v3 (Jason Ekstrand): - Use (LOW|HIGH)_HEAP_(MIN|MAX)_ADDRESS rather than (1 << 31) for determining which heap to use in anv_vma_free - Only return de-canonicalized addresses to the heap Reviewed-by: Jordan Justen Reviewed-by: Jason Ekstrand Reviewed-by: Scott D Phillips --- src/intel/vulkan/anv_device.c | 84 ++++++++++++++++++++++++++++++++++ src/intel/vulkan/anv_private.h | 60 ++++++++++++++++++++++++ 2 files changed, 144 insertions(+) diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c index 374fc16c4c9..276e32bddda 100644 --- a/src/intel/vulkan/anv_device.c +++ b/src/intel/vulkan/anv_device.c @@ -374,6 +374,9 @@ anv_physical_device_init(struct anv_physical_device *device, anv_gem_supports_syncobj_wait(fd); device->has_context_priority = anv_gem_has_context_priority(fd); + device->use_softpin = anv_gem_get_param(fd, I915_PARAM_HAS_EXEC_SOFTPIN) + && device->supports_48bit_addresses; + bool swizzled = anv_gem_get_bit6_swizzle(fd, I915_TILING_X); /* Starting with Gen10, the timestamp frequency of the command streamer may @@ -1527,6 +1530,27 @@ VkResult anv_CreateDevice( goto fail_fd; } 
+ if (physical_device->use_softpin) { + if (pthread_mutex_init(&device->vma_mutex, NULL) != 0) { + result = vk_error(VK_ERROR_INITIALIZATION_FAILED); + goto fail_fd; + } + + /* keep the page with address zero out of the allocator */ + util_vma_heap_init(&device->vma_lo, LOW_HEAP_MIN_ADDRESS, LOW_HEAP_SIZE); + device->vma_lo_available = + physical_device->memory.heaps[physical_device->memory.heap_count - 1].size; + + /* Leave the last 4GiB out of the high vma range, so that no state base + * address + size can overflow 48 bits. For more information see the + * comment about Wa32bitGeneralStateOffset in anv_allocator.c + */ + util_vma_heap_init(&device->vma_hi, HIGH_HEAP_MIN_ADDRESS, + HIGH_HEAP_SIZE); + device->vma_hi_available = physical_device->memory.heap_count == 1 ? 0 : + physical_device->memory.heaps[0].size; + } + /* As per spec, the driver implementation may deny requests to acquire * a priority above the default priority (MEDIUM) if the caller does not * have sufficient privileges. 
In this scenario VK_ERROR_NOT_PERMITTED_EXT @@ -1887,6 +1911,66 @@ VkResult anv_DeviceWaitIdle( return anv_device_submit_simple_batch(device, &batch); } +bool +anv_vma_alloc(struct anv_device *device, struct anv_bo *bo) +{ /* Picks a virtual address for bo from the device heaps and stores it in bo->offset. Returns true on success, and also for BOs without EXEC_OBJECT_PINNED, which are left untouched; returns false when neither heap yields an address. */ + if (!(bo->flags & EXEC_OBJECT_PINNED)) + return true; + /* vma_mutex is held for every access below to the two heaps and to the vma_lo_available and vma_hi_available counters. */ + pthread_mutex_lock(&device->vma_mutex); + + bo->offset = 0; + /* High heap first, only for BOs flagged EXEC_OBJECT_SUPPORTS_48B_ADDRESS and only while vma_hi_available covers bo->size. A zero return from util_vma_heap_alloc is treated as failure. */ + if (bo->flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS && + device->vma_hi_available >= bo->size) { + uint64_t addr = util_vma_heap_alloc(&device->vma_hi, bo->size, 4096); + if (addr) { + bo->offset = gen_canonical_address(addr); + assert(addr == gen_48b_address(bo->offset)); + device->vma_hi_available -= bo->size; + } + } + /* Low heap is the fallback whenever no offset has been assigned yet. */ + if (bo->offset == 0 && device->vma_lo_available >= bo->size) { + uint64_t addr = util_vma_heap_alloc(&device->vma_lo, bo->size, 4096); + if (addr) { + bo->offset = gen_canonical_address(addr); + assert(addr == gen_48b_address(bo->offset)); + device->vma_lo_available -= bo->size; + } + } + + pthread_mutex_unlock(&device->vma_mutex); + /* offset is still 0 here if no heap produced an address. */ + return bo->offset != 0; +} + /* Gives bo's address range back to the heap it lies in and resets bo->offset to 0; does nothing for BOs without EXEC_OBJECT_PINNED. */ +void +anv_vma_free(struct anv_device *device, struct anv_bo *bo) +{ + if (!(bo->flags & EXEC_OBJECT_PINNED)) + return; + /* bo->offset holds the gen_canonical_address() form; the heaps are given the gen_48b_address() form. */ + const uint64_t addr_48b = gen_48b_address(bo->offset); + + pthread_mutex_lock(&device->vma_mutex); + /* The address range decides the heap; anything outside the low range is asserted to be inside the high range. */ + if (addr_48b >= LOW_HEAP_MIN_ADDRESS && + addr_48b <= LOW_HEAP_MAX_ADDRESS) { + util_vma_heap_free(&device->vma_lo, addr_48b, bo->size); + device->vma_lo_available += bo->size; + } else { + assert(addr_48b >= HIGH_HEAP_MIN_ADDRESS && + addr_48b <= HIGH_HEAP_MAX_ADDRESS); + util_vma_heap_free(&device->vma_hi, addr_48b, bo->size); + device->vma_hi_available += bo->size; + } + + pthread_mutex_unlock(&device->vma_mutex); + + bo->offset = 0; +} + VkResult anv_bo_init_new(struct anv_bo *bo, struct anv_device *device, uint64_t size) { diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index c10af14eadf..60444d99a42 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -50,6 
+50,7 @@ #include "util/list.h" #include "util/u_atomic.h" #include "util/u_vector.h" +#include "util/vma.h" #include "vk_alloc.h" #include "vk_debug_report.h" @@ -80,6 +81,55 @@ struct gen_l3_config; #include "common/intel_log.h" #include "wsi_common.h" +/* anv Virtual Memory Layout + * ========================= + * + * When the anv driver is determining the virtual graphics addresses of memory + * objects itself using the softpin mechanism, the following memory ranges + * will be used. + * + * Three special considerations to notice: + * + * (1) the dynamic state pool is located within the same 4 GiB as the low + * heap. This is to work around a VF cache issue described in a comment in + * anv_physical_device_init_heaps. + * + * (2) the binding table pool is located at lower addresses than the surface + * state pool, within a 4 GiB range. This allows surface state base addresses + * to cover both binding tables (16 bit offsets) and surface states (32 bit + * offsets). + * + * (3) the last 4 GiB of the address space is withheld from the high + * heap. Various hardware units will read past the end of an object for + * various reasons. This healthy margin prevents reads from wrapping around + * 48-bit addresses. 
+ */ +#define LOW_HEAP_MIN_ADDRESS 0x000000001000ULL /* 4 KiB */ +#define LOW_HEAP_MAX_ADDRESS 0x0000bfffffffULL /* 3 GiB - 1 */ +#define DYNAMIC_STATE_POOL_MIN_ADDRESS 0x0000c0000000ULL /* 3 GiB */ +#define DYNAMIC_STATE_POOL_MAX_ADDRESS 0x0000ffffffffULL +#define BINDING_TABLE_POOL_MIN_ADDRESS 0x000100000000ULL /* 4 GiB */ +#define BINDING_TABLE_POOL_MAX_ADDRESS 0x00013fffffffULL +#define SURFACE_STATE_POOL_MIN_ADDRESS 0x000140000000ULL /* 5 GiB */ +#define SURFACE_STATE_POOL_MAX_ADDRESS 0x00017fffffffULL +#define INSTRUCTION_STATE_POOL_MIN_ADDRESS 0x000180000000ULL /* 6 GiB */ +#define INSTRUCTION_STATE_POOL_MAX_ADDRESS 0x0001bfffffffULL +#define HIGH_HEAP_MIN_ADDRESS 0x0001c0000000ULL /* 7 GiB */ +#define HIGH_HEAP_MAX_ADDRESS 0xfffeffffffffULL /* 2^48 - 4 GiB - 1 */ + /* Every MIN and MAX bound above is inclusive, so each size below is MAX - MIN + 1. */ +#define LOW_HEAP_SIZE \ + (LOW_HEAP_MAX_ADDRESS - LOW_HEAP_MIN_ADDRESS + 1) +#define HIGH_HEAP_SIZE \ + (HIGH_HEAP_MAX_ADDRESS - HIGH_HEAP_MIN_ADDRESS + 1) +#define DYNAMIC_STATE_POOL_SIZE \ + (DYNAMIC_STATE_POOL_MAX_ADDRESS - DYNAMIC_STATE_POOL_MIN_ADDRESS + 1) +#define BINDING_TABLE_POOL_SIZE \ + (BINDING_TABLE_POOL_MAX_ADDRESS - BINDING_TABLE_POOL_MIN_ADDRESS + 1) +#define SURFACE_STATE_POOL_SIZE \ + (SURFACE_STATE_POOL_MAX_ADDRESS - SURFACE_STATE_POOL_MIN_ADDRESS + 1) +#define INSTRUCTION_STATE_POOL_SIZE \ + (INSTRUCTION_STATE_POOL_MAX_ADDRESS - INSTRUCTION_STATE_POOL_MIN_ADDRESS + 1) + /* Allowing different clear colors requires us to perform a depth resolve at * the end of certain render passes. This is because while slow clears store * the clear color in the HiZ buffer, fast clears (without a resolve) don't. 
@@ -791,6 +841,7 @@ struct anv_physical_device { bool has_syncobj; bool has_syncobj_wait; bool has_context_priority; + bool use_softpin; struct anv_device_extension_table supported_extensions; @@ -884,6 +935,12 @@ struct anv_device { struct anv_device_extension_table enabled_extensions; struct anv_dispatch_table dispatch; + pthread_mutex_t vma_mutex; + struct util_vma_heap vma_lo; + struct util_vma_heap vma_hi; + uint64_t vma_lo_available; + uint64_t vma_hi_available; + struct anv_bo_pool batch_bo_pool; struct anv_bo_cache bo_cache; @@ -977,6 +1034,9 @@ int anv_gem_syncobj_wait(struct anv_device *device, uint32_t *handles, uint32_t num_handles, int64_t abs_timeout_ns, bool wait_all); +bool anv_vma_alloc(struct anv_device *device, struct anv_bo *bo); +void anv_vma_free(struct anv_device *device, struct anv_bo *bo); + VkResult anv_bo_init_new(struct anv_bo *bo, struct anv_device *device, uint64_t size); struct anv_reloc_list { -- 2.30.2