anv: Add vma_heap allocators in anv_device
author Scott D Phillips <scott.d.phillips@intel.com>
Wed, 7 Mar 2018 17:18:37 +0000 (09:18 -0800)
committer Jason Ekstrand <jason.ekstrand@intel.com>
Thu, 31 May 2018 23:51:46 +0000 (16:51 -0700)
These will be used to assign virtual addresses to soft-pinned
buffers in a later patch.

Two allocators are added for separate 'low' and 'high' virtual
memory areas. An alternative would have been a single double-sided
allocator; that was not done here only because it did not appear to
reduce code complexity.
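
As a rough illustration of how a later patch is expected to hook these
up (the call site, flags, and local names below are assumptions, not
code in this commit), a soft-pinned BO would get its address right
after creation and hand it back on destruction:

   /* Hypothetical sketch of the follow-up patch, not part of this commit. */
   struct anv_bo bo;
   VkResult result = anv_bo_init_new(&bo, device, size);
   if (result != VK_SUCCESS)
      return result;

   bo.flags |= EXEC_OBJECT_PINNED | EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
   if (!anv_vma_alloc(device, &bo)) {
      anv_gem_close(device, bo.gem_handle);
      return vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY);
   }

   /* ... bo.offset now holds the canonical GPU virtual address ... */

   anv_vma_free(device, &bo);
   anv_gem_close(device, bo.gem_handle);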

v2 (Scott Phillips):
 - rename has_exec_softpin to use_softpin (Jason)
 - Only remove bottom one page and top 4 GiB from virt (Jason)
 - refer to comment in anv_allocator about state address + size
   overflowing 48 bits (Jason)
 - Mention hi/lo allocators vs double-sided allocator in
   commit message (Chris)
 - assign state pool memory ranges statically (Jason)

v3 (Jason Ekstrand):
 - Use (LOW|HIGH)_HEAP_(MIN|MAX)_ADDRESS rather than (1 << 31) for
   determining which heap to use in anv_vma_free
 - Only return de-canonicalized addresses to the heap

Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
src/intel/vulkan/anv_device.c
src/intel/vulkan/anv_private.h

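For context, the util_vma_heap interface that the diff below relies on
(util/vma.h) looks approximately like this; the header is authoritative
and may differ in detail:

   struct util_vma_heap;

   /* Manage a [start, start + size) range of virtual addresses. */
   void     util_vma_heap_init(struct util_vma_heap *heap,
                               uint64_t start, uint64_t size);
   void     util_vma_heap_finish(struct util_vma_heap *heap);

   /* Returns an address on success, 0 on failure. */
   uint64_t util_vma_heap_alloc(struct util_vma_heap *heap,
                                uint64_t size, uint64_t alignment);
   void     util_vma_heap_free(struct util_vma_heap *heap,
                               uint64_t offset, uint64_t size);
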
diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c
index 374fc16c4c99e882ccf364c263ca9806d09fc184..276e32bddda262680e852b4003bd14ad6b3955ba 100644
@@ -374,6 +374,9 @@ anv_physical_device_init(struct anv_physical_device *device,
                               anv_gem_supports_syncobj_wait(fd);
    device->has_context_priority = anv_gem_has_context_priority(fd);
 
+   device->use_softpin = anv_gem_get_param(fd, I915_PARAM_HAS_EXEC_SOFTPIN)
+      && device->supports_48bit_addresses;
+
    bool swizzled = anv_gem_get_bit6_swizzle(fd, I915_TILING_X);
 
    /* Starting with Gen10, the timestamp frequency of the command streamer may
@@ -1527,6 +1530,27 @@ VkResult anv_CreateDevice(
       goto fail_fd;
    }
 
+   if (physical_device->use_softpin) {
+      if (pthread_mutex_init(&device->vma_mutex, NULL) != 0) {
+         result = vk_error(VK_ERROR_INITIALIZATION_FAILED);
+         goto fail_fd;
+      }
+
+      /* keep the page with address zero out of the allocator */
+      util_vma_heap_init(&device->vma_lo, LOW_HEAP_MIN_ADDRESS, LOW_HEAP_SIZE);
+      device->vma_lo_available =
+         physical_device->memory.heaps[physical_device->memory.heap_count - 1].size;
+
+      /* Leave the last 4GiB out of the high vma range, so that no state base
+       * address + size can overflow 48 bits. For more information see the
+       * comment about Wa32bitGeneralStateOffset in anv_allocator.c
+       */
+      util_vma_heap_init(&device->vma_hi, HIGH_HEAP_MIN_ADDRESS,
+                         HIGH_HEAP_SIZE);
+      device->vma_hi_available = physical_device->memory.heap_count == 1 ? 0 :
+         physical_device->memory.heaps[0].size;
+   }
+
    /* As per spec, the driver implementation may deny requests to acquire
     * a priority above the default priority (MEDIUM) if the caller does not
     * have sufficient privileges. In this scenario VK_ERROR_NOT_PERMITTED_EXT
@@ -1887,6 +1911,66 @@ VkResult anv_DeviceWaitIdle(
    return anv_device_submit_simple_batch(device, &batch);
 }
 
+bool
+anv_vma_alloc(struct anv_device *device, struct anv_bo *bo)
+{
+   if (!(bo->flags & EXEC_OBJECT_PINNED))
+      return true;
+
+   pthread_mutex_lock(&device->vma_mutex);
+
+   bo->offset = 0;
+
+   if (bo->flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS &&
+       device->vma_hi_available >= bo->size) {
+      uint64_t addr = util_vma_heap_alloc(&device->vma_hi, bo->size, 4096);
+      if (addr) {
+         bo->offset = gen_canonical_address(addr);
+         assert(addr == gen_48b_address(bo->offset));
+         device->vma_hi_available -= bo->size;
+      }
+   }
+
+   if (bo->offset == 0 && device->vma_lo_available >= bo->size) {
+      uint64_t addr = util_vma_heap_alloc(&device->vma_lo, bo->size, 4096);
+      if (addr) {
+         bo->offset = gen_canonical_address(addr);
+         assert(addr == gen_48b_address(bo->offset));
+         device->vma_lo_available -= bo->size;
+      }
+   }
+
+   pthread_mutex_unlock(&device->vma_mutex);
+
+   return bo->offset != 0;
+}
+
+void
+anv_vma_free(struct anv_device *device, struct anv_bo *bo)
+{
+   if (!(bo->flags & EXEC_OBJECT_PINNED))
+      return;
+
+   const uint64_t addr_48b = gen_48b_address(bo->offset);
+
+   pthread_mutex_lock(&device->vma_mutex);
+
+   if (addr_48b >= LOW_HEAP_MIN_ADDRESS &&
+       addr_48b <= LOW_HEAP_MAX_ADDRESS) {
+      util_vma_heap_free(&device->vma_lo, addr_48b, bo->size);
+      device->vma_lo_available += bo->size;
+   } else {
+      assert(addr_48b >= HIGH_HEAP_MIN_ADDRESS &&
+             addr_48b <= HIGH_HEAP_MAX_ADDRESS);
+      util_vma_heap_free(&device->vma_hi, addr_48b, bo->size);
+      device->vma_hi_available += bo->size;
+   }
+
+   pthread_mutex_unlock(&device->vma_mutex);
+
+   bo->offset = 0;
+}
+
 VkResult
 anv_bo_init_new(struct anv_bo *bo, struct anv_device *device, uint64_t size)
 {
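
The gen_canonical_address()/gen_48b_address() helpers used above come
from the shared Intel helpers. Roughly speaking, canonicalization
sign-extends bit 47 into bits 63:48 (the form the kernel expects for
48-bit PPGTT addresses), and gen_48b_address() strips that extension
again, which is why only de-canonicalized addresses are ever handed
back to the heaps. A from-memory sketch of the idea, not the exact
implementation:

   static inline uint64_t
   sketch_canonical_address(uint64_t v)
   {
      /* Sign-extend bit 47 into the upper 16 bits. */
      return (uint64_t)((int64_t)(v << 16) >> 16);
   }

   static inline uint64_t
   sketch_48b_address(uint64_t v)
   {
      /* Keep only the low 48 bits, dropping any sign extension. */
      return v & ((1ull << 48) - 1);
   }
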
diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
index c10af14eadff385a3342ac085ee41a6ba9b999dc..60444d99a42d9222c3628048c3267cb0c16ad59d 100644
@@ -50,6 +50,7 @@
 #include "util/list.h"
 #include "util/u_atomic.h"
 #include "util/u_vector.h"
+#include "util/vma.h"
 #include "vk_alloc.h"
 #include "vk_debug_report.h"
 
@@ -80,6 +81,55 @@ struct gen_l3_config;
 #include "common/intel_log.h"
 #include "wsi_common.h"
 
+/* anv Virtual Memory Layout
+ * =========================
+ *
+ * When the anv driver is determining the virtual graphics addresses of memory
+ * objects itself using the softpin mechanism, the following memory ranges
+ * will be used.
+ *
+ * Three special considerations to notice:
+ *
+ * (1) the dynamic state pool is located within the same 4 GiB as the low
+ * heap. This is to work around a VF cache issue described in a comment in
+ * anv_physical_device_init_heaps.
+ *
+ * (2) the binding table pool is located at lower addresses than the surface
+ * state pool, within a 4 GiB range. This allows surface state base addresses
+ * to cover both binding tables (16 bit offsets) and surface states (32 bit
+ * offsets).
+ *
+ * (3) the last 4 GiB of the address space is withheld from the high
+ * heap. Various hardware units will read past the end of an object for
+ * various reasons. This healthy margin prevents reads from wrapping around
+ * 48-bit addresses.
+ */
+#define LOW_HEAP_MIN_ADDRESS               0x000000001000ULL /* 4 KiB */
+#define LOW_HEAP_MAX_ADDRESS               0x0000bfffffffULL
+#define DYNAMIC_STATE_POOL_MIN_ADDRESS     0x0000c0000000ULL /* 3 GiB */
+#define DYNAMIC_STATE_POOL_MAX_ADDRESS     0x0000ffffffffULL
+#define BINDING_TABLE_POOL_MIN_ADDRESS     0x000100000000ULL /* 4 GiB */
+#define BINDING_TABLE_POOL_MAX_ADDRESS     0x00013fffffffULL
+#define SURFACE_STATE_POOL_MIN_ADDRESS     0x000140000000ULL /* 5 GiB */
+#define SURFACE_STATE_POOL_MAX_ADDRESS     0x00017fffffffULL
+#define INSTRUCTION_STATE_POOL_MIN_ADDRESS 0x000180000000ULL /* 6 GiB */
+#define INSTRUCTION_STATE_POOL_MAX_ADDRESS 0x0001bfffffffULL
+#define HIGH_HEAP_MIN_ADDRESS              0x0001c0000000ULL /* 7 GiB */
+#define HIGH_HEAP_MAX_ADDRESS              0xfffeffffffffULL
+
+#define LOW_HEAP_SIZE               \
+   (LOW_HEAP_MAX_ADDRESS - LOW_HEAP_MIN_ADDRESS + 1)
+#define HIGH_HEAP_SIZE              \
+   (HIGH_HEAP_MAX_ADDRESS - HIGH_HEAP_MIN_ADDRESS + 1)
+#define DYNAMIC_STATE_POOL_SIZE     \
+   (DYNAMIC_STATE_POOL_MAX_ADDRESS - DYNAMIC_STATE_POOL_MIN_ADDRESS + 1)
+#define BINDING_TABLE_POOL_SIZE     \
+   (BINDING_TABLE_POOL_MAX_ADDRESS - BINDING_TABLE_POOL_MIN_ADDRESS + 1)
+#define SURFACE_STATE_POOL_SIZE     \
+   (SURFACE_STATE_POOL_MAX_ADDRESS - SURFACE_STATE_POOL_MIN_ADDRESS + 1)
+#define INSTRUCTION_STATE_POOL_SIZE \
+   (INSTRUCTION_STATE_POOL_MAX_ADDRESS - INSTRUCTION_STATE_POOL_MIN_ADDRESS + 1)
+
 /* Allowing different clear colors requires us to perform a depth resolve at
  * the end of certain render passes. This is because while slow clears store
  * the clear color in the HiZ buffer, fast clears (without a resolve) don't.
@@ -791,6 +841,7 @@ struct anv_physical_device {
     bool                                        has_syncobj;
     bool                                        has_syncobj_wait;
     bool                                        has_context_priority;
+    bool                                        use_softpin;
 
     struct anv_device_extension_table           supported_extensions;
 
@@ -884,6 +935,12 @@ struct anv_device {
     struct anv_device_extension_table           enabled_extensions;
     struct anv_dispatch_table                   dispatch;
 
+    pthread_mutex_t                             vma_mutex;
+    struct util_vma_heap                        vma_lo;
+    struct util_vma_heap                        vma_hi;
+    uint64_t                                    vma_lo_available;
+    uint64_t                                    vma_hi_available;
+
     struct anv_bo_pool                          batch_bo_pool;
 
     struct anv_bo_cache                         bo_cache;
@@ -977,6 +1034,9 @@ int anv_gem_syncobj_wait(struct anv_device *device,
                          uint32_t *handles, uint32_t num_handles,
                          int64_t abs_timeout_ns, bool wait_all);
 
+bool anv_vma_alloc(struct anv_device *device, struct anv_bo *bo);
+void anv_vma_free(struct anv_device *device, struct anv_bo *bo);
+
 VkResult anv_bo_init_new(struct anv_bo *bo, struct anv_device *device, uint64_t size);
 
 struct anv_reloc_list {
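
As a quick sanity check on the address layout above (illustrative only,
not part of the patch), the derived heap and pool sizes work out as
follows; C11 static asserts make the arithmetic explicit:

   _Static_assert(LOW_HEAP_SIZE               == 0x0000bffff000ull,
                  "3 GiB minus the one withheld page at address zero");
   _Static_assert(DYNAMIC_STATE_POOL_SIZE     == 0x000040000000ull, "1 GiB");
   _Static_assert(BINDING_TABLE_POOL_SIZE     == 0x000040000000ull, "1 GiB");
   _Static_assert(SURFACE_STATE_POOL_SIZE     == 0x000040000000ull, "1 GiB");
   _Static_assert(INSTRUCTION_STATE_POOL_SIZE == 0x000040000000ull, "1 GiB");
   _Static_assert(HIGH_HEAP_SIZE              == 0xfffd40000000ull,
                  "the 48-bit space minus the bottom 7 GiB and the top 4 GiB");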