anv: fix incorrect VMA alignment for CCS main surfaces
diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c
index 87978b3d0860b988a2ce44c3af0e1e8081f362f7..ca71cc6d24522ee07e40b86f9c94f5b4d08d36d1 100644
--- a/src/intel/vulkan/anv_device.c
+++ b/src/intel/vulkan/anv_device.c
@@ -126,25 +126,28 @@ anv_compute_heap_size(int fd, uint64_t gtt_size)
 static VkResult
 anv_physical_device_init_heaps(struct anv_physical_device *device, int fd)
 {
-   uint64_t gtt_size;
    if (anv_gem_get_context_param(fd, 0, I915_CONTEXT_PARAM_GTT_SIZE,
-                                 &gtt_size) == -1) {
+                                 &device->gtt_size) == -1) {
       /* If, for whatever reason, we can't actually get the GTT size from the
        * kernel (too old?) fall back to the aperture size.
        */
       anv_perf_warn(NULL, NULL,
                     "Failed to get I915_CONTEXT_PARAM_GTT_SIZE: %m");
 
-      if (anv_gem_get_aperture(fd, &gtt_size) == -1) {
+      if (anv_gem_get_aperture(fd, &device->gtt_size) == -1) {
          return vk_errorf(NULL, NULL, VK_ERROR_INITIALIZATION_FAILED,
                           "failed to get aperture size: %m");
       }
    }
 
+   /* We only allow 48-bit addresses with softpin because knowing the actual
+    * address is required for the vertex cache flush workaround.
+    */
    device->supports_48bit_addresses = (device->info.gen >= 8) &&
-      gtt_size > (4ULL << 30 /* GiB */);
+                                      device->has_softpin &&
+                                      device->gtt_size > (4ULL << 30 /* GiB */);
 
-   uint64_t heap_size = anv_compute_heap_size(fd, gtt_size);
+   uint64_t heap_size = anv_compute_heap_size(fd, device->gtt_size);
 
    if (heap_size > (2ull << 30) && !device->supports_48bit_addresses) {
       /* When running with an overridden PCI ID, we may get a GTT size from
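
For context, the fallback above is built on two plain i915 ioctls; anv_gem_get_context_param() and anv_gem_get_aperture() are thin wrappers around them. A minimal standalone sketch, assuming libdrm's i915_drm.h is available (the header path varies by installation) and fd is an already-open render node:

    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <libdrm/i915_drm.h>

    /* Prefer the context param (full per-process GTT size on modern
     * kernels); fall back to the legacy aperture query otherwise. */
    static int
    get_gtt_size(int fd, uint64_t *size_out)
    {
       struct drm_i915_gem_context_param gp = {
          .ctx_id = 0, /* default context */
          .param = I915_CONTEXT_PARAM_GTT_SIZE,
       };
       if (ioctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_GETPARAM, &gp) == 0) {
          *size_out = gp.value;
          return 0;
       }

       struct drm_i915_gem_get_aperture aperture = { 0 };
       if (ioctl(fd, DRM_IOCTL_I915_GEM_GET_APERTURE, &aperture) == 0) {
          *size_out = aperture.aper_size;
          return 0;
       }
       return -1;
    }
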
@@ -158,69 +161,14 @@ anv_physical_device_init_heaps(struct anv_physical_device *device, int fd)
       heap_size = 2ull << 30;
    }
 
-   if (heap_size <= 3ull * (1ull << 30)) {
-      /* In this case, everything fits nicely into the 32-bit address space,
-       * so there's no need for supporting 48bit addresses on client-allocated
-       * memory objects.
-       */
-      device->memory.heap_count = 1;
-      device->memory.heaps[0] = (struct anv_memory_heap) {
-         .vma_start = LOW_HEAP_MIN_ADDRESS,
-         .vma_size = LOW_HEAP_SIZE,
-         .size = heap_size,
-         .flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT,
-         .supports_48bit_addresses = false,
-      };
-   } else {
-      /* Not everything will fit nicely into a 32-bit address space.  In this
-       * case we need a 64-bit heap.  Advertise a small 32-bit heap and a
-       * larger 48-bit heap.  If we're in this case, then we have a total heap
-       * size larger than 3GiB which most likely means they have 8 GiB of
-       * video memory and so carving off 1 GiB for the 32-bit heap should be
-       * reasonable.
-       */
-      const uint64_t heap_size_32bit = 1ull << 30;
-      const uint64_t heap_size_48bit = heap_size - heap_size_32bit;
-
-      assert(device->supports_48bit_addresses);
-
-      device->memory.heap_count = 2;
-      device->memory.heaps[0] = (struct anv_memory_heap) {
-         .vma_start = HIGH_HEAP_MIN_ADDRESS,
-         /* Leave the last 4GiB out of the high vma range, so that no state
-          * base address + size can overflow 48 bits. For more information see
-          * the comment about Wa32bitGeneralStateOffset in anv_allocator.c
-          */
-         .vma_size = gtt_size - (1ull << 32) - HIGH_HEAP_MIN_ADDRESS,
-         .size = heap_size_48bit,
-         .flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT,
-         .supports_48bit_addresses = true,
-      };
-      device->memory.heaps[1] = (struct anv_memory_heap) {
-         .vma_start = LOW_HEAP_MIN_ADDRESS,
-         .vma_size = LOW_HEAP_SIZE,
-         .size = heap_size_32bit,
-         .flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT,
-         .supports_48bit_addresses = false,
-      };
-   }
+   device->memory.heap_count = 1;
+   device->memory.heaps[0] = (struct anv_memory_heap) {
+      .size = heap_size,
+      .flags = VK_MEMORY_HEAP_DEVICE_LOCAL_BIT,
+   };
 
    uint32_t type_count = 0;
    for (uint32_t heap = 0; heap < device->memory.heap_count; heap++) {
-      uint32_t valid_buffer_usage = ~0;
-
-      /* There appears to be a hardware issue in the VF cache where it only
-       * considers the bottom 32 bits of memory addresses.  If you happen to
-       * have two vertex buffers which get placed exactly 4 GiB apart and use
-       * them in back-to-back draw calls, you can get collisions.  In order to
-       * solve this problem, we require vertex and index buffers be bound to
-       * memory allocated out of the 32-bit heap.
-       */
-      if (device->memory.heaps[heap].supports_48bit_addresses) {
-         valid_buffer_usage &= ~(VK_BUFFER_USAGE_INDEX_BUFFER_BIT |
-                                 VK_BUFFER_USAGE_VERTEX_BUFFER_BIT);
-      }
-
       if (device->info.has_llc) {
          /* Big core GPUs share LLC with the CPU and thus one memory type can be
           * both cached and coherent at the same time.
@@ -231,7 +179,6 @@ anv_physical_device_init_heaps(struct anv_physical_device *device, int fd)
                              VK_MEMORY_PROPERTY_HOST_COHERENT_BIT |
                              VK_MEMORY_PROPERTY_HOST_CACHED_BIT,
             .heapIndex = heap,
-            .valid_buffer_usage = valid_buffer_usage,
          };
       } else {
          /* The spec requires that we expose a host-visible, coherent memory
@@ -244,14 +191,12 @@ anv_physical_device_init_heaps(struct anv_physical_device *device, int fd)
                              VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
                              VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
             .heapIndex = heap,
-            .valid_buffer_usage = valid_buffer_usage,
          };
          device->memory.types[type_count++] = (struct anv_memory_type) {
             .propertyFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
                              VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
                              VK_MEMORY_PROPERTY_HOST_CACHED_BIT,
             .heapIndex = heap,
-            .valid_buffer_usage = valid_buffer_usage,
          };
       }
    }
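
The per-heap memory types set up above (one cached+coherent type on LLC parts, separate coherent and cached types otherwise) reach applications through vkGetPhysicalDeviceMemoryProperties; with valid_buffer_usage gone, selection is purely the standard mask-and-flags loop. A minimal sketch using only core Vulkan:

    #include <stdint.h>
    #include <vulkan/vulkan.h>

    /* Return the first memory type allowed by a resource's memoryTypeBits
     * that also has every requested property flag, or UINT32_MAX. */
    static uint32_t
    find_memory_type(VkPhysicalDevice phys, uint32_t type_bits,
                     VkMemoryPropertyFlags wanted)
    {
       VkPhysicalDeviceMemoryProperties props;
       vkGetPhysicalDeviceMemoryProperties(phys, &props);

       for (uint32_t i = 0; i < props.memoryTypeCount; i++) {
          if ((type_bits & (1u << i)) &&
              (props.memoryTypes[i].propertyFlags & wanted) == wanted)
             return i;
       }
       return UINT32_MAX;
    }
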
@@ -471,10 +416,7 @@ anv_physical_device_init(struct anv_physical_device *device,
       goto fail;
    }
 
-   result = anv_physical_device_init_heaps(device, fd);
-   if (result != VK_SUCCESS)
-      goto fail;
-
+   device->has_softpin = anv_gem_get_param(fd, I915_PARAM_HAS_EXEC_SOFTPIN);
    device->has_exec_async = anv_gem_get_param(fd, I915_PARAM_HAS_EXEC_ASYNC);
    device->has_exec_capture = anv_gem_get_param(fd, I915_PARAM_HAS_EXEC_CAPTURE);
    device->has_exec_fence = anv_gem_get_param(fd, I915_PARAM_HAS_EXEC_FENCE);
@@ -483,8 +425,12 @@ anv_physical_device_init(struct anv_physical_device *device,
                               anv_gem_supports_syncobj_wait(fd);
    device->has_context_priority = anv_gem_has_context_priority(fd);
 
-   device->use_softpin = anv_gem_get_param(fd, I915_PARAM_HAS_EXEC_SOFTPIN)
-      && device->supports_48bit_addresses;
+   result = anv_physical_device_init_heaps(device, fd);
+   if (result != VK_SUCCESS)
+      goto fail;
+
+   device->use_softpin = device->has_softpin &&
+                         device->supports_48bit_addresses;
 
    device->has_context_isolation =
       anv_gem_get_param(fd, I915_PARAM_HAS_CONTEXT_ISOLATION);
@@ -1052,6 +998,15 @@ void anv_GetPhysicalDeviceFeatures2(
          break;
       }
 
+      case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BUFFER_DEVICE_ADDRESS_FEATURES_KHR: {
+         VkPhysicalDeviceBufferDeviceAddressFeaturesKHR *features = (void *)ext;
+         features->bufferDeviceAddress = pdevice->has_a64_buffer_access;
+         features->bufferDeviceAddressCaptureReplay =
+            pdevice->has_a64_buffer_access;
+         features->bufferDeviceAddressMultiDevice = false;
+         break;
+      }
+
       case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COMPUTE_SHADER_DERIVATIVES_FEATURES_NV: {
          VkPhysicalDeviceComputeShaderDerivativesFeaturesNV *features =
             (VkPhysicalDeviceComputeShaderDerivativesFeaturesNV *)ext;
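
The VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BUFFER_DEVICE_ADDRESS_FEATURES_KHR case added above is what applications see when they chain the KHR struct into vkGetPhysicalDeviceFeatures2. A minimal query sketch (core 1.1 entry point, KHR struct as in the patch):

    #include <vulkan/vulkan.h>

    /* Ask whether the driver exposes VK_KHR_buffer_device_address. */
    static VkBool32
    has_buffer_device_address(VkPhysicalDevice phys)
    {
       VkPhysicalDeviceBufferDeviceAddressFeaturesKHR bda = {
          .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_BUFFER_DEVICE_ADDRESS_FEATURES_KHR,
       };
       VkPhysicalDeviceFeatures2 features = {
          .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2,
          .pNext = &bda,
       };
       vkGetPhysicalDeviceFeatures2(phys, &features);
       return bda.bufferDeviceAddress;
    }
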
@@ -2212,6 +2167,7 @@ anv_device_init_trivial_batch(struct anv_device *device)
 {
    VkResult result = anv_device_alloc_bo(device, 4096,
                                          ANV_BO_ALLOC_MAPPED,
+                                         0 /* explicit_address */,
                                          &device->trivial_batch_bo);
    if (result != VK_SUCCESS)
       return result;
@@ -2320,6 +2276,7 @@ anv_device_init_hiz_clear_value_bo(struct anv_device *device)
 {
    VkResult result = anv_device_alloc_bo(device, 4096,
                                          ANV_BO_ALLOC_MAPPED,
+                                         0 /* explicit_address */,
                                          &device->hiz_clear_bo);
    if (result != VK_SUCCESS)
       return result;
@@ -2549,16 +2506,19 @@ VkResult anv_CreateDevice(
       }
 
       /* keep the page with address zero out of the allocator */
-      struct anv_memory_heap *low_heap =
-         &physical_device->memory.heaps[physical_device->memory.heap_count - 1];
-      util_vma_heap_init(&device->vma_lo, low_heap->vma_start, low_heap->vma_size);
-      device->vma_lo_available = low_heap->size;
-
-      struct anv_memory_heap *high_heap =
-         &physical_device->memory.heaps[0];
-      util_vma_heap_init(&device->vma_hi, high_heap->vma_start, high_heap->vma_size);
-      device->vma_hi_available = physical_device->memory.heap_count == 1 ? 0 :
-         high_heap->size;
+      util_vma_heap_init(&device->vma_lo,
+                         LOW_HEAP_MIN_ADDRESS, LOW_HEAP_SIZE);
+
+      util_vma_heap_init(&device->vma_cva, CLIENT_VISIBLE_HEAP_MIN_ADDRESS,
+                         CLIENT_VISIBLE_HEAP_SIZE);
+
+      /* Leave the last 4GiB out of the high vma range, so that no state
+       * base address + size can overflow 48 bits. For more information see
+       * the comment about Wa32bitGeneralStateOffset in anv_allocator.c
+       */
+      util_vma_heap_init(&device->vma_hi, HIGH_HEAP_MIN_ADDRESS,
+                         physical_device->gtt_size - (1ull << 32) -
+                         HIGH_HEAP_MIN_ADDRESS);
    }
 
    list_inithead(&device->memory_objects);
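
The "last 4GiB" reservation above exists so that a state base address placed anywhere in the high heap, plus the fixed 4GiB bound anv programs for it (see the Wa32bitGeneralStateOffset comment in anv_allocator.c), can never wrap past the 48-bit address space. A worked check of that arithmetic, assuming for illustration a full 48-bit GTT:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
       const uint64_t four_gib = 1ull << 32;
       const uint64_t addr_space_48b = 1ull << 48;   /* assumed gtt_size */

       /* Without the reservation, a base near the top overflows 48 bits
        * once the 4GiB range size is added. */
       uint64_t unreserved_base = addr_space_48b - 4096;
       assert(unreserved_base + four_gib > addr_space_48b);

       /* With the top 4GiB held back, every base the heap can hand out
        * satisfies base + 4GiB <= 2^48. */
       uint64_t highest_base = addr_space_48b - four_gib;
       assert(highest_base + four_gib <= addr_space_48b);
       return 0;
    }
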
@@ -2651,7 +2611,9 @@ VkResult anv_CreateDevice(
          goto fail_binding_table_pool;
    }
 
-   result = anv_device_alloc_bo(device, 4096, 0, &device->workaround_bo);
+   result = anv_device_alloc_bo(device, 4096, 0 /* flags */,
+                                0 /* explicit_address */,
+                                &device->workaround_bo);
    if (result != VK_SUCCESS)
       goto fail_surface_aux_map_pool;
 
@@ -2740,6 +2702,7 @@ VkResult anv_CreateDevice(
  fail_vmas:
    if (physical_device->use_softpin) {
       util_vma_heap_finish(&device->vma_hi);
+      util_vma_heap_finish(&device->vma_cva);
       util_vma_heap_finish(&device->vma_lo);
    }
  fail_queue:
@@ -2804,6 +2767,7 @@ void anv_DestroyDevice(
 
    if (physical_device->use_softpin) {
       util_vma_heap_finish(&device->vma_hi);
+      util_vma_heap_finish(&device->vma_cva);
       util_vma_heap_finish(&device->vma_lo);
    }
 
@@ -3005,34 +2969,66 @@ VkResult anv_DeviceWaitIdle(
 }
 
 bool
-anv_vma_alloc(struct anv_device *device, struct anv_bo *bo)
+anv_vma_alloc(struct anv_device *device, struct anv_bo *bo,
+              uint64_t client_address)
 {
-   if (!(bo->flags & EXEC_OBJECT_PINNED))
+   const struct anv_physical_device *pdevice = &device->instance->physicalDevice;
+   const struct gen_device_info *devinfo = &pdevice->info;
+   /* Gen12 CCS surface addresses need to be 64K aligned. We have no way of
+    * telling what this allocation is for so pick the largest alignment.
+    */
+   const uint32_t vma_alignment =
+      devinfo->gen >= 12 ? (64 * 1024) : (4 * 1024);
+
+   if (!(bo->flags & EXEC_OBJECT_PINNED)) {
+      assert(!(bo->has_client_visible_address));
       return true;
+   }
 
    pthread_mutex_lock(&device->vma_mutex);
 
    bo->offset = 0;
 
-   if (bo->flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS &&
-       device->vma_hi_available >= bo->size) {
-      uint64_t addr = util_vma_heap_alloc(&device->vma_hi, bo->size, 4096);
+   if (bo->has_client_visible_address) {
+      assert(bo->flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS);
+      if (client_address) {
+         if (util_vma_heap_alloc_addr(&device->vma_cva,
+                                      client_address, bo->size)) {
+            bo->offset = gen_canonical_address(client_address);
+         }
+      } else {
+         uint64_t addr =
+            util_vma_heap_alloc(&device->vma_cva, bo->size, vma_alignment);
+         if (addr) {
+            bo->offset = gen_canonical_address(addr);
+            assert(addr == gen_48b_address(bo->offset));
+         }
+      }
+      /* We don't want to fall back to other heaps */
+      goto done;
+   }
+
+   assert(client_address == 0);
+
+   if (bo->flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS) {
+      uint64_t addr =
+         util_vma_heap_alloc(&device->vma_hi, bo->size, vma_alignment);
       if (addr) {
          bo->offset = gen_canonical_address(addr);
          assert(addr == gen_48b_address(bo->offset));
-         device->vma_hi_available -= bo->size;
       }
    }
 
-   if (bo->offset == 0 && device->vma_lo_available >= bo->size) {
-      uint64_t addr = util_vma_heap_alloc(&device->vma_lo, bo->size, 4096);
+   if (bo->offset == 0) {
+      uint64_t addr =
+         util_vma_heap_alloc(&device->vma_lo, bo->size, vma_alignment);
       if (addr) {
          bo->offset = gen_canonical_address(addr);
          assert(addr == gen_48b_address(bo->offset));
-         device->vma_lo_available -= bo->size;
       }
    }
 
+done:
    pthread_mutex_unlock(&device->vma_mutex);
 
    return bo->offset != 0;
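
The vma_alignment bump above is the headline fix: on Gen12, the main surface of a CCS-compressed image must sit at a 64KiB-aligned GPU virtual address (presumably because the aux translation table maps main-surface memory at that granularity), and since anv_vma_alloc() cannot know which BOs will back such surfaces, every pinned allocation now gets the larger alignment. A trivial standalone check of the property this buys (addresses are made up for illustration):

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    /* Mirror of the alignment choice above. */
    static uint64_t
    pick_vma_alignment(int gen)
    {
       return gen >= 12 ? 64 * 1024 : 4 * 1024;
    }

    static bool
    aligned_to(uint64_t addr, uint64_t alignment)
    {
       return (addr & (alignment - 1)) == 0;
    }

    int main(void)
    {
       /* A 4KiB-aligned address the pre-fix allocator could legally return... */
       uint64_t addr = 0x0000c00000001000ull;
       assert(aligned_to(addr, 4 * 1024));
       /* ...is not acceptable as a Gen12 CCS main-surface address. */
       assert(!aligned_to(addr, pick_vma_alignment(12)));
       return 0;
    }
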
@@ -3051,15 +3047,12 @@ anv_vma_free(struct anv_device *device, struct anv_bo *bo)
    if (addr_48b >= LOW_HEAP_MIN_ADDRESS &&
        addr_48b <= LOW_HEAP_MAX_ADDRESS) {
       util_vma_heap_free(&device->vma_lo, addr_48b, bo->size);
-      device->vma_lo_available += bo->size;
+   } else if (addr_48b >= CLIENT_VISIBLE_HEAP_MIN_ADDRESS &&
+              addr_48b <= CLIENT_VISIBLE_HEAP_MAX_ADDRESS) {
+      util_vma_heap_free(&device->vma_cva, addr_48b, bo->size);
    } else {
-      ASSERTED const struct anv_physical_device *physical_device =
-         &device->instance->physicalDevice;
-      assert(addr_48b >= physical_device->memory.heaps[0].vma_start &&
-             addr_48b < (physical_device->memory.heaps[0].vma_start +
-                         physical_device->memory.heaps[0].vma_size));
+      assert(addr_48b >= HIGH_HEAP_MIN_ADDRESS);
       util_vma_heap_free(&device->vma_hi, addr_48b, bo->size);
-      device->vma_hi_available += bo->size;
    }
 
    pthread_mutex_unlock(&device->vma_mutex);
@@ -3083,10 +3076,22 @@ VkResult anv_AllocateMemory(
    /* The Vulkan 1.0.33 spec says "allocationSize must be greater than 0". */
    assert(pAllocateInfo->allocationSize > 0);
 
-   if (pAllocateInfo->allocationSize > MAX_MEMORY_ALLOCATION_SIZE)
-      return VK_ERROR_OUT_OF_DEVICE_MEMORY;
+   VkDeviceSize aligned_alloc_size =
+      align_u64(pAllocateInfo->allocationSize, 4096);
+
+   if (aligned_alloc_size > MAX_MEMORY_ALLOCATION_SIZE)
+      return vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY);
+
+   assert(pAllocateInfo->memoryTypeIndex < pdevice->memory.type_count);
+   struct anv_memory_type *mem_type =
+      &pdevice->memory.types[pAllocateInfo->memoryTypeIndex];
+   assert(mem_type->heapIndex < pdevice->memory.heap_count);
+   struct anv_memory_heap *mem_heap =
+      &pdevice->memory.heaps[mem_type->heapIndex];
 
-   /* FINISHME: Fail if allocation request exceeds heap size. */
+   uint64_t mem_heap_used = p_atomic_read(&mem_heap->used);
+   if (mem_heap_used + aligned_alloc_size > mem_heap->size)
+      return vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY);
 
    mem = vk_alloc2(&device->alloc, pAllocator, sizeof(*mem), 8,
                     VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
@@ -3094,7 +3099,7 @@ VkResult anv_AllocateMemory(
       return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
 
    assert(pAllocateInfo->memoryTypeIndex < pdevice->memory.type_count);
-   mem->type = &pdevice->memory.types[pAllocateInfo->memoryTypeIndex];
+   mem->type = mem_type;
    mem->map = NULL;
    mem->map_size = 0;
    mem->ahw = NULL;
@@ -3102,23 +3107,57 @@ VkResult anv_AllocateMemory(
 
    enum anv_bo_alloc_flags alloc_flags = 0;
 
-   assert(mem->type->heapIndex < pdevice->memory.heap_count);
-   if (!pdevice->memory.heaps[mem->type->heapIndex].supports_48bit_addresses)
-      alloc_flags |= ANV_BO_ALLOC_32BIT_ADDRESS;
+   const VkExportMemoryAllocateInfo *export_info = NULL;
+   const VkImportAndroidHardwareBufferInfoANDROID *ahw_import_info = NULL;
+   const VkImportMemoryFdInfoKHR *fd_info = NULL;
+   const VkImportMemoryHostPointerInfoEXT *host_ptr_info = NULL;
+   const VkMemoryDedicatedAllocateInfo *dedicated_info = NULL;
+   VkMemoryAllocateFlags vk_flags = 0;
+   uint64_t client_address = 0;
 
-   const struct wsi_memory_allocate_info *wsi_info =
-      vk_find_struct_const(pAllocateInfo->pNext, WSI_MEMORY_ALLOCATE_INFO_MESA);
-   if (wsi_info && wsi_info->implicit_sync) {
-      /* We need to set the WRITE flag on window system buffers so that GEM
-       * will know we're writing to them and synchronize uses on other rings
-       * (eg if the display server uses the blitter ring).
-       */
-      alloc_flags |= ANV_BO_ALLOC_IMPLICIT_SYNC |
-                     ANV_BO_ALLOC_IMPLICIT_WRITE;
+   vk_foreach_struct_const(ext, pAllocateInfo->pNext) {
+      switch (ext->sType) {
+      case VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO:
+         export_info = (void *)ext;
+         break;
+
+      case VK_STRUCTURE_TYPE_IMPORT_ANDROID_HARDWARE_BUFFER_INFO_ANDROID:
+         ahw_import_info = (void *)ext;
+         break;
+
+      case VK_STRUCTURE_TYPE_IMPORT_MEMORY_FD_INFO_KHR:
+         fd_info = (void *)ext;
+         break;
+
+      case VK_STRUCTURE_TYPE_IMPORT_MEMORY_HOST_POINTER_INFO_EXT:
+         host_ptr_info = (void *)ext;
+         break;
+
+      case VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO: {
+         const VkMemoryAllocateFlagsInfo *flags_info = (void *)ext;
+         vk_flags = flags_info->flags;
+         break;
+      }
+
+      case VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO:
+         dedicated_info = (void *)ext;
+         break;
+
+      case VK_STRUCTURE_TYPE_MEMORY_OPAQUE_CAPTURE_ADDRESS_ALLOCATE_INFO_KHR: {
+         const VkMemoryOpaqueCaptureAddressAllocateInfoKHR *addr_info =
+            (const VkMemoryOpaqueCaptureAddressAllocateInfoKHR *)ext;
+         client_address = addr_info->opaqueCaptureAddress;
+         break;
+      }
+
+      default:
+         anv_debug_ignored_stype(ext->sType);
+         break;
+      }
    }
 
-   const VkExportMemoryAllocateInfo *export_info =
-      vk_find_struct_const(pAllocateInfo->pNext, EXPORT_MEMORY_ALLOCATE_INFO);
+   if (vk_flags & VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT_KHR)
+      alloc_flags |= ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS;
 
    /* Check if we need to support Android HW buffer export. If so,
     * create AHardwareBuffer and import memory from it.
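
The client_address and ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS plumbing collected by the loop above corresponds to an application allocation chained roughly as follows (standard VK_KHR_buffer_device_address structs; the capture-replay flag is only needed when passing a previously recorded opaque address):

    #include <stddef.h>
    #include <vulkan/vulkan.h>

    /* Request a device-addressable allocation; replay_address = 0 lets
     * the driver pick a client-visible VMA on its own. */
    static VkResult
    allocate_with_device_address(VkDevice dev, VkDeviceSize size,
                                 uint32_t mem_type_index,
                                 uint64_t replay_address,
                                 VkDeviceMemory *mem_out)
    {
       VkMemoryOpaqueCaptureAddressAllocateInfoKHR addr_info = {
          .sType = VK_STRUCTURE_TYPE_MEMORY_OPAQUE_CAPTURE_ADDRESS_ALLOCATE_INFO_KHR,
          .opaqueCaptureAddress = replay_address,
       };
       VkMemoryAllocateFlagsInfo flags_info = {
          .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO,
          .pNext = &addr_info,
          .flags = VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT_KHR |
                   VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_CAPTURE_REPLAY_BIT_KHR,
       };
       VkMemoryAllocateInfo alloc_info = {
          .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
          .pNext = &flags_info,
          .allocationSize = size,
          .memoryTypeIndex = mem_type_index,
       };
       return vkAllocateMemory(dev, &alloc_info, NULL, mem_out);
    }
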
@@ -3128,11 +3167,6 @@ VkResult anv_AllocateMemory(
        VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID)
       android_export = true;
 
-   /* Android memory import. */
-   const struct VkImportAndroidHardwareBufferInfoANDROID *ahw_import_info =
-      vk_find_struct_const(pAllocateInfo->pNext,
-                           IMPORT_ANDROID_HARDWARE_BUFFER_INFO_ANDROID);
-
    if (ahw_import_info) {
       result = anv_import_ahw_memory(_device, mem, ahw_import_info);
       if (result != VK_SUCCESS)
@@ -3154,9 +3188,6 @@ VkResult anv_AllocateMemory(
       goto success;
    }
 
-   const VkImportMemoryFdInfoKHR *fd_info =
-      vk_find_struct_const(pAllocateInfo->pNext, IMPORT_MEMORY_FD_INFO_KHR);
-
    /* The Vulkan spec permits handleType to be 0, in which case the struct is
     * ignored.
     */
@@ -3168,7 +3199,7 @@ VkResult anv_AllocateMemory(
                VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT);
 
       result = anv_device_import_bo(device, fd_info->fd, alloc_flags,
-                                    &mem->bo);
+                                    client_address, &mem->bo);
       if (result != VK_SUCCESS)
          goto fail;
 
@@ -3207,9 +3238,6 @@ VkResult anv_AllocateMemory(
       goto success;
    }
 
-   const VkImportMemoryHostPointerInfoEXT *host_ptr_info =
-      vk_find_struct_const(pAllocateInfo->pNext,
-                           IMPORT_MEMORY_HOST_POINTER_INFO_EXT);
    if (host_ptr_info && host_ptr_info->handleType) {
       if (host_ptr_info->handleType ==
           VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_MAPPED_FOREIGN_MEMORY_BIT_EXT) {
@@ -3224,8 +3252,8 @@ VkResult anv_AllocateMemory(
                                                   host_ptr_info->pHostPointer,
                                                   pAllocateInfo->allocationSize,
                                                   alloc_flags,
+                                                  client_address,
                                                   &mem->bo);
-
       if (result != VK_SUCCESS)
          goto fail;
 
@@ -3239,12 +3267,10 @@ VkResult anv_AllocateMemory(
       alloc_flags |= ANV_BO_ALLOC_EXTERNAL;
 
    result = anv_device_alloc_bo(device, pAllocateInfo->allocationSize,
-                                alloc_flags, &mem->bo);
+                                alloc_flags, client_address, &mem->bo);
    if (result != VK_SUCCESS)
       goto fail;
 
-   const VkMemoryDedicatedAllocateInfo *dedicated_info =
-      vk_find_struct_const(pAllocateInfo->pNext, MEMORY_DEDICATED_ALLOCATE_INFO);
    if (dedicated_info && dedicated_info->image != VK_NULL_HANDLE) {
       ANV_FROM_HANDLE(anv_image, image, dedicated_info->image);
 
@@ -3259,23 +3285,31 @@ VkResult anv_AllocateMemory(
                                       i915_tiling);
          if (ret) {
             anv_device_release_bo(device, mem->bo);
-            return vk_errorf(device->instance, NULL,
-                             VK_ERROR_OUT_OF_DEVICE_MEMORY,
-                             "failed to set BO tiling: %m");
+            result = vk_errorf(device->instance, NULL,
+                               VK_ERROR_OUT_OF_DEVICE_MEMORY,
+                               "failed to set BO tiling: %m");
+            goto fail;
          }
       }
    }
 
  success:
+   mem_heap_used = p_atomic_add_return(&mem_heap->used, mem->bo->size);
+   if (mem_heap_used > mem_heap->size) {
+      p_atomic_add(&mem_heap->used, -mem->bo->size);
+      anv_device_release_bo(device, mem->bo);
+      result = vk_errorf(device->instance, NULL,
+                         VK_ERROR_OUT_OF_DEVICE_MEMORY,
+                         "Out of heap memory");
+      goto fail;
+   }
+
    pthread_mutex_lock(&device->mutex);
    list_addtail(&mem->link, &device->memory_objects);
    pthread_mutex_unlock(&device->mutex);
 
    *pMem = anv_device_memory_to_handle(mem);
 
-   p_atomic_add(&pdevice->memory.heaps[mem->type->heapIndex].used,
-                mem->bo->size);
-
    return VK_SUCCESS;
 
  fail:
@@ -3535,12 +3569,7 @@ void anv_GetBufferMemoryRequirements(
     *    only if the memory type `i` in the VkPhysicalDeviceMemoryProperties
     *    structure for the physical device is supported.
     */
-   uint32_t memory_types = 0;
-   for (uint32_t i = 0; i < pdevice->memory.type_count; i++) {
-      uint32_t valid_usage = pdevice->memory.types[i].valid_buffer_usage;
-      if ((valid_usage & buffer->usage) == buffer->usage)
-         memory_types |= (1u << i);
-   }
+   uint32_t memory_types = (1ull << pdevice->memory.type_count) - 1;
 
    /* Base alignment requirement of a cache line */
    uint32_t alignment = 16;
@@ -3734,7 +3763,6 @@ anv_bind_buffer_memory(const VkBindBufferMemoryInfo *pBindInfo)
    assert(pBindInfo->sType == VK_STRUCTURE_TYPE_BIND_BUFFER_MEMORY_INFO);
 
    if (mem) {
-      assert((buffer->usage & mem->type->valid_buffer_usage) == buffer->usage);
       buffer->address = (struct anv_address) {
          .bo = mem->bo,
          .offset = pBindInfo->memoryOffset,
@@ -3895,8 +3923,17 @@ VkResult anv_CreateBuffer(
     VkBuffer*                                   pBuffer)
 {
    ANV_FROM_HANDLE(anv_device, device, _device);
+   struct anv_physical_device *pdevice = &device->instance->physicalDevice;
    struct anv_buffer *buffer;
 
+   /* Don't allow creating buffers bigger than our address space.  The real
+    * issue here is that we may align up the buffer size and we don't want
+    * doing so to cause roll-over.  However, no one has any business
+    * allocating a buffer larger than our GTT size.
+    */
+   if (pCreateInfo->size > pdevice->gtt_size)
+      return vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY);
+
    assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO);
 
    buffer = vk_alloc2(&device->alloc, pAllocator, sizeof(*buffer), 8,
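
The roll-over guarded against above is ordinary unsigned wrap-around: anv rounds sizes up to a 4KiB multiple (the align_u64(..., 4096) added in anv_AllocateMemory earlier in this patch), and a size close to 2^64 wraps to a tiny value when rounded. A self-contained sketch, with align_u64 written out as the usual power-of-two round-up:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Conventional round-up to a power-of-two alignment. */
    static uint64_t
    align_u64(uint64_t v, uint64_t a)
    {
       return (v + a - 1) & ~(a - 1);
    }

    int main(void)
    {
       /* A size just under 2^64 wraps to 0 when rounded to 4KiB --
        * exactly the overflow the gtt_size check rejects up front. */
       uint64_t bogus = UINT64_MAX - 100;
       printf("aligned = %" PRIu64 "\n", align_u64(bogus, 4096)); /* prints 0 */
       return 0;
    }
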
@@ -3927,17 +3964,37 @@ void anv_DestroyBuffer(
    vk_free2(&device->alloc, pAllocator, buffer);
 }
 
-VkDeviceAddress anv_GetBufferDeviceAddressEXT(
+VkDeviceAddress anv_GetBufferDeviceAddressKHR(
     VkDevice                                    device,
-    const VkBufferDeviceAddressInfoEXT*         pInfo)
+    const VkBufferDeviceAddressInfoKHR*         pInfo)
 {
    ANV_FROM_HANDLE(anv_buffer, buffer, pInfo->buffer);
 
+   assert(!anv_address_is_null(buffer->address));
    assert(buffer->address.bo->flags & EXEC_OBJECT_PINNED);
 
    return anv_address_physical(buffer->address);
 }
 
+uint64_t anv_GetBufferOpaqueCaptureAddressKHR(
+    VkDevice                                    device,
+    const VkBufferDeviceAddressInfoKHR*         pInfo)
+{
+   return 0;
+}
+
+uint64_t anv_GetDeviceMemoryOpaqueCaptureAddressKHR(
+    VkDevice                                    device,
+    const VkDeviceMemoryOpaqueCaptureAddressInfoKHR* pInfo)
+{
+   ANV_FROM_HANDLE(anv_device_memory, memory, pInfo->memory);
+
+   assert(memory->bo->flags & EXEC_OBJECT_PINNED);
+   assert(memory->bo->has_client_visible_address);
+
+   return gen_48b_address(memory->bo->offset);
+}
+
 void
 anv_fill_buffer_surface_state(struct anv_device *device, struct anv_state state,
                               enum isl_format format,
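
Taken together, the new anv_GetBufferDeviceAddressKHR and anv_GetDeviceMemoryOpaqueCaptureAddressKHR entry points give applications the capture side of the capture/replay flow: record both addresses, then feed the opaque address back through the allocation chain sketched earlier. A minimal application-side sketch (extension entry points fetched through vkGetDeviceProcAddr, since the loader does not export the KHR names directly):

    #include <stdint.h>
    #include <vulkan/vulkan.h>

    /* Record a buffer's shader-visible address and its memory object's
     * opaque capture address for a later replay run. */
    static void
    capture_addresses(VkDevice dev, VkBuffer buf, VkDeviceMemory mem,
                      VkDeviceAddress *addr_out, uint64_t *opaque_out)
    {
       PFN_vkGetBufferDeviceAddressKHR get_addr =
          (PFN_vkGetBufferDeviceAddressKHR)
             vkGetDeviceProcAddr(dev, "vkGetBufferDeviceAddressKHR");
       PFN_vkGetDeviceMemoryOpaqueCaptureAddressKHR get_opaque =
          (PFN_vkGetDeviceMemoryOpaqueCaptureAddressKHR)
             vkGetDeviceProcAddr(dev, "vkGetDeviceMemoryOpaqueCaptureAddressKHR");

       VkBufferDeviceAddressInfoKHR buf_info = {
          .sType = VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO_KHR,
          .buffer = buf,
       };
       *addr_out = get_addr(dev, &buf_info);

       VkDeviceMemoryOpaqueCaptureAddressInfoKHR mem_info = {
          .sType = VK_STRUCTURE_TYPE_DEVICE_MEMORY_OPAQUE_CAPTURE_ADDRESS_INFO_KHR,
          .memory = mem,
       };
       *opaque_out = get_opaque(dev, &mem_info);
    }

Note that for anv in this patch, anv_GetBufferOpaqueCaptureAddressKHR simply returns 0, so only the device-memory opaque address carries information for replay.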