#include "anv_private.h"
+#include "common/gen_aux_map.h"
#include "util/anon_file.h"
#ifdef HAVE_VALGRIND
}
static uint32_t
-anv_device_get_bo_align(struct anv_device *device)
+anv_device_get_bo_align(struct anv_device *device,
+ enum anv_bo_alloc_flags alloc_flags)
{
- /* Gen12 CCS surface addresses need to be 64K aligned. We have no way of
- * telling what this allocation is for so pick the largest alignment.
- */
- if (device->info.gen >= 12)
+ /* Gen12 CCS surface addresses need to be 64K aligned. */
+ if (device->info.gen >= 12 && (alloc_flags & ANV_BO_ALLOC_IMPLICIT_CCS))
return 64 * 1024;
return 4096;
uint64_t explicit_address,
struct anv_bo **bo_out)
{
+ if (!device->physical->has_implicit_ccs)
+ assert(!(alloc_flags & ANV_BO_ALLOC_IMPLICIT_CCS));
+
const uint32_t bo_flags =
anv_bo_alloc_flags_to_bo_flags(device, alloc_flags);
assert(bo_flags == (bo_flags & ANV_BO_CACHE_SUPPORTED_FLAGS));
/* The kernel is going to give us whole pages anyway */
size = align_u64(size, 4096);
- const uint32_t align = anv_device_get_bo_align(device);
+ const uint32_t align = anv_device_get_bo_align(device, alloc_flags);
+
+ uint64_t ccs_size = 0;
+ if (device->info.has_aux_map && (alloc_flags & ANV_BO_ALLOC_IMPLICIT_CCS)) {
+ /* Align the size up to the next multiple of 64K so we don't have any
+ * AUX-TT entries pointing from a 64K page to itself.
+ */
+ size = align_u64(size, 64 * 1024);
+ /* See anv_bo::_ccs_size */
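+ /* e.g. a 1 MB surface needs 1 MB / 256 = 4 KB of CCS data, which is
+  * exactly one 4K page. */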
+ ccs_size = align_u64(DIV_ROUND_UP(size, GEN_AUX_MAP_GEN12_CCS_SCALE), 4096);
+ }
+
- uint32_t gem_handle = anv_gem_create(device, size);
+ uint32_t gem_handle = anv_gem_create(device, size + ccs_size);
if (gem_handle == 0)
return vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY);
.refcount = 1,
.offset = -1,
.size = size,
+ ._ccs_size = ccs_size,
.flags = bo_flags,
.is_external = (alloc_flags & ANV_BO_ALLOC_EXTERNAL),
.has_client_visible_address =
(alloc_flags & ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS) != 0,
+ .has_implicit_ccs = ccs_size > 0,
};
if (alloc_flags & ANV_BO_ALLOC_FIXED_ADDRESS) {
new_bo.has_fixed_address = true;
new_bo.offset = explicit_address;
} else if (new_bo.flags & EXEC_OBJECT_PINNED) {
- new_bo.offset = anv_vma_alloc(device, new_bo.size, align,
- alloc_flags, explicit_address);
+ new_bo.offset = anv_vma_alloc(device, new_bo.size + new_bo._ccs_size,
+ align, alloc_flags, explicit_address);
if (new_bo.offset == 0) {
if (new_bo.map)
anv_gem_munmap(new_bo.map, size);
assert(!new_bo.has_client_visible_address);
}
+ if (new_bo._ccs_size > 0) {
+ assert(device->info.has_aux_map);
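+ /* Map the main surface's VA range to the CCS data we placed at the end
+  * of the BO (new_bo.offset + new_bo.size).
+  */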
+ gen_aux_map_add_mapping(device->aux_map_ctx,
+ gen_canonical_address(new_bo.offset),
+ gen_canonical_address(new_bo.offset + new_bo.size),
+ new_bo.size, 0 /* format_bits */);
+ }
+
assert(new_bo.gem_handle);
/* If we just got this gem_handle from anv_bo_init_new then we know no one
ANV_BO_ALLOC_SNOOPED |
ANV_BO_ALLOC_FIXED_ADDRESS)));
+ /* We can't do implicit CCS with an aux table on shared memory */
+ if (!device->physical->has_implicit_ccs || device->info.has_aux_map)
+ assert(!(alloc_flags & ANV_BO_ALLOC_IMPLICIT_CCS));
+
struct anv_bo_cache *cache = &device->bo_cache;
const uint32_t bo_flags =
anv_bo_alloc_flags_to_bo_flags(device, alloc_flags);
assert(client_address == gen_48b_address(client_address));
if (new_bo.flags & EXEC_OBJECT_PINNED) {
+ assert(new_bo._ccs_size == 0);
new_bo.offset = anv_vma_alloc(device, new_bo.size,
- anv_device_get_bo_align(device),
+ anv_device_get_bo_align(device,
+ alloc_flags),
alloc_flags, client_address);
if (new_bo.offset == 0) {
anv_gem_close(device, new_bo.gem_handle);
ANV_BO_ALLOC_SNOOPED |
ANV_BO_ALLOC_FIXED_ADDRESS)));
+ /* We can't do implicit CCS with an aux table on shared memory */
+ if (!device->physical->has_implicit_ccs || device->info.has_aux_map)
+ assert(!(alloc_flags & ANV_BO_ALLOC_IMPLICIT_CCS));
+
struct anv_bo_cache *cache = &device->bo_cache;
const uint32_t bo_flags =
anv_bo_alloc_flags_to_bo_flags(device, alloc_flags);
assert(client_address == gen_48b_address(client_address));
if (new_bo.flags & EXEC_OBJECT_PINNED) {
+ assert(new_bo._ccs_size == 0);
new_bo.offset = anv_vma_alloc(device, new_bo.size,
- anv_device_get_bo_align(device),
+ anv_device_get_bo_align(device,
+ alloc_flags),
alloc_flags, client_address);
if (new_bo.offset == 0) {
anv_gem_close(device, new_bo.gem_handle);
if (bo->map && !bo->from_host_ptr)
anv_gem_munmap(bo->map, bo->size);
+ if (bo->_ccs_size > 0) {
+ assert(device->physical->has_implicit_ccs);
+ assert(device->info.has_aux_map);
+ assert(bo->has_implicit_ccs);
+ gen_aux_map_unmap_range(device->aux_map_ctx,
+ gen_canonical_address(bo->offset),
+ bo->size);
+ }
+
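+ /* The VMA was allocated to cover both the main data and the implicit
+  * CCS at the end of the BO, so free the full range.
+  */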
if ((bo->flags & EXEC_OBJECT_PINNED) && !bo->has_fixed_address)
- anv_vma_free(device, bo->offset, bo->size);
+ anv_vma_free(device, bo->offset, bo->size + bo->_ccs_size);
uint32_t gem_handle = bo->gem_handle;
*/
device->has_bindless_samplers = device->info.gen >= 8;
+ device->has_implicit_ccs = device->info.has_aux_map;
+
device->has_mem_available = get_available_system_memory() != 0;
device->always_flush_cache =
}
}
+ /* By default, we want all VkDeviceMemory objects to support CCS */
+ if (device->physical->has_implicit_ccs)
+ alloc_flags |= ANV_BO_ALLOC_IMPLICIT_CCS;
+
if (vk_flags & VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT_KHR)
alloc_flags |= ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS;
+ if ((export_info && export_info->handleTypes) ||
+ (fd_info && fd_info->handleType) ||
+ (host_ptr_info && host_ptr_info->handleType)) {
+ /* Anything imported or exported is EXTERNAL */
+ alloc_flags |= ANV_BO_ALLOC_EXTERNAL;
+
+ /* We can't have implicit CCS on external memory with an AUX-table.
+ * Doing so would require us to sync the aux tables across processes
+ * which is impractical.
+ */
+ if (device->info.has_aux_map)
+ alloc_flags &= ~ANV_BO_ALLOC_IMPLICIT_CCS;
+ }
+
/* Check if we need to support Android HW buffer export. If so,
* create AHardwareBuffer and import memory from it.
*/
/* Regular allocate (not importing memory). */
- if (export_info && export_info->handleTypes)
- alloc_flags |= ANV_BO_ALLOC_EXTERNAL;
-
result = anv_device_alloc_bo(device, pAllocateInfo->allocationSize,
alloc_flags, client_address, &mem->bo);
if (result != VK_SUCCESS)
#include "vk_util.h"
#include "util/u_math.h"
-#include "common/gen_aux_map.h"
-
#include "vk_format_info.h"
static isl_surf_usage_flags_t
image->planes[plane].aux_usage = ISL_AUX_USAGE_CCS_D;
}
- add_surface(image, &image->planes[plane].aux_surface, plane);
+ if (!dev->physical->has_implicit_ccs)
+ add_surface(image, &image->planes[plane].aux_surface, plane);
+
add_aux_state_tracking_buffer(image, plane, dev);
}
}
return;
for (uint32_t p = 0; p < image->n_planes; ++p) {
- if (anv_image_plane_uses_aux_map(device, image, p) &&
- image->planes[p].address.bo) {
- gen_aux_map_unmap_range(device->aux_map_ctx,
- image->planes[p].aux_map_surface_address,
- image->planes[p].surface.isl.size_B);
- }
if (image->planes[p].bo_is_owned) {
assert(image->planes[p].address.bo != NULL);
anv_device_release_bo(device, image->planes[p].address.bo);
assert(!image->planes[plane].bo_is_owned);
if (!memory) {
- if (anv_image_plane_uses_aux_map(device, image, plane) &&
- image->planes[plane].address.bo) {
- gen_aux_map_unmap_range(device->aux_map_ctx,
- image->planes[plane].aux_map_surface_address,
- image->planes[plane].surface.isl.size_B);
- }
image->planes[plane].address = ANV_NULL_ADDRESS;
return;
}
.offset = memory_offset,
};
- if (anv_image_plane_uses_aux_map(device, image, plane)) {
- image->planes[plane].aux_map_surface_address =
- anv_address_physical(
- anv_address_add(image->planes[plane].address,
- image->planes[plane].surface.offset));
-
- gen_aux_map_add_image(device->aux_map_ctx,
- &image->planes[plane].surface.isl,
- image->planes[plane].aux_map_surface_address,
- anv_address_physical(
- anv_address_add(image->planes[plane].address,
- image->planes[plane].aux_surface.offset)));
- }
+ /* If we're on a platform that uses implicit CCS and our buffer does not
+ * have any implicit CCS data, disable compression on that image.
+ */
+ if (device->physical->has_implicit_ccs && !memory->bo->has_implicit_ccs)
+ image->planes[plane].aux_usage = ISL_AUX_USAGE_NONE;
}
/* We are binding AHardwareBuffer. Get a description, resolve the
*/
uint64_t offset;
+ /** Size of the buffer not including implicit aux */
uint64_t size;
/* Map for internally mapped BOs.
*/
void *map;
+ /** Size of the implicit CCS range at the end of the buffer
+ *
+ * On Gen12, CCS data is always a direct 1/256 scale-down. A single 64K
+ * page of main surface data maps to a 256B chunk of CCS data and that
+ * mapping is provided on TGL-LP by the AUX table which maps virtual memory
+ * addresses in the main surface to virtual memory addresses for CCS data.
+ *
+ * Because we can't change these maps around easily and because Vulkan
+ * allows two VkImages to be bound to overlapping memory regions (as long
+ * as the app is careful), it's not feasible to make this mapping part of
+ * the image. (On Gen11 and earlier, the mapping was provided via
+ * RENDER_SURFACE_STATE so each image had its own main -> CCS mapping.)
+ * Instead, we attach the CCS data directly to the buffer object and set
+ * up the AUX table mapping at BO creation time.
+ *
+ * This field is for internal tracking use by the BO allocator only and
+ * should not be touched by other parts of the code. If something wants to
+ * know if a BO has implicit CCS data, it should instead look at the
+ * has_implicit_ccs boolean below.
+ *
+ * This data is not included in maps of this buffer.
+ */
+ uint32_t _ccs_size;
+
/** Flags to pass to the kernel through drm_i915_exec_object2::flags */
uint32_t flags;
/** See also ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS */
bool has_client_visible_address:1;
+
+ /** True if this BO has implicit CCS data attached to it */
+ bool has_implicit_ccs:1;
};
static inline struct anv_bo *
/** True if we can use bindless access for samplers */
bool has_bindless_samplers;
+ /** True if this device has implicit AUX
+ *
+ * If true, CCS is handled as an implicit attachment to the BO rather than
+ * as an explicitly bound surface.
+ */
+ bool has_implicit_ccs;
+
bool always_flush_cache;
struct anv_device_extension_table supported_extensions;
/** Has an address which is visible to the client */
ANV_BO_ALLOC_CLIENT_VISIBLE_ADDRESS = (1 << 8),
+
+ /** This buffer has implicit CCS data attached to it */
+ ANV_BO_ALLOC_IMPLICIT_CCS = (1 << 9),
};
VkResult anv_device_alloc_bo(struct anv_device *device, uint64_t size,
*/
struct anv_address address;
- /**
- * Address of the main surface used to fill the aux map table. This is
- * used at destruction of the image since the Vulkan spec does not
- * guarantee that the address.bo field we still be valid at destruction.
- */
- uint64_t aux_map_surface_address;
-
/**
* When destroying the image, also free the bo.
* */
genX(flush_pipeline_select_3d)(cmd_buffer);
-#if GEN_GEN >= 12
- cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_AUX_TABLE_INVALIDATE_BIT;
-#endif
-
genX(cmd_buffer_emit_gen7_depth_flush)(cmd_buffer);
/* BLORP doesn't do anything fancy with depth such as discards, so we want
}
}
+#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))
+
+#if GEN_GEN == 12
+static void
+anv_image_init_aux_tt(struct anv_cmd_buffer *cmd_buffer,
+ const struct anv_image *image,
+ VkImageAspectFlagBits aspect,
+ uint32_t base_level, uint32_t level_count,
+ uint32_t base_layer, uint32_t layer_count)
+{
+ uint32_t plane = anv_image_aspect_to_plane(image->aspects, aspect);
+ assert(isl_aux_usage_has_ccs(image->planes[plane].aux_usage));
+
+ uint64_t base_address =
+ anv_address_physical(image->planes[plane].address);
+
+ const struct isl_surf *isl_surf = &image->planes[plane].surface.isl;
+ uint64_t format_bits = gen_aux_map_format_bits_for_isl_surf(isl_surf);
+
+ /* We're about to live-update the AUX-TT. We really don't want anyone else
+ * trying to read it while we're doing this. We could probably get away
+ * with not having this stall in some cases if we were really careful but
+ * it's better to play it safe. Full stall the GPU.
+ */
+ cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT;
+ genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+
+ for (uint32_t a = 0; a < layer_count; a++) {
+ const uint32_t layer = base_layer + a;
+
+ uint64_t start_offset_B = UINT64_MAX, end_offset_B = 0;
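+ /* Find the smallest byte range that covers every requested level of
+  * this array layer / Z slice.
+  */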
+ for (uint32_t l = 0; l < level_count; l++) {
+ const uint32_t level = base_level + l;
+
+ uint32_t logical_array_layer, logical_z_offset_px;
+ if (image->type == VK_IMAGE_TYPE_3D) {
+ logical_array_layer = 0;
+
+ /* If the given miplevel does not have this layer, then any higher
+ * miplevels won't either because miplevels only get smaller the
+ * higher the LOD.
+ */
+ assert(layer < image->extent.depth);
+ if (layer >= anv_minify(image->extent.depth, level))
+ break;
+ logical_z_offset_px = layer;
+ } else {
+ assert(layer < image->array_size);
+ logical_array_layer = layer;
+ logical_z_offset_px = 0;
+ }
+
+ uint32_t slice_start_offset_B, slice_end_offset_B;
+ isl_surf_get_image_range_B_tile(isl_surf, level,
+ logical_array_layer,
+ logical_z_offset_px,
+ &slice_start_offset_B,
+ &slice_end_offset_B);
+
+ start_offset_B = MIN2(start_offset_B, slice_start_offset_B);
+ end_offset_B = MAX2(end_offset_B, slice_end_offset_B);
+ }
+
+ /* Aux operates 64K at a time */
+ start_offset_B = align_down_u64(start_offset_B, 64 * 1024);
+ end_offset_B = align_u64(end_offset_B, 64 * 1024);
+
+ for (uint64_t offset = start_offset_B;
+ offset < end_offset_B; offset += 64 * 1024) {
+ uint64_t address = base_address + offset;
+
+ uint64_t aux_entry_address, *aux_entry_map;
+ aux_entry_map = gen_aux_map_get_entry(cmd_buffer->device->aux_map_ctx,
+ address, &aux_entry_address);
+
+ const uint64_t old_aux_entry = READ_ONCE(*aux_entry_map);
+ uint64_t new_aux_entry =
+ (old_aux_entry & ~GEN_AUX_MAP_FORMAT_BITS_MASK) | format_bits;
+
+ /* We're only going to update the top 32 bits */
+ assert((uint32_t)old_aux_entry == (uint32_t)new_aux_entry);
+
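+ /* Only the upper dword changes (the assert above guarantees the lower
+  * dword is untouched), so a single 4-byte MI_STORE_DATA_IMM at
+  * entry + 4 updates the entry from the command streamer.
+  */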
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
+ sdi.Address = (struct anv_address) {
+ .bo = NULL,
+ .offset = aux_entry_address + 4,
+ };
+ sdi.ImmediateData = new_aux_entry >> 32;
+ }
+ }
+ }
+
+ cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_AUX_TABLE_INVALIDATE_BIT;
+}
+#endif /* GEN_GEN == 12 */
+
/**
* @brief Transitions a color buffer from one layout to another.
*
VkImageLayout initial_layout,
VkImageLayout final_layout)
{
- const struct gen_device_info *devinfo = &cmd_buffer->device->info;
+ struct anv_device *device = cmd_buffer->device;
+ const struct gen_device_info *devinfo = &device->info;
/* Validate the inputs. */
assert(cmd_buffer);
assert(image && image->aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);
if (initial_layout == VK_IMAGE_LAYOUT_UNDEFINED ||
initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) {
+#if GEN_GEN == 12
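+ /* The BO's AUX-TT entries were installed with format_bits == 0 at
+  * allocation time. Now that we know what surface lives in this range,
+  * patch in the correct format bits.
+  */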
+ if (isl_aux_usage_has_ccs(image->planes[plane].aux_usage) &&
+ device->physical->has_implicit_ccs && devinfo->has_aux_map) {
+ anv_image_init_aux_tt(cmd_buffer, image, aspect,
+ base_level, level_count,
+ base_layer, layer_count);
+ }
+#else
+ assert(!(device->physical->has_implicit_ccs && devinfo->has_aux_map));
+#endif
+
/* A subresource in the undefined layout may have been aliased and
* populated with any arrangement of bits. Therefore, we must initialize
* the related aux buffer and clear buffer entry with desirable values.
genX(flush_pipeline_select_3d)(cmd_buffer);
-#if GEN_GEN >= 12
- cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_AUX_TABLE_INVALIDATE_BIT;
-#endif
-
if (vb_emit) {
const uint32_t num_buffers = __builtin_popcount(vb_emit);
const uint32_t num_dwords = 1 + num_buffers * 4;
genX(flush_pipeline_select_gpgpu)(cmd_buffer);
-#if GEN_GEN >= 12
- cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_AUX_TABLE_INVALIDATE_BIT;
-#endif
-
if (cmd_buffer->state.compute.pipeline_dirty) {
/* From the Sky Lake PRM Vol 2a, MEDIA_VFE_STATE:
*