From e6803f6b6f06e805fe162d76aad5e25d2510232a Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Thu, 7 Feb 2019 14:10:33 -0600 Subject: [PATCH] anv: Use bindless textures and samplers This commit changes anv to put bindless handles and sampler pointers into the descriptor buffer and use those instead of bindful when we run out of binding table space. This "spilling" of descriptors allows us to advertise an almost unbounded number of images and samplers. Reviewed-by: Lionel Landwerlin Reviewed-by: Caio Marcelo de Oliveira Filho --- src/intel/vulkan/anv_descriptor_set.c | 89 ++++++++++++++++++- src/intel/vulkan/anv_device.c | 33 ++++++- .../vulkan/anv_nir_apply_pipeline_layout.c | 75 +++++++++++----- src/intel/vulkan/anv_private.h | 32 +++++++ src/intel/vulkan/genX_cmd_buffer.c | 12 ++- src/intel/vulkan/genX_state.c | 18 ++++ 6 files changed, 228 insertions(+), 31 deletions(-) diff --git a/src/intel/vulkan/anv_descriptor_set.c b/src/intel/vulkan/anv_descriptor_set.c index c7a90127511..6db6021822a 100644 --- a/src/intel/vulkan/anv_descriptor_set.c +++ b/src/intel/vulkan/anv_descriptor_set.c @@ -45,15 +45,24 @@ anv_descriptor_data_for_type(const struct anv_physical_device *device, switch (type) { case VK_DESCRIPTOR_TYPE_SAMPLER: data = ANV_DESCRIPTOR_SAMPLER_STATE; + if (device->has_bindless_samplers) + data |= ANV_DESCRIPTOR_SAMPLED_IMAGE; break; case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: data = ANV_DESCRIPTOR_SURFACE_STATE | ANV_DESCRIPTOR_SAMPLER_STATE; + if (device->has_bindless_images || device->has_bindless_samplers) + data |= ANV_DESCRIPTOR_SAMPLED_IMAGE; break; case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: + data = ANV_DESCRIPTOR_SURFACE_STATE; + if (device->has_bindless_images) + data |= ANV_DESCRIPTOR_SAMPLED_IMAGE; + break; + case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: data = ANV_DESCRIPTOR_SURFACE_STATE; break; @@ -100,6 +109,9 @@ anv_descriptor_data_size(enum anv_descriptor_data data) { unsigned size = 0; + if 
(data & ANV_DESCRIPTOR_SAMPLED_IMAGE) + size += sizeof(struct anv_sampled_image_descriptor); + if (data & ANV_DESCRIPTOR_IMAGE_PARAM) size += BRW_IMAGE_PARAM_SIZE * 4; @@ -118,7 +130,17 @@ anv_descriptor_size(const struct anv_descriptor_set_binding_layout *layout) return layout->array_size; } - return anv_descriptor_data_size(layout->data); + unsigned size = anv_descriptor_data_size(layout->data); + + /* For multi-planar bindings, we make every descriptor consume the maximum + * number of planes so we don't have to bother with walking arrays and + * adding things up every time. Fortunately, YCbCr samplers aren't all + * that common and likely won't be in the middle of big arrays. + */ + if (layout->max_plane_count > 1) + size *= layout->max_plane_count; + + return size; } /** Returns the size in bytes of each descriptor of the given type @@ -132,7 +154,11 @@ unsigned anv_descriptor_type_size(const struct anv_physical_device *pdevice, VkDescriptorType type) { - assert(type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT); + assert(type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT && + type != VK_DESCRIPTOR_TYPE_SAMPLER && + type != VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE && + type != VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER); + return anv_descriptor_data_size(anv_descriptor_data_for_type(pdevice, type)); } @@ -146,6 +172,12 @@ anv_descriptor_data_supports_bindless(const struct anv_physical_device *pdevice, return true; } + if (data & ANV_DESCRIPTOR_SAMPLED_IMAGE) { + assert(pdevice->has_bindless_images || pdevice->has_bindless_samplers); + return sampler ? pdevice->has_bindless_samplers : + pdevice->has_bindless_images; + } + return false; } @@ -586,6 +618,13 @@ VkResult anv_CreateDescriptorPool( unsigned desc_data_size = anv_descriptor_data_size(desc_data) * pCreateInfo->pPoolSizes[i].descriptorCount; + /* Combined image sampler descriptors can take up to 3 slots if they + * hold a YCbCr image. 
+ */ + if (pCreateInfo->pPoolSizes[i].type == + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) + desc_data_size *= 3; + if (pCreateInfo->pPoolSizes[i].type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) { /* Inline uniform blocks are specified to use the descriptor array @@ -999,6 +1038,18 @@ anv_descriptor_set_write_image_param(uint32_t *param_desc_map, #undef WRITE_PARAM_FIELD } +static uint32_t +anv_surface_state_to_handle(struct anv_state state) +{ + /* Bits 31:12 of the bindless surface offset in the extended message + * descriptor is bits 25:6 of the byte-based address. + */ + assert(state.offset >= 0); + uint32_t offset = state.offset; + assert((offset & 0x3f) == 0 && offset < (1 << 26)); + return offset << 6; +} + void anv_descriptor_set_write_image_view(struct anv_device *device, struct anv_descriptor_set *set, @@ -1057,6 +1108,33 @@ anv_descriptor_set_write_image_view(struct anv_device *device, void *desc_map = set->desc_mem.map + bind_layout->descriptor_offset + element * anv_descriptor_size(bind_layout); + if (bind_layout->data & ANV_DESCRIPTOR_SAMPLED_IMAGE) { + struct anv_sampled_image_descriptor desc_data[3]; + memset(desc_data, 0, sizeof(desc_data)); + + if (image_view) { + for (unsigned p = 0; p < image_view->n_planes; p++) { + struct anv_surface_state sstate = + (desc->layout == VK_IMAGE_LAYOUT_GENERAL) ? + image_view->planes[p].general_sampler_surface_state : + image_view->planes[p].optimal_sampler_surface_state; + desc_data[p].image = anv_surface_state_to_handle(sstate.state); + } + } + + if (sampler) { + for (unsigned p = 0; p < sampler->n_planes; p++) + desc_data[p].sampler = sampler->bindless_state.offset + p * 32; + } + + /* We may have max_plane_count < 0 if this isn't a sampled image but it + * can be no more than the size of our array of handles. 
+ */ + assert(bind_layout->max_plane_count <= ARRAY_SIZE(desc_data)); + memcpy(desc_map, desc_data, + MAX2(1, bind_layout->max_plane_count) * sizeof(desc_data[0])); + } + if (bind_layout->data & ANV_DESCRIPTOR_IMAGE_PARAM) { /* Storage images can only ever have one plane */ assert(image_view->n_planes == 1); @@ -1090,6 +1168,13 @@ anv_descriptor_set_write_buffer_view(struct anv_device *device, void *desc_map = set->desc_mem.map + bind_layout->descriptor_offset + element * anv_descriptor_size(bind_layout); + if (bind_layout->data & ANV_DESCRIPTOR_SAMPLED_IMAGE) { + struct anv_sampled_image_descriptor desc_data = { + .image = anv_surface_state_to_handle(buffer_view->surface_state), + }; + memcpy(desc_map, &desc_data, sizeof(desc_data)); + } + if (bind_layout->data & ANV_DESCRIPTOR_IMAGE_PARAM) { anv_descriptor_set_write_image_param(desc_map, &buffer_view->storage_image_param); diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c index d56e3593353..44fea839f52 100644 --- a/src/intel/vulkan/anv_device.c +++ b/src/intel/vulkan/anv_device.c @@ -280,6 +280,10 @@ anv_physical_device_init_uuids(struct anv_physical_device *device) sizeof(device->always_use_bindless)); _mesa_sha1_update(&sha1_ctx, &device->has_a64_buffer_access, sizeof(device->has_a64_buffer_access)); + _mesa_sha1_update(&sha1_ctx, &device->has_bindless_images, + sizeof(device->has_bindless_images)); + _mesa_sha1_update(&sha1_ctx, &device->has_bindless_samplers, + sizeof(device->has_bindless_samplers)); _mesa_sha1_final(&sha1_ctx, sha1); memcpy(device->pipeline_cache_uuid, sha1, VK_UUID_SIZE); @@ -464,6 +468,19 @@ anv_physical_device_init(struct anv_physical_device *device, device->has_a64_buffer_access = device->info.gen >= 8 && device->use_softpin; + /* We first get bindless image access on Skylake and we can only really do + * it if we don't have any relocations so we need softpin. 
+ */ + device->has_bindless_images = device->info.gen >= 9 && + device->use_softpin; + + /* We've had bindless samplers since Ivy Bridge (forever in Vulkan terms) + * because it's just a matter of setting the sampler address in the sample + * message header. However, we've not bothered to wire it up for vec4 so + * we leave it disabled on gen7. + */ + device->has_bindless_samplers = device->info.gen >= 8; + /* Starting with Gen10, the timestamp frequency of the command streamer may * vary from one part to another. We can query the value from the kernel. */ @@ -1114,8 +1131,11 @@ void anv_GetPhysicalDeviceProperties( (1ul << 30) : (1ul << 27); const uint32_t max_ssbos = pdevice->has_a64_buffer_access ? UINT16_MAX : 64; - const uint32_t max_samplers = (devinfo->gen >= 8 || devinfo->is_haswell) ? - 128 : 16; + const uint32_t max_textures = + pdevice->has_bindless_images ? UINT16_MAX : 128; + const uint32_t max_samplers = + pdevice->has_bindless_samplers ? UINT16_MAX : + (devinfo->gen >= 8 || devinfo->is_haswell) ? 
128 : 16; /* The moment we have anything bindless, claim a high per-stage limit */ const uint32_t max_per_stage = @@ -1144,7 +1164,7 @@ void anv_GetPhysicalDeviceProperties( .maxPerStageDescriptorSamplers = max_samplers, .maxPerStageDescriptorUniformBuffers = 64, .maxPerStageDescriptorStorageBuffers = max_ssbos, - .maxPerStageDescriptorSampledImages = max_samplers, + .maxPerStageDescriptorSampledImages = max_textures, .maxPerStageDescriptorStorageImages = MAX_IMAGES, .maxPerStageDescriptorInputAttachments = 64, .maxPerStageResources = max_per_stage, @@ -1153,7 +1173,7 @@ void anv_GetPhysicalDeviceProperties( .maxDescriptorSetUniformBuffersDynamic = MAX_DYNAMIC_BUFFERS / 2, .maxDescriptorSetStorageBuffers = 6 * max_ssbos, /* number of stages * maxPerStageDescriptorStorageBuffers */ .maxDescriptorSetStorageBuffersDynamic = MAX_DYNAMIC_BUFFERS / 2, - .maxDescriptorSetSampledImages = 6 * max_samplers, /* number of stages * maxPerStageDescriptorSampledImages */ + .maxDescriptorSetSampledImages = 6 * max_textures, /* number of stages * maxPerStageDescriptorSampledImages */ .maxDescriptorSetStorageImages = 6 * MAX_IMAGES, /* number of stages * maxPerStageDescriptorStorageImages */ .maxDescriptorSetInputAttachments = 256, .maxVertexInputAttributes = MAX_VBS, @@ -3408,6 +3428,11 @@ void anv_DestroySampler( if (!sampler) return; + if (sampler->bindless_state.map) { + anv_state_pool_free(&device->dynamic_state_pool, + sampler->bindless_state); + } + vk_free2(&device->alloc, pAllocator, sampler); } diff --git a/src/intel/vulkan/anv_nir_apply_pipeline_layout.c b/src/intel/vulkan/anv_nir_apply_pipeline_layout.c index ab0103cfcd4..800ed2ef3e2 100644 --- a/src/intel/vulkan/anv_nir_apply_pipeline_layout.c +++ b/src/intel/vulkan/anv_nir_apply_pipeline_layout.c @@ -760,39 +760,64 @@ lower_tex_deref(nir_tex_instr *tex, nir_tex_src_type deref_src_type, unsigned array_size = state->layout->set[set].layout->binding[binding].array_size; - nir_tex_src_type offset_src_type; + unsigned 
binding_offset; if (deref_src_type == nir_tex_src_texture_deref) { - offset_src_type = nir_tex_src_texture_offset; - *base_index = state->set[set].surface_offsets[binding] + plane; + binding_offset = state->set[set].surface_offsets[binding]; } else { assert(deref_src_type == nir_tex_src_sampler_deref); - offset_src_type = nir_tex_src_sampler_offset; - *base_index = state->set[set].sampler_offsets[binding] + plane; + binding_offset = state->set[set].sampler_offsets[binding]; } + nir_builder *b = &state->builder; + + nir_tex_src_type offset_src_type; nir_ssa_def *index = NULL; - if (deref->deref_type != nir_deref_type_var) { - assert(deref->deref_type == nir_deref_type_array); + if (binding_offset > MAX_BINDING_TABLE_SIZE) { + const unsigned plane_offset = + plane * sizeof(struct anv_sampled_image_descriptor); - if (nir_src_is_const(deref->arr.index)) { - unsigned arr_index = nir_src_as_uint(deref->arr.index); - *base_index += MIN2(arr_index, array_size - 1); + nir_ssa_def *desc = + build_descriptor_load(deref, plane_offset, 2, 32, state); + + if (deref_src_type == nir_tex_src_texture_deref) { + offset_src_type = nir_tex_src_texture_handle; + index = nir_channel(b, desc, 0); } else { - nir_builder *b = &state->builder; - - /* From VK_KHR_sampler_ycbcr_conversion: - * - * If sampler Y’CBCR conversion is enabled, the combined image - * sampler must be indexed only by constant integral expressions when - * aggregated into arrays in shader code, irrespective of the - * shaderSampledImageArrayDynamicIndexing feature. 
- */ - assert(nir_tex_instr_src_index(tex, nir_tex_src_plane) == -1); + assert(deref_src_type == nir_tex_src_sampler_deref); + offset_src_type = nir_tex_src_sampler_handle; + index = nir_channel(b, desc, 1); + } + } else { + if (deref_src_type == nir_tex_src_texture_deref) { + offset_src_type = nir_tex_src_texture_offset; + } else { + assert(deref_src_type == nir_tex_src_sampler_deref); + offset_src_type = nir_tex_src_sampler_offset; + } - index = nir_ssa_for_src(b, deref->arr.index, 1); + *base_index = binding_offset + plane; - if (state->add_bounds_checks) - index = nir_umin(b, index, nir_imm_int(b, array_size - 1)); + if (deref->deref_type != nir_deref_type_var) { + assert(deref->deref_type == nir_deref_type_array); + + if (nir_src_is_const(deref->arr.index)) { + unsigned arr_index = nir_src_as_uint(deref->arr.index); + *base_index += MIN2(arr_index, array_size - 1); + } else { + /* From VK_KHR_sampler_ycbcr_conversion: + * + * If sampler Y’CBCR conversion is enabled, the combined image + * sampler must be indexed only by constant integral expressions + * when aggregated into arrays in shader code, irrespective of + * the shaderSampledImageArrayDynamicIndexing feature. + */ + assert(nir_tex_instr_src_index(tex, nir_tex_src_plane) == -1); + + index = nir_ssa_for_src(b, deref->arr.index, 1); + + if (state->add_bounds_checks) + index = nir_umin(b, index, nir_imm_int(b, array_size - 1)); + } } } @@ -1062,6 +1087,10 @@ anv_nir_apply_pipeline_layout(const struct anv_physical_device *pdevice, anv_descriptor_requires_bindless(pdevice, binding, true)) { /* If this descriptor doesn't fit in the binding table or if it * requires bindless for some reason, flag it as bindless. + * + * We also make large sampler arrays bindless because we can avoid + * using indirect sends thanks to bindless samplers being packed + * less tightly than the sampler table. 
*/ assert(anv_descriptor_supports_bindless(pdevice, binding, true)); state.set[set].sampler_offsets[b] = BINDLESS_OFFSET; diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index bb24ff1ae82..9f525d1e21a 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -953,6 +953,10 @@ struct anv_physical_device { /** True if we can access buffers using A64 messages */ bool has_a64_buffer_access; + /** True if we can use bindless access for images */ + bool has_bindless_images; + /** True if we can use bindless access for samplers */ + bool has_bindless_samplers; struct anv_device_extension_table supported_extensions; @@ -1521,6 +1525,27 @@ struct anv_vue_header { float PointWidth; }; +/** Struct representing a sampled image descriptor + * + * This descriptor layout is used for sampled images, bare sampler, and + * combined image/sampler descriptors. + */ +struct anv_sampled_image_descriptor { + /** Bindless image handle + * + * This is expected to already be shifted such that the 20-bit + * SURFACE_STATE table index is in the top 20 bits. + */ + uint32_t image; + + /** Bindless sampler handle + * + * This is assumed to be a 32B-aligned SAMPLER_STATE pointer relative + * to the dynamic state base address. 
+ */ + uint32_t sampler; +}; + /** Struct representing a address/range descriptor * * The fields of this struct correspond directly to the data layout of @@ -1547,6 +1572,8 @@ enum anv_descriptor_data { ANV_DESCRIPTOR_INLINE_UNIFORM = (1 << 4), /** anv_address_range_descriptor with a buffer address and range */ ANV_DESCRIPTOR_ADDRESS_RANGE = (1 << 5), + /** Bindless surface handle */ + ANV_DESCRIPTOR_SAMPLED_IMAGE = (1 << 6), }; struct anv_descriptor_set_binding_layout { @@ -3454,6 +3481,11 @@ struct anv_sampler { uint32_t state[3][4]; uint32_t n_planes; struct anv_ycbcr_conversion *conversion; + + /* Blob of sampler state data which is guaranteed to be 32-byte aligned + * and with a 32-byte stride for use as bindless samplers. + */ + struct anv_state bindless_state; }; struct anv_framebuffer { diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index 3189585cbd3..1af36bced24 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -110,10 +110,18 @@ genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer) sba.InstructionBuffersizeModifyEnable = true; # endif # if (GEN_GEN >= 9) - sba.BindlessSurfaceStateBaseAddress = (struct anv_address) { NULL, 0 }; + if (cmd_buffer->device->instance->physicalDevice.use_softpin) { + sba.BindlessSurfaceStateBaseAddress = (struct anv_address) { + .bo = device->surface_state_pool.block_pool.bo, + .offset = 0, + }; + sba.BindlessSurfaceStateSize = (1 << 20) - 1; + } else { + sba.BindlessSurfaceStateBaseAddress = ANV_NULL_ADDRESS; + sba.BindlessSurfaceStateSize = 0; + } sba.BindlessSurfaceStateMOCS = GENX(MOCS); sba.BindlessSurfaceStateBaseAddressModifyEnable = true; - sba.BindlessSurfaceStateSize = 0; # endif # if (GEN_GEN >= 10) sba.BindlessSamplerStateBaseAddress = (struct anv_address) { NULL, 0 }; diff --git a/src/intel/vulkan/genX_state.c b/src/intel/vulkan/genX_state.c index 283cd8c501a..9276dc9470b 100644 --- a/src/intel/vulkan/genX_state.c 
+++ b/src/intel/vulkan/genX_state.c @@ -328,6 +328,8 @@ VkResult genX(CreateSampler)( VkSampler* pSampler) { ANV_FROM_HANDLE(anv_device, device, _device); + const struct anv_physical_device *pdevice = + &device->instance->physicalDevice; struct anv_sampler *sampler; assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO); @@ -383,6 +385,17 @@ VkResult genX(CreateSampler)( } } + if (pdevice->has_bindless_samplers) { + /* If we have bindless, allocate enough samplers. We allocate 32 bytes + * for each sampler instead of 16 bytes because we want all bindless + * samplers to be 32-byte aligned so we don't have to use indirect + * sampler messages on them. + */ + sampler->bindless_state = + anv_state_pool_alloc(&device->dynamic_state_pool, + sampler->n_planes * 32, 32); + } + for (unsigned p = 0; p < sampler->n_planes; p++) { const bool plane_has_chroma = sampler->conversion && sampler->conversion->format->planes[p].has_chroma; @@ -452,6 +465,11 @@ VkResult genX(CreateSampler)( }; GENX(SAMPLER_STATE_pack)(NULL, sampler->state[p], &sampler_state); + + if (sampler->bindless_state.map) { + memcpy(sampler->bindless_state.map + p * 32, + sampler->state[p], GENX(SAMPLER_STATE_length) * 4); + } } *pSampler = anv_sampler_to_handle(sampler); -- 2.30.2