From c520f4dec9cbedd4132143f52411df18f97869e6 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand
Date: Mon, 19 Nov 2018 14:28:39 -0600
Subject: [PATCH] anv: Add a concept of a descriptor buffer

This buffer goes alongside the CPU data structure and may contain
pointers, bindless handles, or any other descriptor information.
Currently, all descriptors are size zero and nothing goes in the buffer,
but this commit sets up the framework we will need later.

Reviewed-by: Lionel Landwerlin
---
 src/intel/vulkan/anv_cmd_buffer.c              |  31 ++++
 src/intel/vulkan/anv_descriptor_set.c          | 142 ++++++++++++++++++
 .../vulkan/anv_nir_apply_pipeline_layout.c     |  28 ++++
 src/intel/vulkan/anv_private.h                 |  28 ++++
 src/intel/vulkan/genX_cmd_buffer.c             |  52 +++++++
 5 files changed, 281 insertions(+)

diff --git a/src/intel/vulkan/anv_cmd_buffer.c b/src/intel/vulkan/anv_cmd_buffer.c
index adb8b3ddffe..f265a8bfc89 100644
--- a/src/intel/vulkan/anv_cmd_buffer.c
+++ b/src/intel/vulkan/anv_cmd_buffer.c
@@ -1001,6 +1001,37 @@ anv_cmd_buffer_push_descriptor_set(struct anv_cmd_buffer *cmd_buffer,
    set->buffer_view_count = layout->buffer_view_count;
    set->buffer_views = (*push_set)->buffer_views;

+   if (layout->descriptor_buffer_size &&
+       ((*push_set)->set_used_on_gpu ||
+        set->desc_mem.alloc_size < layout->descriptor_buffer_size)) {
+      /* The previous buffer is either actively used by some GPU command (so
+       * we can't modify it) or is too small. Allocate a new one.
+       */
+      struct anv_state desc_mem =
+         anv_state_stream_alloc(&cmd_buffer->dynamic_state_stream,
+                                layout->descriptor_buffer_size, 32);
+      if (set->desc_mem.alloc_size) {
+         /* TODO: Do we really need to copy all the time? */
+         memcpy(desc_mem.map, set->desc_mem.map,
+                MIN2(desc_mem.alloc_size, set->desc_mem.alloc_size));
+      }
+      set->desc_mem = desc_mem;
+
+      struct anv_address addr = {
+         .bo = cmd_buffer->dynamic_state_stream.state_pool->block_pool.bo,
+         .offset = set->desc_mem.offset,
+      };
+
+      const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
+      set->desc_surface_state =
+         anv_state_stream_alloc(&cmd_buffer->surface_state_stream,
+                                isl_dev->ss.size, isl_dev->ss.align);
+      anv_fill_buffer_surface_state(cmd_buffer->device,
+                                    set->desc_surface_state,
+                                    ISL_FORMAT_R32G32B32A32_FLOAT,
+                                    addr, layout->descriptor_buffer_size, 1);
+   }
+
    return set;
 }

diff --git a/src/intel/vulkan/anv_descriptor_set.c b/src/intel/vulkan/anv_descriptor_set.c
index 964180c5f96..3f0e3235500 100644
--- a/src/intel/vulkan/anv_descriptor_set.c
+++ b/src/intel/vulkan/anv_descriptor_set.c
@@ -82,6 +82,33 @@ anv_descriptor_data_for_type(const struct anv_physical_device *device,
    return data;
 }

+static unsigned
+anv_descriptor_data_size(enum anv_descriptor_data data)
+{
+   return 0;
+}
+
+/** Returns the size in bytes of each descriptor with the given layout */
+unsigned
+anv_descriptor_size(const struct anv_descriptor_set_binding_layout *layout)
+{
+   return anv_descriptor_data_size(layout->data);
+}
+
+/** Returns the size in bytes of each descriptor of the given type
+ *
+ * This version of the function does not have access to the entire layout so
+ * it may only work on certain descriptor types where the descriptor size is
+ * entirely determined by the descriptor type. Whenever possible, code should
+ * use anv_descriptor_size() instead.
+ */
+unsigned
+anv_descriptor_type_size(const struct anv_physical_device *pdevice,
+                         VkDescriptorType type)
+{
+   return anv_descriptor_data_size(anv_descriptor_data_for_type(pdevice, type));
+}
+
 void anv_GetDescriptorSetLayoutSupport(
     VkDevice                                    device,
     const VkDescriptorSetLayoutCreateInfo*      pCreateInfo,
@@ -198,6 +225,7 @@ VkResult anv_CreateDescriptorSetLayout(

    uint32_t buffer_view_count = 0;
    uint32_t dynamic_offset_count = 0;
+   uint32_t descriptor_buffer_size = 0;

    for (uint32_t j = 0; j < pCreateInfo->bindingCount; j++) {
       const VkDescriptorSetLayoutBinding *binding = &pCreateInfo->pBindings[j];
@@ -267,11 +295,16 @@ VkResult anv_CreateDescriptorSetLayout(
          break;
       }

+      set_layout->binding[b].descriptor_offset = descriptor_buffer_size;
+      descriptor_buffer_size += anv_descriptor_size(&set_layout->binding[b]) *
+                                binding->descriptorCount;
+
       set_layout->shader_stages |= binding->stageFlags;
    }

    set_layout->buffer_view_count = buffer_view_count;
    set_layout->dynamic_offset_count = dynamic_offset_count;
+   set_layout->descriptor_buffer_size = descriptor_buffer_size;

    *pSetLayout = anv_descriptor_set_layout_to_handle(set_layout);

@@ -315,6 +348,7 @@ sha1_update_descriptor_set_binding_layout(struct mesa_sha1 *ctx,
    SHA1_UPDATE_VALUE(ctx, layout->descriptor_index);
    SHA1_UPDATE_VALUE(ctx, layout->dynamic_offset_index);
    SHA1_UPDATE_VALUE(ctx, layout->buffer_view_index);
+   SHA1_UPDATE_VALUE(ctx, layout->descriptor_offset);

    if (layout->immutable_samplers) {
       for (uint16_t i = 0; i < layout->array_size; i++)
@@ -331,6 +365,7 @@ sha1_update_descriptor_set_layout(struct mesa_sha1 *ctx,
    SHA1_UPDATE_VALUE(ctx, layout->shader_stages);
    SHA1_UPDATE_VALUE(ctx, layout->buffer_view_count);
    SHA1_UPDATE_VALUE(ctx, layout->dynamic_offset_count);
+   SHA1_UPDATE_VALUE(ctx, layout->descriptor_buffer_size);

    for (uint16_t i = 0; i < layout->binding_count; i++)
       sha1_update_descriptor_set_binding_layout(ctx, &layout->binding[i]);
@@ -420,6 +455,12 @@ void anv_DestroyPipelineLayout(
  * and the free lists lets us recycle blocks for case 2).
  */

+/* The vma heap reserves 0 to mean NULL; we have to offset by some amount to
+ * ensure we can allocate the entire BO without hitting zero. The actual
+ * amount doesn't matter.
+ */
+#define POOL_HEAP_OFFSET 64
+
 #define EMPTY 1

 VkResult anv_CreateDescriptorPool(
@@ -433,6 +474,7 @@ VkResult anv_CreateDescriptorPool(

    uint32_t descriptor_count = 0;
    uint32_t buffer_view_count = 0;
+   uint32_t descriptor_bo_size = 0;
    for (uint32_t i = 0; i < pCreateInfo->poolSizeCount; i++) {
       enum anv_descriptor_data desc_data =
          anv_descriptor_data_for_type(&device->instance->physicalDevice,
@@ -441,8 +483,22 @@ VkResult anv_CreateDescriptorPool(
       if (desc_data & ANV_DESCRIPTOR_BUFFER_VIEW)
          buffer_view_count += pCreateInfo->pPoolSizes[i].descriptorCount;

+      unsigned desc_data_size = anv_descriptor_data_size(desc_data) *
+                                pCreateInfo->pPoolSizes[i].descriptorCount;
+      descriptor_bo_size += desc_data_size;
+
       descriptor_count += pCreateInfo->pPoolSizes[i].descriptorCount;
    }

+   /* We have to align descriptor buffer allocations to 32B so that we can
+    * push descriptor buffers. This means that each descriptor buffer
+    * allocated may burn up to 32B of extra space to get the right alignment.
+    * (Technically, it's at most 28B because we're always going to start at
+    * least 4B aligned but we're being conservative here.) Allocate enough
+    * extra space that we can chop it into maxSets pieces and align each one
+    * of them to 32B.
+    */
+   descriptor_bo_size += 32 * pCreateInfo->maxSets;
+   descriptor_bo_size = ALIGN(descriptor_bo_size, 4096);
+
    const size_t pool_size =
       pCreateInfo->maxSets * sizeof(struct anv_descriptor_set) +
@@ -459,6 +515,33 @@ VkResult anv_CreateDescriptorPool(
    pool->next = 0;
    pool->free_list = EMPTY;

+   if (descriptor_bo_size > 0) {
+      VkResult result = anv_bo_init_new(&pool->bo, device, descriptor_bo_size);
+      if (result != VK_SUCCESS) {
+         vk_free2(&device->alloc, pAllocator, pool);
+         return result;
+      }
+
+      anv_gem_set_caching(device, pool->bo.gem_handle, I915_CACHING_CACHED);
+
+      pool->bo.map = anv_gem_mmap(device, pool->bo.gem_handle, 0,
+                                  descriptor_bo_size, 0);
+      if (pool->bo.map == NULL) {
+         anv_gem_close(device, pool->bo.gem_handle);
+         vk_free2(&device->alloc, pAllocator, pool);
+         return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+      }
+
+      if (device->instance->physicalDevice.use_softpin) {
+         pool->bo.flags |= EXEC_OBJECT_PINNED;
+         anv_vma_alloc(device, &pool->bo);
+      }
+
+      util_vma_heap_init(&pool->bo_heap, POOL_HEAP_OFFSET, descriptor_bo_size);
+   } else {
+      pool->bo.size = 0;
+   }
+
    anv_state_stream_init(&pool->surface_state_stream,
                          &device->surface_state_pool, 4096);
    pool->surface_state_free_list = NULL;
@@ -479,6 +562,11 @@ void anv_DestroyDescriptorPool(
    if (!pool)
       return;

+   if (pool->bo.size) {
+      anv_gem_munmap(pool->bo.map, pool->bo.size);
+      anv_vma_free(device, &pool->bo);
+      anv_gem_close(device, pool->bo.gem_handle);
+   }
    anv_state_stream_finish(&pool->surface_state_stream);
    vk_free2(&device->alloc, pAllocator, pool);
 }
@@ -493,6 +581,12 @@ VkResult anv_ResetDescriptorPool(

    pool->next = 0;
    pool->free_list = EMPTY;
+
+   if (pool->bo.size) {
+      util_vma_heap_finish(&pool->bo_heap);
+      util_vma_heap_init(&pool->bo_heap, POOL_HEAP_OFFSET, pool->bo.size);
+   }
+
    anv_state_stream_finish(&pool->surface_state_stream);
    anv_state_stream_init(&pool->surface_state_stream,
                          &device->surface_state_pool, 4096);
@@ -606,6 +700,37 @@ anv_descriptor_set_create(struct anv_device *device,
    if (result != VK_SUCCESS)
       return result;

+   if (layout->descriptor_buffer_size) {
+      /* Align the size to 32 so that alignment gaps don't cause extra holes
+       * in the heap which can lead to bad performance.
+       */
+      uint64_t pool_vma_offset =
+         util_vma_heap_alloc(&pool->bo_heap,
+                             ALIGN(layout->descriptor_buffer_size, 32), 32);
+      if (pool_vma_offset == 0) {
+         anv_descriptor_pool_free_set(pool, set);
+         return vk_error(VK_ERROR_FRAGMENTED_POOL);
+      }
+      assert(pool_vma_offset >= POOL_HEAP_OFFSET &&
+             pool_vma_offset - POOL_HEAP_OFFSET <= INT32_MAX);
+      set->desc_mem.offset = pool_vma_offset - POOL_HEAP_OFFSET;
+      set->desc_mem.alloc_size = layout->descriptor_buffer_size;
+      set->desc_mem.map = pool->bo.map + set->desc_mem.offset;
+
+      set->desc_surface_state = anv_descriptor_pool_alloc_state(pool);
+      anv_fill_buffer_surface_state(device, set->desc_surface_state,
+                                    ISL_FORMAT_R32G32B32A32_FLOAT,
+                                    (struct anv_address) {
+                                       .bo = &pool->bo,
+                                       .offset = set->desc_mem.offset,
+                                    },
+                                    layout->descriptor_buffer_size, 1);
+   } else {
+      set->desc_mem = ANV_STATE_NULL;
+      set->desc_surface_state = ANV_STATE_NULL;
+   }
+
+   set->pool = pool;
    set->layout = layout;
    anv_descriptor_set_layout_ref(layout);

@@ -656,6 +781,13 @@ anv_descriptor_set_destroy(struct anv_device *device,
 {
    anv_descriptor_set_layout_unref(device, set->layout);

+   if (set->desc_mem.alloc_size) {
+      util_vma_heap_free(&pool->bo_heap,
+                         (uint64_t)set->desc_mem.offset + POOL_HEAP_OFFSET,
+                         set->desc_mem.alloc_size);
+      anv_descriptor_pool_free_state(pool, set->desc_surface_state);
+   }
+
    for (uint32_t b = 0; b < set->buffer_view_count; b++)
       anv_descriptor_pool_free_state(pool, set->buffer_views[b].surface_state);

@@ -925,6 +1057,16 @@ void anv_UpdateDescriptorSets(

       for (uint32_t j = 0; j < copy->descriptorCount; j++)
          dst_desc[j] = src_desc[j];
+
+      unsigned desc_size = anv_descriptor_size(src_layout);
+      if (desc_size > 0) {
+         assert(desc_size == anv_descriptor_size(dst_layout));
+         memcpy(dst->desc_mem.map + dst_layout->descriptor_offset +
+                copy->dstArrayElement * desc_size,
+                src->desc_mem.map + src_layout->descriptor_offset +
+                copy->srcArrayElement * desc_size,
+                copy->descriptorCount * desc_size);
+      }
    }
 }
diff --git a/src/intel/vulkan/anv_nir_apply_pipeline_layout.c b/src/intel/vulkan/anv_nir_apply_pipeline_layout.c
index 89f4bb7899c..1cb3ef51b30 100644
--- a/src/intel/vulkan/anv_nir_apply_pipeline_layout.c
+++ b/src/intel/vulkan/anv_nir_apply_pipeline_layout.c
@@ -27,6 +27,8 @@
 #include "compiler/brw_nir.h"

 struct apply_pipeline_layout_state {
+   const struct anv_physical_device *pdevice;
+
    nir_shader *shader;
    nir_builder builder;

@@ -38,6 +40,9 @@ struct apply_pipeline_layout_state {
    bool uses_constants;
    uint8_t constants_offset;
    struct {
+      bool desc_buffer_used;
+      uint8_t desc_offset;
+
       BITSET_WORD *used;
       uint8_t *surface_offsets;
       uint8_t *sampler_offsets;
@@ -49,7 +54,17 @@ static void
 add_binding(struct apply_pipeline_layout_state *state,
             uint32_t set, uint32_t binding)
 {
+   const struct anv_descriptor_set_binding_layout *bind_layout =
+      &state->layout->set[set].layout->binding[binding];
+
    BITSET_SET(state->set[set].used, binding);
+
+   /* Only flag the descriptor buffer as used if there's actually data for
+    * this binding. This lets us be lazy and call this function constantly
+    * without worrying about unnecessarily enabling the buffer.
+    */
+   if (anv_descriptor_size(bind_layout))
+      state->set[set].desc_buffer_used = true;
 }

 static void
@@ -440,6 +455,7 @@ anv_nir_apply_pipeline_layout(const struct anv_physical_device *pdevice,
                               struct anv_pipeline_bind_map *map)
 {
    struct apply_pipeline_layout_state state = {
+      .pdevice = pdevice,
       .shader = shader,
       .layout = layout,
       .add_bounds_checks = robust_buffer_access,
@@ -464,6 +480,18 @@ anv_nir_apply_pipeline_layout(const struct anv_physical_device *pdevice,
       get_used_bindings_block(block, &state);
    }

+   for (unsigned s = 0; s < layout->num_sets; s++) {
+      if (state.set[s].desc_buffer_used) {
+         map->surface_to_descriptor[map->surface_count] =
+            (struct anv_pipeline_binding) {
+               .set = ANV_DESCRIPTOR_SET_DESCRIPTORS,
+               .binding = s,
+            };
+         state.set[s].desc_offset = map->surface_count;
+         map->surface_count++;
+      }
+   }
+
    if (state.uses_constants) {
       state.constants_offset = map->surface_count;
       map->surface_to_descriptor[map->surface_count].set =
diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
index 0573b99bab6..cd8414ac01f 100644
--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -1530,10 +1530,18 @@ struct anv_descriptor_set_binding_layout {
    /* Index into the descriptor set buffer views */
    int16_t buffer_view_index;

+   /* Offset into the descriptor buffer where this descriptor lives */
+   uint32_t descriptor_offset;
+
    /* Immutable samplers (or NULL if no immutable samplers) */
    struct anv_sampler **immutable_samplers;
 };

+unsigned anv_descriptor_size(const struct anv_descriptor_set_binding_layout *layout);
+
+unsigned anv_descriptor_type_size(const struct anv_physical_device *pdevice,
+                                  VkDescriptorType type);
+
 struct anv_descriptor_set_layout {
    /* Descriptor set layouts can be destroyed at almost any time */
    uint32_t ref_cnt;
@@ -1553,6 +1561,9 @@ struct anv_descriptor_set_layout {
    /* Number of dynamic offsets used by this descriptor set */
    uint16_t dynamic_offset_count;

+   /* Size of the descriptor buffer for this descriptor set */
+   uint32_t descriptor_buffer_size;
+
    /* Bindings in this descriptor set */
    struct anv_descriptor_set_binding_layout binding[0];
 };
@@ -1594,8 +1605,15 @@ struct anv_descriptor {
 };

 struct anv_descriptor_set {
+   struct anv_descriptor_pool *pool;
    struct anv_descriptor_set_layout *layout;
    uint32_t size;
+
+   /* State relative to anv_descriptor_pool::bo */
+   struct anv_state desc_mem;
+   /* Surface state for the descriptor buffer */
+   struct anv_state desc_surface_state;
+
    uint32_t buffer_view_count;
    struct anv_buffer_view *buffer_views;
    struct anv_descriptor descriptors[0];
@@ -1620,6 +1638,12 @@ struct anv_push_descriptor_set {
    /* Put this field right behind anv_descriptor_set so it fills up the
     * descriptors[0] field. */
    struct anv_descriptor descriptors[MAX_PUSH_DESCRIPTORS];
+
+   /** True if the descriptor set buffer has been referenced by a draw or
+    * dispatch command.
+    */
+   bool set_used_on_gpu;
+
    struct anv_buffer_view buffer_views[MAX_PUSH_DESCRIPTORS];
 };

@@ -1628,6 +1652,9 @@ struct anv_descriptor_pool {
    uint32_t next;
    uint32_t free_list;

+   struct anv_bo bo;
+   struct util_vma_heap bo_heap;
+
    struct anv_state_stream surface_state_stream;
    void *surface_state_free_list;

@@ -1724,6 +1751,7 @@ anv_descriptor_set_destroy(struct anv_device *device,
                            struct anv_descriptor_pool *pool,
                            struct anv_descriptor_set *set);

+#define ANV_DESCRIPTOR_SET_DESCRIPTORS (UINT8_MAX - 3)
 #define ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS (UINT8_MAX - 2)
 #define ANV_DESCRIPTOR_SET_SHADER_CONSTANTS (UINT8_MAX - 1)
 #define ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS UINT8_MAX
diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c
index b5fc8be9475..7687507e6b7 100644
--- a/src/intel/vulkan/genX_cmd_buffer.c
+++ b/src/intel/vulkan/genX_cmd_buffer.c
@@ -2029,6 +2029,31 @@ dynamic_offset_for_binding(const struct anv_cmd_pipeline_state *pipe_state,
    return pipe_state->dynamic_offsets[dynamic_offset_idx];
 }

+static struct anv_address
+anv_descriptor_set_address(struct anv_cmd_buffer *cmd_buffer,
+                           struct anv_descriptor_set *set)
+{
+   if (set->pool) {
+      /* This is a normal descriptor set */
+      return (struct anv_address) {
+         .bo = &set->pool->bo,
+         .offset = set->desc_mem.offset,
+      };
+   } else {
+      /* This is a push descriptor set. We have to flag it as used on the GPU
+       * so that the next time we push descriptors, we grab new memory.
+       */
+      struct anv_push_descriptor_set *push_set =
+         (struct anv_push_descriptor_set *)set;
+      push_set->set_used_on_gpu = true;
+
+      return (struct anv_address) {
+         .bo = cmd_buffer->dynamic_state_stream.state_pool->block_pool.bo,
+         .offset = set->desc_mem.offset,
+      };
+   }
+}
+
 static VkResult
 emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
                    gl_shader_stage stage,
@@ -2149,6 +2174,18 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
          add_surface_reloc(cmd_buffer, surface_state,
                            cmd_buffer->state.compute.num_workgroups);
          continue;
+      } else if (binding->set == ANV_DESCRIPTOR_SET_DESCRIPTORS) {
+         /* This is a descriptor set buffer so the set index is actually
+          * given by binding->binding. (Yes, that's confusing.)
+          */
+         struct anv_descriptor_set *set =
+            pipe_state->descriptors[binding->binding];
+         assert(set->desc_mem.alloc_size);
+         assert(set->desc_surface_state.alloc_size);
+         bt_map[s] = set->desc_surface_state.offset + state_offset;
+         add_surface_reloc(cmd_buffer, set->desc_surface_state,
+                           anv_descriptor_set_address(cmd_buffer, set));
+         continue;
       }

       const struct anv_descriptor *desc =
@@ -2518,6 +2555,21 @@ cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer,
                             DIV_ROUND_UP(constant_data_size, 32) - range->start);
             read_addr = anv_address_add(constant_data, range->start * 32);
+         } else if (binding->set == ANV_DESCRIPTOR_SET_DESCRIPTORS) {
+            /* This is a descriptor set buffer so the set index is
+             * actually given by binding->binding. (Yes, that's
+             * confusing.)
+             */
+            struct anv_descriptor_set *set =
+               gfx_state->base.descriptors[binding->binding];
+            struct anv_address desc_buffer_addr =
+               anv_descriptor_set_address(cmd_buffer, set);
+            const unsigned desc_buffer_size = set->desc_mem.alloc_size;
+
+            read_len = MIN2(range->length,
+                            DIV_ROUND_UP(desc_buffer_size, 32) - range->start);
+            read_addr = anv_address_add(desc_buffer_addr,
+                                        range->start * 32);
          } else {
             const struct anv_descriptor *desc =
                anv_descriptor_for_binding(&gfx_state->base, binding);
-- 
2.30.2
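
Editor's note (not part of the patch): for readers new to this code, the standalone sketch below walks through the two pieces of bookkeeping the patch introduces: the per-binding descriptor_offset / descriptor_buffer_size accounting added to anv_CreateDescriptorSetLayout() and the descriptor pool BO sizing added to anv_CreateDescriptorPool(). The struct toy_binding type, the example counts, and the nonzero descriptor sizes are all made up for illustration; with this commit applied, anv_descriptor_data_size() still returns 0, so every offset collapses to zero and the pool BO only carries the 32B-per-set alignment padding described in the comment.

/* Hypothetical illustration of the patch's size/offset bookkeeping.
 * toy_binding and the sizes below are NOT driver code. */
#include <stdint.h>
#include <stdio.h>

#define ALIGN(v, a) (((v) + (a) - 1) & ~((a) - 1))

struct toy_binding {
   uint32_t descriptor_count;
   uint32_t descriptor_size;    /* stand-in for anv_descriptor_size() */
   uint32_t descriptor_offset;  /* computed below, as in the layout code */
};

int main(void)
{
   struct toy_binding bindings[] = {
      { .descriptor_count = 4, .descriptor_size = 8 },
      { .descriptor_count = 2, .descriptor_size = 16 },
   };
   const unsigned num_bindings = sizeof(bindings) / sizeof(bindings[0]);

   /* Mirrors the loop added to anv_CreateDescriptorSetLayout(): each binding
    * records the running size as its offset, then advances it by
    * descriptor_size * descriptor_count. */
   uint32_t descriptor_buffer_size = 0;
   for (unsigned b = 0; b < num_bindings; b++) {
      bindings[b].descriptor_offset = descriptor_buffer_size;
      descriptor_buffer_size +=
         bindings[b].descriptor_size * bindings[b].descriptor_count;
   }

   /* Simplified stand-in for the pool sizing in anv_CreateDescriptorPool():
    * the real code sums anv_descriptor_data_size() over the VkDescriptorPoolSize
    * entries; here we just assume max_sets copies of the one layout.  The
    * 32B-per-set slop and the 4096 rounding match the comment in the patch. */
   const uint32_t max_sets = 8;
   uint32_t descriptor_bo_size = descriptor_buffer_size * max_sets;
   descriptor_bo_size += 32 * max_sets;
   descriptor_bo_size = ALIGN(descriptor_bo_size, 4096);

   for (unsigned b = 0; b < num_bindings; b++)
      printf("binding %u: offset %u\n", b, bindings[b].descriptor_offset);
   printf("per-set buffer size: %u bytes\n", descriptor_buffer_size);
   printf("pool BO size: %u bytes\n", descriptor_bo_size);
   return 0;
}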