From: Bas Nieuwenhuizen Date: Thu, 14 Mar 2019 10:20:53 +0000 (+0100) Subject: radv: Support VK_EXT_inline_uniform_block. X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=8d2654a4197bbf45cbe9f72e82f025d04cda7bc2;p=mesa.git radv: Support VK_EXT_inline_uniform_block. Basically just reserve the memory in the descriptor sets. On the shader side we construct a buffer descriptor, since AFAIU VGPR indexing on 32-bit pointers in LLVM is still broken. This fully supports update after bind and variable descriptor set sizes. However, the limits are somewhat arbitrary and are mostly about finding a reasonable division of a 2 GiB max memory size over the set. v2: - rebased on top of master (Samuel) - remove the loading resources rework (Samuel) - only load UBO descriptors if it's a pointer (Samuel) - use LLVMBuildPtrToInt to avoid IR failures (Samuel) Reviewed-by: Bas Nieuwenhuizen (v2) --- diff --git a/src/amd/vulkan/radv_descriptor_set.c b/src/amd/vulkan/radv_descriptor_set.c index 68171b5d244..6c6b88a4553 100644 --- a/src/amd/vulkan/radv_descriptor_set.c +++ b/src/amd/vulkan/radv_descriptor_set.c @@ -127,6 +127,7 @@ VkResult radv_CreateDescriptorSetLayout( uint32_t b = binding->binding; uint32_t alignment; unsigned binding_buffer_count = 0; + uint32_t descriptor_count = binding->descriptorCount; switch (binding->descriptorType) { case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: @@ -164,6 +165,11 @@ VkResult radv_CreateDescriptorSetLayout( set_layout->binding[b].size = 16; alignment = 16; break; + case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT: + alignment = 16; + set_layout->binding[b].size = descriptor_count; + descriptor_count = 1; + break; default: unreachable("unknown descriptor type\n"); break; @@ -171,7 +177,7 @@ VkResult radv_CreateDescriptorSetLayout( set_layout->size = align(set_layout->size, alignment); set_layout->binding[b].type = binding->descriptorType; - set_layout->binding[b].array_size = binding->descriptorCount; + 
set_layout->binding[b].array_size = descriptor_count; set_layout->binding[b].offset = set_layout->size; set_layout->binding[b].buffer_offset = buffer_count; set_layout->binding[b].dynamic_offset_offset = dynamic_offset_count; @@ -207,9 +213,9 @@ VkResult radv_CreateDescriptorSetLayout( samplers_offset += 4 * sizeof(uint32_t) * binding->descriptorCount; } - set_layout->size += binding->descriptorCount * set_layout->binding[b].size; - buffer_count += binding->descriptorCount * binding_buffer_count; - dynamic_offset_count += binding->descriptorCount * + set_layout->size += descriptor_count * set_layout->binding[b].size; + buffer_count += descriptor_count * binding_buffer_count; + dynamic_offset_count += descriptor_count * set_layout->binding[b].dynamic_offset_count; set_layout->shader_stages |= binding->stageFlags; } @@ -264,6 +270,7 @@ void radv_GetDescriptorSetLayoutSupport(VkDevice device, uint64_t descriptor_size = 0; uint64_t descriptor_alignment = 1; + uint32_t descriptor_count = binding->descriptorCount; switch (binding->descriptorType) { case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: @@ -282,7 +289,7 @@ void radv_GetDescriptorSetLayoutSupport(VkDevice device, descriptor_alignment = 32; break; case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: - if (!has_equal_immutable_samplers(binding->pImmutableSamplers, binding->descriptorCount)) { + if (!has_equal_immutable_samplers(binding->pImmutableSamplers, descriptor_count)) { descriptor_size = 64; } else { descriptor_size = 96; @@ -290,11 +297,16 @@ void radv_GetDescriptorSetLayoutSupport(VkDevice device, descriptor_alignment = 32; break; case VK_DESCRIPTOR_TYPE_SAMPLER: - if (!has_equal_immutable_samplers(binding->pImmutableSamplers, binding->descriptorCount)) { + if (!has_equal_immutable_samplers(binding->pImmutableSamplers, descriptor_count)) { descriptor_size = 16; descriptor_alignment = 16; } break; + case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT: + 
descriptor_alignment = 16; + descriptor_size = descriptor_count; + descriptor_count = 1; + break; default: unreachable("unknown descriptor type\n"); break; @@ -305,18 +317,20 @@ void radv_GetDescriptorSetLayoutSupport(VkDevice device, } size = align_u64(size, descriptor_alignment); - uint64_t max_count = UINT64_MAX; - if (descriptor_size) - max_count = (UINT64_MAX - size) / descriptor_size; + uint64_t max_count = INT32_MAX; + if (binding->descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) + max_count = INT32_MAX - size; + else if (descriptor_size) + max_count = (INT32_MAX - size) / descriptor_size; - if (max_count < binding->descriptorCount) { + if (max_count < descriptor_count) { supported = false; } if (variable_flags && binding->binding <variable_flags->bindingCount && variable_count && (variable_flags->pBindingFlags[binding->binding] & VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT_EXT)) { variable_count->maxVariableDescriptorCount = MIN2(UINT32_MAX, max_count); } - size += binding->descriptorCount * descriptor_size; + size += descriptor_count * descriptor_size; } free(bindings); @@ -543,6 +557,21 @@ VkResult radv_CreateDescriptorPool( uint64_t size = sizeof(struct radv_descriptor_pool); uint64_t bo_size = 0, bo_count = 0, range_count = 0; + vk_foreach_struct(ext, pCreateInfo->pNext) { + switch (ext->sType) { + case VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_INLINE_UNIFORM_BLOCK_CREATE_INFO_EXT: { + const struct VkDescriptorPoolInlineUniformBlockCreateInfoEXT *info = + (const struct VkDescriptorPoolInlineUniformBlockCreateInfoEXT*)ext; + /* the sizes are 4 aligned, and we need to align to at + * most 32, which needs at most 28 bytes extra per + * binding. 
*/ + bo_size += 28llu * info->maxInlineUniformBlockBindings; + break; + } + default: + break; + } + } for (unsigned i = 0; i < pCreateInfo->poolSizeCount; ++i) { if (pCreateInfo->pPoolSizes[i].type != VK_DESCRIPTOR_TYPE_SAMPLER) @@ -569,6 +598,9 @@ VkResult radv_CreateDescriptorPool( case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: bo_size += 96 * pCreateInfo->pPoolSizes[i].descriptorCount; break; + case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT: + bo_size += pCreateInfo->pPoolSizes[i].descriptorCount; + break; default: unreachable("unknown descriptor type\n"); break; @@ -764,6 +796,17 @@ static void write_buffer_descriptor(struct radv_device *device, *buffer_list = buffer->bo; } +static void write_block_descriptor(struct radv_device *device, + struct radv_cmd_buffer *cmd_buffer, + void *dst, + const VkWriteDescriptorSet *writeset) +{ + const VkWriteDescriptorSetInlineUniformBlockEXT *inline_ub = + vk_find_struct_const(writeset->pNext, WRITE_DESCRIPTOR_SET_INLINE_UNIFORM_BLOCK_EXT); + + memcpy(dst, inline_ub->pData, inline_ub->dataSize); +} + static void write_dynamic_buffer_descriptor(struct radv_device *device, struct radv_descriptor_range *range, struct radeon_winsys_bo **buffer_list, @@ -862,6 +905,12 @@ void radv_update_descriptor_sets( const uint32_t *samplers = radv_immutable_samplers(set->layout, binding_layout); ptr += binding_layout->offset / 4; + + if (writeset->descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) { + write_block_descriptor(device, cmd_buffer, (uint8_t*)ptr + writeset->dstArrayElement, writeset); + continue; + } + ptr += binding_layout->size * writeset->dstArrayElement / 4; buffer_list += binding_layout->buffer_offset; buffer_list += writeset->dstArrayElement; @@ -1042,7 +1091,12 @@ VkResult radv_CreateDescriptorUpdateTemplate(VkDevice _device, default: break; } - dst_offset = binding_layout->offset / 4 + binding_layout->size * entry->dstArrayElement / 4; + dst_offset = binding_layout->offset / 4; + if (entry->descriptorType 
== VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) + dst_offset += entry->dstArrayElement / 4; + else + dst_offset += binding_layout->size * entry->dstArrayElement / 4; + dst_stride = binding_layout->size / 4; break; } @@ -1092,6 +1146,11 @@ void radv_update_descriptor_set_with_template(struct radv_device *device, const uint8_t *pSrc = ((const uint8_t *) pData) + templ->entry[i].src_offset; uint32_t j; + if (templ->entry[i].descriptor_type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) { + memcpy((uint8_t*)pDst, pSrc, templ->entry[i].descriptor_count); + continue; + } + for (j = 0; j < templ->entry[i].descriptor_count; ++j) { switch (templ->entry[i].descriptor_type) { case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c index 96c6543141e..774ee5b91df 100644 --- a/src/amd/vulkan/radv_device.c +++ b/src/amd/vulkan/radv_device.c @@ -919,6 +919,14 @@ void radv_GetPhysicalDeviceFeatures2( features->shaderSharedInt64Atomics = false; break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_INLINE_UNIFORM_BLOCK_FEATURES_EXT: { + VkPhysicalDeviceInlineUniformBlockFeaturesEXT *features = + (VkPhysicalDeviceInlineUniformBlockFeaturesEXT *)ext; + + features->inlineUniformBlock = true; + features->descriptorBindingInlineUniformBlockUpdateAfterBind = true; + break; + } default: break; } @@ -1213,7 +1221,8 @@ void radv_GetPhysicalDeviceProperties2( properties->robustBufferAccessUpdateAfterBind = false; properties->quadDivergentImplicitLod = false; - size_t max_descriptor_set_size = ((1ull << 31) - 16 * MAX_DYNAMIC_BUFFERS) / + size_t max_descriptor_set_size = ((1ull << 31) - 16 * MAX_DYNAMIC_BUFFERS - + MAX_INLINE_UNIFORM_BLOCK_SIZE * MAX_INLINE_UNIFORM_BLOCK_COUNT) / (32 /* uniform buffer, 32 due to potential space wasted on alignment */ + 32 /* storage buffer, 32 due to potential space wasted on alignment */ + 32 /* sampler, largest when combined with image */ + @@ -1301,6 +1310,17 @@ void 
radv_GetPhysicalDeviceProperties2( properties->transformFeedbackDraw = true; break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_INLINE_UNIFORM_BLOCK_PROPERTIES_EXT: { + VkPhysicalDeviceInlineUniformBlockPropertiesEXT *props = + (VkPhysicalDeviceInlineUniformBlockPropertiesEXT *)ext; + + props->maxInlineUniformBlockSize = MAX_INLINE_UNIFORM_BLOCK_SIZE; + props->maxPerStageDescriptorInlineUniformBlocks = MAX_INLINE_UNIFORM_BLOCK_SIZE * MAX_SETS; + props->maxPerStageDescriptorUpdateAfterBindInlineUniformBlocks = MAX_INLINE_UNIFORM_BLOCK_SIZE * MAX_SETS; + props->maxDescriptorSetInlineUniformBlocks = MAX_INLINE_UNIFORM_BLOCK_COUNT; + props->maxDescriptorSetUpdateAfterBindInlineUniformBlocks = MAX_INLINE_UNIFORM_BLOCK_COUNT; + break; + } default: break; } diff --git a/src/amd/vulkan/radv_extensions.py b/src/amd/vulkan/radv_extensions.py index 4b12ccc47a0..40fc585c503 100644 --- a/src/amd/vulkan/radv_extensions.py +++ b/src/amd/vulkan/radv_extensions.py @@ -112,6 +112,7 @@ EXTENSIONS = [ Extension('VK_EXT_external_memory_host', 1, 'device->rad_info.has_userptr'), Extension('VK_EXT_global_priority', 1, 'device->rad_info.has_ctx_priority'), Extension('VK_EXT_host_query_reset', 1, True), + Extension('VK_EXT_inline_uniform_block', 1, True), Extension('VK_EXT_memory_budget', 1, True), Extension('VK_EXT_memory_priority', 1, True), Extension('VK_EXT_pci_bus_info', 2, True), diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c index 58a3cf18fe1..5bc88298ee6 100644 --- a/src/amd/vulkan/radv_nir_to_llvm.c +++ b/src/amd/vulkan/radv_nir_to_llvm.c @@ -1305,13 +1305,35 @@ radv_load_resource(struct ac_shader_abi *abi, LLVMValueRef index, } else stride = LLVMConstInt(ctx->ac.i32, layout->binding[binding].size, false); - offset = ac_build_imad(&ctx->ac, index, stride, - LLVMConstInt(ctx->ac.i32, base_offset, false)); + offset = LLVMConstInt(ctx->ac.i32, base_offset, false); + + if (layout->binding[binding].type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) 
{ + offset = ac_build_imad(&ctx->ac, index, stride, offset); + } desc_ptr = ac_build_gep0(&ctx->ac, desc_ptr, offset); desc_ptr = ac_cast_ptr(&ctx->ac, desc_ptr, ctx->ac.v4i32); LLVMSetMetadata(desc_ptr, ctx->ac.uniform_md_kind, ctx->ac.empty_md); + if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) { + uint32_t desc_type = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | + S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | + S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); + + LLVMValueRef desc_components[4] = { + LLVMBuildPtrToInt(ctx->ac.builder, desc_ptr, ctx->ac.intptr, ""), + LLVMConstInt(ctx->ac.i32, S_008F04_BASE_ADDRESS_HI(ctx->options->address32_hi), false), + /* High limit to support variable sizes. */ + LLVMConstInt(ctx->ac.i32, 0xffffffff, false), + LLVMConstInt(ctx->ac.i32, desc_type, false), + }; + + return ac_build_gather_values(&ctx->ac, desc_components, 4); + } + return desc_ptr; } @@ -1910,6 +1932,11 @@ static LLVMValueRef radv_load_ubo(struct ac_shader_abi *abi, LLVMValueRef buffer struct radv_shader_context *ctx = radv_shader_context_from_abi(abi); LLVMValueRef result; + if (LLVMGetTypeKind(LLVMTypeOf(buffer_ptr)) != LLVMPointerTypeKind) { + /* Do not load the descriptor for inlined uniform blocks. */ + return buffer_ptr; + } + LLVMSetMetadata(buffer_ptr, ctx->ac.uniform_md_kind, ctx->ac.empty_md); result = LLVMBuildLoad(ctx->ac.builder, buffer_ptr, ""); diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h index 82ab4eff8ca..cd3af7e614d 100644 --- a/src/amd/vulkan/radv_private.h +++ b/src/amd/vulkan/radv_private.h @@ -102,6 +102,8 @@ typedef uint32_t xcb_window_t; #define MAX_SO_STREAMS 4 #define MAX_SO_BUFFERS 4 #define MAX_SO_OUTPUTS 64 +#define MAX_INLINE_UNIFORM_BLOCK_SIZE (4ull * 1024 * 1024) +#define MAX_INLINE_UNIFORM_BLOCK_COUNT 64 #define NUM_DEPTH_CLEAR_PIPELINES 3