From: Jason Ekstrand Date: Sat, 4 Mar 2017 17:23:26 +0000 (-0800) Subject: anv: Use on-the-fly surface states for dynamic buffer descriptors X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=dd4db84640bbb694f180dd50850c3388f67228be;p=mesa.git anv: Use on-the-fly surface states for dynamic buffer descriptors We have a performance problem with dynamic buffer descriptors. Because we are currently implementing them by pushing an offset into the shader and adding that offset onto the already existing offset for the UBO/SSBO operation, all UBO/SSBO operations on dynamic descriptors are indirect. The back-end compiler implements indirect pull constant loads using what basically amounts to a texelFetch instruction. For pull constant loads with constant offsets, however, we use an oword block read message which goes through the constant cache and reads a whole cache line at a time. Because of these two things, direct pull constant loads are much faster than indirect pull constant loads. Because all loads from dynamically bound buffers are indirect, the user takes a substantial performance penalty when using this "performance" feature. There are two potential solutions I have seen for this problem. The alternate solution is to continue pushing offsets into the shader but wire things up in the back-end compiler so that we use the oword block read messages anyway. The only reason we can do this is because we know a priori that the dynamic offsets are uniform and 16-byte aligned. Unfortunately, thanks to the 16-byte alignment requirement of the oword messages, we can't do some general "if the indirect offset is uniform, use an oword message" sort of thing. This solution, however, is recommended for a few reasons: 1. Surface states are relatively cheap. We've been using on-the-fly surface state setup for some time in GL and it works well. 
Also, dynamic offsets with on-the-fly surface state should still be cheaper than allocating new descriptor sets every time you want to change a buffer offset which is really the only requirement of the dynamic offsets feature. 2. This requires substantially less compiler plumbing. Not only can we delete the entire apply_dynamic_offsets pass but we can also avoid having to add architecture for passing dynamic offsets to the back- end compiler in such a way that it can continue using oword messages. 3. We get robust buffer access range-checking for free. Because the offset and range are baked into the surface state, we no longer need to pass ranges around and do bounds-checking in the shader. 4. Once we finally get UBO pushing implemented, it will be much easier to handle pushing chunks of dynamic descriptors if the compiler remains blissfully unaware of dynamic descriptors. This commit improves performance of The Talos Principle on ULTRA settings by around 50% and brings it nicely into line with OpenGL performance. 
Reviewed-by: Lionel Landwerlin --- diff --git a/src/intel/Makefile.sources b/src/intel/Makefile.sources index 13375749ae3..4eaf380492f 100644 --- a/src/intel/Makefile.sources +++ b/src/intel/Makefile.sources @@ -176,7 +176,6 @@ VULKAN_FILES := \ vulkan/anv_image.c \ vulkan/anv_intel.c \ vulkan/anv_nir.h \ - vulkan/anv_nir_apply_dynamic_offsets.c \ vulkan/anv_nir_apply_pipeline_layout.c \ vulkan/anv_nir_lower_input_attachments.c \ vulkan/anv_nir_lower_push_constants.c \ diff --git a/src/intel/vulkan/anv_cmd_buffer.c b/src/intel/vulkan/anv_cmd_buffer.c index cab1dd7305a..a6ad48a8627 100644 --- a/src/intel/vulkan/anv_cmd_buffer.c +++ b/src/intel/vulkan/anv_cmd_buffer.c @@ -507,42 +507,31 @@ void anv_CmdBindDescriptorSets( assert(firstSet + descriptorSetCount < MAX_SETS); + uint32_t dynamic_slot = 0; for (uint32_t i = 0; i < descriptorSetCount; i++) { ANV_FROM_HANDLE(anv_descriptor_set, set, pDescriptorSets[i]); set_layout = layout->set[firstSet + i].layout; - if (cmd_buffer->state.descriptors[firstSet + i] != set) { - cmd_buffer->state.descriptors[firstSet + i] = set; - cmd_buffer->state.descriptors_dirty |= set_layout->shader_stages; - } + cmd_buffer->state.descriptors[firstSet + i] = set; if (set_layout->dynamic_offset_count > 0) { - anv_foreach_stage(s, set_layout->shader_stages) { - anv_cmd_buffer_ensure_push_constant_field(cmd_buffer, s, dynamic); - - struct anv_push_constants *push = - cmd_buffer->state.push_constants[s]; - - unsigned d = layout->set[firstSet + i].dynamic_offset_start; - const uint32_t *offsets = pDynamicOffsets; - struct anv_descriptor *desc = set->descriptors; - - for (unsigned b = 0; b < set_layout->binding_count; b++) { - if (set_layout->binding[b].dynamic_offset_index < 0) - continue; - - unsigned array_size = set_layout->binding[b].array_size; - for (unsigned j = 0; j < array_size; j++) { - push->dynamic[d].offset = *(offsets++); - push->dynamic[d].range = (desc->buffer_view) ? 
- desc->buffer_view->range : 0; - desc++; - d++; - } - } - } - cmd_buffer->state.push_constants_dirty |= set_layout->shader_stages; + uint32_t dynamic_offset_start = + layout->set[firstSet + i].dynamic_offset_start; + + /* Assert that everything is in range */ + assert(dynamic_offset_start + set_layout->dynamic_offset_count <= + ARRAY_SIZE(cmd_buffer->state.dynamic_offsets)); + assert(dynamic_slot + set_layout->dynamic_offset_count <= + dynamicOffsetCount); + + typed_memcpy(&cmd_buffer->state.dynamic_offsets[dynamic_offset_start], + &pDynamicOffsets[dynamic_slot], + set_layout->dynamic_offset_count); + + dynamic_slot += set_layout->dynamic_offset_count; } + + cmd_buffer->state.descriptors_dirty |= set_layout->shader_stages; } } diff --git a/src/intel/vulkan/anv_descriptor_set.c b/src/intel/vulkan/anv_descriptor_set.c index 2a37d7d34ca..175efdbcb91 100644 --- a/src/intel/vulkan/anv_descriptor_set.c +++ b/src/intel/vulkan/anv_descriptor_set.c @@ -662,35 +662,39 @@ anv_descriptor_set_write_buffer(struct anv_descriptor_set *set, assert(type == bind_layout->type); - struct anv_buffer_view *bview = - &set->buffer_views[bind_layout->buffer_index + element]; - - bview->format = anv_isl_format_for_descriptor_type(type); - bview->bo = buffer->bo; - bview->offset = buffer->offset + offset; - - /* For buffers with dynamic offsets, we use the full possible range in the - * surface state and do the actual range-checking in the shader. - */ - if (bind_layout->dynamic_offset_index >= 0) - range = VK_WHOLE_SIZE; - bview->range = anv_buffer_get_range(buffer, offset, range); - - /* If we're writing descriptors through a push command, we need to allocate - * the surface state from the command buffer. Otherwise it will be - * allocated by the descriptor pool when calling - * vkAllocateDescriptorSets. 
*/ - if (alloc_stream) - bview->surface_state = anv_state_stream_alloc(alloc_stream, 64, 64); - - anv_fill_buffer_surface_state(device, bview->surface_state, - bview->format, - bview->offset, bview->range, 1); - - *desc = (struct anv_descriptor) { - .type = type, - .buffer_view = bview, - }; + if (type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC || + type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) { + *desc = (struct anv_descriptor) { + .type = type, + .buffer = buffer, + .offset = offset, + .range = range, + }; + } else { + struct anv_buffer_view *bview = + &set->buffer_views[bind_layout->buffer_index + element]; + + bview->format = anv_isl_format_for_descriptor_type(type); + bview->bo = buffer->bo; + bview->offset = buffer->offset + offset; + bview->range = anv_buffer_get_range(buffer, offset, range); + + /* If we're writing descriptors through a push command, we need to + * allocate the surface state from the command buffer. Otherwise it will + * be allocated by the descriptor pool when calling + * vkAllocateDescriptorSets. 
*/ + if (alloc_stream) + bview->surface_state = anv_state_stream_alloc(alloc_stream, 64, 64); + + anv_fill_buffer_surface_state(device, bview->surface_state, + bview->format, + bview->offset, bview->range, 1); + + *desc = (struct anv_descriptor) { + .type = type, + .buffer_view = bview, + }; + } } void anv_UpdateDescriptorSets( diff --git a/src/intel/vulkan/anv_nir_apply_dynamic_offsets.c b/src/intel/vulkan/anv_nir_apply_dynamic_offsets.c deleted file mode 100644 index 80ef8eef5ed..00000000000 --- a/src/intel/vulkan/anv_nir_apply_dynamic_offsets.c +++ /dev/null @@ -1,172 +0,0 @@ -/* - * Copyright © 2015 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- */ - -#include "anv_nir.h" -#include "nir/nir_builder.h" - -static void -apply_dynamic_offsets_block(nir_block *block, nir_builder *b, - const struct anv_pipeline_layout *layout, - bool add_bounds_checks, - uint32_t indices_start) -{ - struct anv_descriptor_set_layout *set_layout; - - nir_foreach_instr_safe(instr, block) { - if (instr->type != nir_instr_type_intrinsic) - continue; - - nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); - - unsigned block_idx_src; - switch (intrin->intrinsic) { - case nir_intrinsic_load_ubo: - case nir_intrinsic_load_ssbo: - block_idx_src = 0; - break; - case nir_intrinsic_store_ssbo: - block_idx_src = 1; - break; - default: - continue; /* the loop */ - } - - nir_instr *res_instr = intrin->src[block_idx_src].ssa->parent_instr; - assert(res_instr->type == nir_instr_type_intrinsic); - nir_intrinsic_instr *res_intrin = nir_instr_as_intrinsic(res_instr); - assert(res_intrin->intrinsic == nir_intrinsic_vulkan_resource_index); - - unsigned set = res_intrin->const_index[0]; - unsigned binding = res_intrin->const_index[1]; - - set_layout = layout->set[set].layout; - if (set_layout->binding[binding].dynamic_offset_index < 0) - continue; - - b->cursor = nir_before_instr(&intrin->instr); - - /* First, we need to generate the uniform load for the buffer offset */ - uint32_t index = layout->set[set].dynamic_offset_start + - set_layout->binding[binding].dynamic_offset_index; - uint32_t array_size = set_layout->binding[binding].array_size; - - nir_intrinsic_instr *offset_load = - nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_uniform); - offset_load->num_components = 2; - nir_intrinsic_set_base(offset_load, indices_start + index * 8); - nir_intrinsic_set_range(offset_load, array_size * 8); - offset_load->src[0] = nir_src_for_ssa(nir_imul(b, res_intrin->src[0].ssa, - nir_imm_int(b, 8))); - - nir_ssa_dest_init(&offset_load->instr, &offset_load->dest, 2, 32, NULL); - nir_builder_instr_insert(b, &offset_load->instr); - - nir_src 
*offset_src = nir_get_io_offset_src(intrin); - nir_ssa_def *old_offset = nir_ssa_for_src(b, *offset_src, 1); - nir_ssa_def *new_offset = nir_iadd(b, old_offset, &offset_load->dest.ssa); - nir_instr_rewrite_src(&intrin->instr, offset_src, - nir_src_for_ssa(new_offset)); - - if (!add_bounds_checks) - continue; - - /* In order to avoid out-of-bounds access, we predicate */ - nir_ssa_def *pred = nir_uge(b, nir_channel(b, &offset_load->dest.ssa, 1), - old_offset); - nir_if *if_stmt = nir_if_create(b->shader); - if_stmt->condition = nir_src_for_ssa(pred); - nir_cf_node_insert(b->cursor, &if_stmt->cf_node); - - nir_instr_remove(&intrin->instr); - nir_instr_insert_after_cf_list(&if_stmt->then_list, &intrin->instr); - - if (intrin->intrinsic != nir_intrinsic_store_ssbo) { - /* It's a load, we need a phi node */ - nir_phi_instr *phi = nir_phi_instr_create(b->shader); - nir_ssa_dest_init(&phi->instr, &phi->dest, - intrin->num_components, - intrin->dest.ssa.bit_size, NULL); - - nir_phi_src *src1 = ralloc(phi, nir_phi_src); - struct exec_node *tnode = exec_list_get_tail(&if_stmt->then_list); - src1->pred = exec_node_data(nir_block, tnode, cf_node.node); - src1->src = nir_src_for_ssa(&intrin->dest.ssa); - exec_list_push_tail(&phi->srcs, &src1->node); - - b->cursor = nir_after_cf_list(&if_stmt->else_list); - nir_const_value zero_val = { .u32 = { 0, 0, 0, 0 } }; - nir_ssa_def *zero = nir_build_imm(b, intrin->num_components, - intrin->dest.ssa.bit_size, zero_val); - - nir_phi_src *src2 = ralloc(phi, nir_phi_src); - struct exec_node *enode = exec_list_get_tail(&if_stmt->else_list); - src2->pred = exec_node_data(nir_block, enode, cf_node.node); - src2->src = nir_src_for_ssa(zero); - exec_list_push_tail(&phi->srcs, &src2->node); - - assert(intrin->dest.is_ssa); - nir_ssa_def_rewrite_uses(&intrin->dest.ssa, - nir_src_for_ssa(&phi->dest.ssa)); - - nir_instr_insert_after_cf(&if_stmt->cf_node, &phi->instr); - } - } -} - -void -anv_nir_apply_dynamic_offsets(struct anv_pipeline *pipeline, - 
nir_shader *shader, - struct brw_stage_prog_data *prog_data) -{ - const struct anv_pipeline_layout *layout = pipeline->layout; - if (!layout || !layout->stage[shader->stage].has_dynamic_offsets) - return; - - const bool add_bounds_checks = pipeline->device->robust_buffer_access; - - nir_foreach_function(function, shader) { - if (!function->impl) - continue; - - nir_builder builder; - nir_builder_init(&builder, function->impl); - - nir_foreach_block(block, function->impl) { - apply_dynamic_offsets_block(block, &builder, pipeline->layout, - add_bounds_checks, shader->num_uniforms); - } - - nir_metadata_preserve(function->impl, nir_metadata_block_index | - nir_metadata_dominance); - } - - struct anv_push_constants *null_data = NULL; - for (unsigned i = 0; i < MAX_DYNAMIC_BUFFERS; i++) { - prog_data->param[i * 2 + shader->num_uniforms / 4] = - (const union gl_constant_value *)&null_data->dynamic[i].offset; - prog_data->param[i * 2 + 1 + shader->num_uniforms / 4] = - (const union gl_constant_value *)&null_data->dynamic[i].range; - } - - shader->num_uniforms += MAX_DYNAMIC_BUFFERS * 8; -} diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c index 6362c2db6b1..1e8e28dc3d3 100644 --- a/src/intel/vulkan/anv_pipeline.c +++ b/src/intel/vulkan/anv_pipeline.c @@ -352,9 +352,6 @@ anv_pipeline_compile(struct anv_pipeline *pipeline, prog_data->nr_params += MAX_PUSH_CONSTANTS_SIZE / sizeof(float); } - if (pipeline->layout && pipeline->layout->stage[stage].has_dynamic_offsets) - prog_data->nr_params += MAX_DYNAMIC_BUFFERS * 2; - if (nir->info->num_images > 0) { prog_data->nr_params += nir->info->num_images * BRW_IMAGE_PARAM_SIZE; pipeline->needs_data_cache = true; @@ -386,9 +383,6 @@ anv_pipeline_compile(struct anv_pipeline *pipeline, } } - /* Set up dynamic offsets */ - anv_nir_apply_dynamic_offsets(pipeline, nir, prog_data); - /* Apply the actual pipeline layout to UBOs, SSBOs, and textures */ if (pipeline->layout) anv_nir_apply_pipeline_layout(pipeline, 
nir, prog_data, map); diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index 5fb0c264f6f..7682bfcc757 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -901,6 +901,12 @@ struct anv_descriptor { enum isl_aux_usage aux_usage; }; + struct { + struct anv_buffer *buffer; + uint64_t offset; + uint64_t range; + }; + struct anv_buffer_view *buffer_view; }; }; @@ -1172,12 +1178,6 @@ struct anv_push_constants { uint32_t base_vertex; uint32_t base_instance; - /* Offsets and ranges for dynamically bound buffers */ - struct { - uint32_t offset; - uint32_t range; - } dynamic[MAX_DYNAMIC_BUFFERS]; - /* Image data for image_load_store on pre-SKL */ struct brw_image_param images[MAX_IMAGES]; }; @@ -1271,6 +1271,7 @@ struct anv_cmd_state { uint32_t restart_index; struct anv_vertex_binding vertex_bindings[MAX_VBS]; struct anv_descriptor_set * descriptors[MAX_SETS]; + uint32_t dynamic_offsets[MAX_DYNAMIC_BUFFERS]; VkShaderStageFlags push_constant_stages; struct anv_push_constants * push_constants[MESA_SHADER_STAGES]; struct anv_state binding_tables[MESA_SHADER_STAGES]; diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index 9fdc08be506..a12bd67bc87 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -1215,8 +1215,6 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer, case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: - case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: - case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: surface_state = desc->buffer_view->surface_state; assert(surface_state.alloc_size); @@ -1225,6 +1223,34 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer, desc->buffer_view->offset); break; + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: { + uint32_t dynamic_offset_idx = + 
pipeline->layout->set[binding->set].dynamic_offset_start + + set->layout->binding[binding->binding].dynamic_offset_index + + binding->index; + + /* Compute the offset within the buffer */ + uint64_t offset = desc->offset + + cmd_buffer->state.dynamic_offsets[dynamic_offset_idx]; + /* Clamp to the buffer size */ + offset = MIN2(offset, desc->buffer->size); + /* Clamp the range to the buffer size */ + uint32_t range = MIN2(desc->range, desc->buffer->size - offset); + + surface_state = + anv_state_stream_alloc(&cmd_buffer->surface_state_stream, 64, 64); + enum isl_format format = + anv_isl_format_for_descriptor_type(desc->type); + + anv_fill_buffer_surface_state(cmd_buffer->device, surface_state, + format, offset, range, 1); + add_surface_state_reloc(cmd_buffer, surface_state, + desc->buffer->bo, + desc->buffer->offset + offset); + break; + } + case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: surface_state = (binding->write_only) ? desc->buffer_view->writeonly_storage_surface_state