anv: Use on-the-fly surface states for dynamic buffer descriptors
authorJason Ekstrand <jason.ekstrand@intel.com>
Sat, 4 Mar 2017 17:23:26 +0000 (09:23 -0800)
committerJason Ekstrand <jason.ekstrand@intel.com>
Mon, 13 Mar 2017 14:58:00 +0000 (07:58 -0700)
We have a performance problem with dynamic buffer descriptors.  Because
we are currently implementing them by pushing an offset into the shader
and adding that offset onto the already existing offset for the UBO/SSBO
operation, all UBO/SSBO operations on dynamic descriptors are indirect.
The back-end compiler implements indirect pull constant loads using what
basically amounts to a texelFetch instruction.  For pull constant loads
with constant offsets, however, we use an oword block read message which
goes through the constant cache and reads a whole cache line at a time.
Because of these two things, direct pull constant loads are much faster
than indirect pull constant loads.  Because all loads from dynamically
bound buffers are indirect, the user takes a substantial performance
penalty when using this "performance" feature.

There are two potential solutions I have seen for this problem.  The
alternate solution is to continue pushing offsets into the shader but
wire things up in the back-end compiler so that we use the oword block
read messages anyway.  The only reason we can do this is that we know a
priori that the dynamic offsets are uniform and 16-byte aligned.
Unfortunately, thanks to the 16-byte alignment requirement of the oword
messages, we can't do some general "if the indirect offset is uniform,
use an oword message" sort of thing.

This solution, however, is recommended for a few reasons:

 1. Surface states are relatively cheap.  We've been using on-the-fly
    surface state setup for some time in GL and it works well.  Also,
    dynamic offsets with on-the-fly surface state should still be
    cheaper than allocating new descriptor sets every time you want to
    change a buffer offset which is really the only requirement of the
    dynamic offsets feature.

 2. This requires substantially less compiler plumbing.  Not only can we
    delete the entire apply_dynamic_offsets pass but we can also avoid
    having to add architecture for passing dynamic offsets to the back-
    end compiler in such a way that it can continue using oword messages.

 3. We get robust buffer access range-checking for free.  Because the
    offset and range are baked into the surface state, we no longer need
    to pass ranges around and do bounds-checking in the shader.

 4. Once we finally get UBO pushing implemented, it will be much easier
    to handle pushing chunks of dynamic descriptors if the compiler
    remains blissfully unaware of dynamic descriptors.

This commit improves performance of The Talos Principle on ULTRA
settings by around 50% and brings it nicely into line with OpenGL
performance.

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
src/intel/Makefile.sources
src/intel/vulkan/anv_cmd_buffer.c
src/intel/vulkan/anv_descriptor_set.c
src/intel/vulkan/anv_nir_apply_dynamic_offsets.c [deleted file]
src/intel/vulkan/anv_pipeline.c
src/intel/vulkan/anv_private.h
src/intel/vulkan/genX_cmd_buffer.c

index 13375749ae333605247486fb3a176427e5dcfcff..4eaf380492f63ed147e7499a27fe59ebb2344a30 100644 (file)
@@ -176,7 +176,6 @@ VULKAN_FILES := \
        vulkan/anv_image.c \
        vulkan/anv_intel.c \
        vulkan/anv_nir.h \
-       vulkan/anv_nir_apply_dynamic_offsets.c \
        vulkan/anv_nir_apply_pipeline_layout.c \
        vulkan/anv_nir_lower_input_attachments.c \
        vulkan/anv_nir_lower_push_constants.c \
index cab1dd7305a3b2c783a300488840f3cb7dab4682..a6ad48a862713b5fd4e4a44dd12e139f5df3bb6f 100644 (file)
@@ -507,42 +507,31 @@ void anv_CmdBindDescriptorSets(
 
    assert(firstSet + descriptorSetCount < MAX_SETS);
 
+   uint32_t dynamic_slot = 0;
    for (uint32_t i = 0; i < descriptorSetCount; i++) {
       ANV_FROM_HANDLE(anv_descriptor_set, set, pDescriptorSets[i]);
       set_layout = layout->set[firstSet + i].layout;
 
-      if (cmd_buffer->state.descriptors[firstSet + i] != set) {
-         cmd_buffer->state.descriptors[firstSet + i] = set;
-         cmd_buffer->state.descriptors_dirty |= set_layout->shader_stages;
-      }
+      cmd_buffer->state.descriptors[firstSet + i] = set;
 
       if (set_layout->dynamic_offset_count > 0) {
-         anv_foreach_stage(s, set_layout->shader_stages) {
-            anv_cmd_buffer_ensure_push_constant_field(cmd_buffer, s, dynamic);
-
-            struct anv_push_constants *push =
-               cmd_buffer->state.push_constants[s];
-
-            unsigned d = layout->set[firstSet + i].dynamic_offset_start;
-            const uint32_t *offsets = pDynamicOffsets;
-            struct anv_descriptor *desc = set->descriptors;
-
-            for (unsigned b = 0; b < set_layout->binding_count; b++) {
-               if (set_layout->binding[b].dynamic_offset_index < 0)
-                  continue;
-
-               unsigned array_size = set_layout->binding[b].array_size;
-               for (unsigned j = 0; j < array_size; j++) {
-                  push->dynamic[d].offset = *(offsets++);
-                  push->dynamic[d].range = (desc->buffer_view) ?
-                                            desc->buffer_view->range : 0;
-                  desc++;
-                  d++;
-               }
-            }
-         }
-         cmd_buffer->state.push_constants_dirty |= set_layout->shader_stages;
+         uint32_t dynamic_offset_start =
+            layout->set[firstSet + i].dynamic_offset_start;
+
+         /* Assert that everything is in range */
+         assert(dynamic_offset_start + set_layout->dynamic_offset_count <=
+                ARRAY_SIZE(cmd_buffer->state.dynamic_offsets));
+         assert(dynamic_slot + set_layout->dynamic_offset_count <=
+                dynamicOffsetCount);
+
+         typed_memcpy(&cmd_buffer->state.dynamic_offsets[dynamic_offset_start],
+                      &pDynamicOffsets[dynamic_slot],
+                      set_layout->dynamic_offset_count);
+
+         dynamic_slot += set_layout->dynamic_offset_count;
       }
+
+      cmd_buffer->state.descriptors_dirty |= set_layout->shader_stages;
    }
 }
 
index 2a37d7d34ca01101b624ca6f359f80f72105518e..175efdbcb91ed25d2c59f7c393f50131892dc2e2 100644 (file)
@@ -662,35 +662,39 @@ anv_descriptor_set_write_buffer(struct anv_descriptor_set *set,
 
    assert(type == bind_layout->type);
 
-   struct anv_buffer_view *bview =
-      &set->buffer_views[bind_layout->buffer_index + element];
-
-   bview->format = anv_isl_format_for_descriptor_type(type);
-   bview->bo = buffer->bo;
-   bview->offset = buffer->offset + offset;
-
-   /* For buffers with dynamic offsets, we use the full possible range in the
-    * surface state and do the actual range-checking in the shader.
-    */
-   if (bind_layout->dynamic_offset_index >= 0)
-      range = VK_WHOLE_SIZE;
-   bview->range = anv_buffer_get_range(buffer, offset, range);
-
-   /* If we're writing descriptors through a push command, we need to allocate
-    * the surface state from the command buffer. Otherwise it will be
-    * allocated by the descriptor pool when calling
-    * vkAllocateDescriptorSets. */
-   if (alloc_stream)
-      bview->surface_state = anv_state_stream_alloc(alloc_stream, 64, 64);
-
-   anv_fill_buffer_surface_state(device, bview->surface_state,
-                                 bview->format,
-                                 bview->offset, bview->range, 1);
-
-   *desc = (struct anv_descriptor) {
-      .type = type,
-      .buffer_view = bview,
-   };
+   if (type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
+       type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) {
+      *desc = (struct anv_descriptor) {
+         .type = type,
+         .buffer = buffer,
+         .offset = offset,
+         .range = range,
+      };
+   } else {
+      struct anv_buffer_view *bview =
+         &set->buffer_views[bind_layout->buffer_index + element];
+
+      bview->format = anv_isl_format_for_descriptor_type(type);
+      bview->bo = buffer->bo;
+      bview->offset = buffer->offset + offset;
+      bview->range = anv_buffer_get_range(buffer, offset, range);
+
+      /* If we're writing descriptors through a push command, we need to
+       * allocate the surface state from the command buffer. Otherwise it will
+       * be allocated by the descriptor pool when calling
+       * vkAllocateDescriptorSets. */
+      if (alloc_stream)
+         bview->surface_state = anv_state_stream_alloc(alloc_stream, 64, 64);
+
+      anv_fill_buffer_surface_state(device, bview->surface_state,
+                                    bview->format,
+                                    bview->offset, bview->range, 1);
+
+      *desc = (struct anv_descriptor) {
+         .type = type,
+         .buffer_view = bview,
+      };
+   }
 }
 
 void anv_UpdateDescriptorSets(
diff --git a/src/intel/vulkan/anv_nir_apply_dynamic_offsets.c b/src/intel/vulkan/anv_nir_apply_dynamic_offsets.c
deleted file mode 100644 (file)
index 80ef8ee..0000000
+++ /dev/null
@@ -1,172 +0,0 @@
-/*
- * Copyright © 2015 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include "anv_nir.h"
-#include "nir/nir_builder.h"
-
-static void
-apply_dynamic_offsets_block(nir_block *block, nir_builder *b,
-                            const struct anv_pipeline_layout *layout,
-                            bool add_bounds_checks,
-                            uint32_t indices_start)
-{
-   struct anv_descriptor_set_layout *set_layout;
-
-   nir_foreach_instr_safe(instr, block) {
-      if (instr->type != nir_instr_type_intrinsic)
-         continue;
-
-      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
-
-      unsigned block_idx_src;
-      switch (intrin->intrinsic) {
-      case nir_intrinsic_load_ubo:
-      case nir_intrinsic_load_ssbo:
-         block_idx_src = 0;
-         break;
-      case nir_intrinsic_store_ssbo:
-         block_idx_src = 1;
-         break;
-      default:
-         continue; /* the loop */
-      }
-
-      nir_instr *res_instr = intrin->src[block_idx_src].ssa->parent_instr;
-      assert(res_instr->type == nir_instr_type_intrinsic);
-      nir_intrinsic_instr *res_intrin = nir_instr_as_intrinsic(res_instr);
-      assert(res_intrin->intrinsic == nir_intrinsic_vulkan_resource_index);
-
-      unsigned set = res_intrin->const_index[0];
-      unsigned binding = res_intrin->const_index[1];
-
-      set_layout = layout->set[set].layout;
-      if (set_layout->binding[binding].dynamic_offset_index < 0)
-         continue;
-
-      b->cursor = nir_before_instr(&intrin->instr);
-
-      /* First, we need to generate the uniform load for the buffer offset */
-      uint32_t index = layout->set[set].dynamic_offset_start +
-                       set_layout->binding[binding].dynamic_offset_index;
-      uint32_t array_size = set_layout->binding[binding].array_size;
-
-      nir_intrinsic_instr *offset_load =
-         nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_uniform);
-      offset_load->num_components = 2;
-      nir_intrinsic_set_base(offset_load, indices_start + index * 8);
-      nir_intrinsic_set_range(offset_load, array_size * 8);
-      offset_load->src[0] = nir_src_for_ssa(nir_imul(b, res_intrin->src[0].ssa,
-                                                     nir_imm_int(b, 8)));
-
-      nir_ssa_dest_init(&offset_load->instr, &offset_load->dest, 2, 32, NULL);
-      nir_builder_instr_insert(b, &offset_load->instr);
-
-      nir_src *offset_src = nir_get_io_offset_src(intrin);
-      nir_ssa_def *old_offset = nir_ssa_for_src(b, *offset_src, 1);
-      nir_ssa_def *new_offset = nir_iadd(b, old_offset, &offset_load->dest.ssa);
-      nir_instr_rewrite_src(&intrin->instr, offset_src,
-                            nir_src_for_ssa(new_offset));
-
-      if (!add_bounds_checks)
-         continue;
-
-      /* In order to avoid out-of-bounds access, we predicate */
-      nir_ssa_def *pred = nir_uge(b, nir_channel(b, &offset_load->dest.ssa, 1),
-                                  old_offset);
-      nir_if *if_stmt = nir_if_create(b->shader);
-      if_stmt->condition = nir_src_for_ssa(pred);
-      nir_cf_node_insert(b->cursor, &if_stmt->cf_node);
-
-      nir_instr_remove(&intrin->instr);
-      nir_instr_insert_after_cf_list(&if_stmt->then_list, &intrin->instr);
-
-      if (intrin->intrinsic != nir_intrinsic_store_ssbo) {
-         /* It's a load, we need a phi node */
-         nir_phi_instr *phi = nir_phi_instr_create(b->shader);
-         nir_ssa_dest_init(&phi->instr, &phi->dest,
-                           intrin->num_components,
-                           intrin->dest.ssa.bit_size, NULL);
-
-         nir_phi_src *src1 = ralloc(phi, nir_phi_src);
-         struct exec_node *tnode = exec_list_get_tail(&if_stmt->then_list);
-         src1->pred = exec_node_data(nir_block, tnode, cf_node.node);
-         src1->src = nir_src_for_ssa(&intrin->dest.ssa);
-         exec_list_push_tail(&phi->srcs, &src1->node);
-
-         b->cursor = nir_after_cf_list(&if_stmt->else_list);
-         nir_const_value zero_val = { .u32 = { 0, 0, 0, 0 } };
-         nir_ssa_def *zero = nir_build_imm(b, intrin->num_components,
-                                           intrin->dest.ssa.bit_size, zero_val);
-
-         nir_phi_src *src2 = ralloc(phi, nir_phi_src);
-         struct exec_node *enode = exec_list_get_tail(&if_stmt->else_list);
-         src2->pred = exec_node_data(nir_block, enode, cf_node.node);
-         src2->src = nir_src_for_ssa(zero);
-         exec_list_push_tail(&phi->srcs, &src2->node);
-
-         assert(intrin->dest.is_ssa);
-         nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
-                                  nir_src_for_ssa(&phi->dest.ssa));
-
-         nir_instr_insert_after_cf(&if_stmt->cf_node, &phi->instr);
-      }
-   }
-}
-
-void
-anv_nir_apply_dynamic_offsets(struct anv_pipeline *pipeline,
-                              nir_shader *shader,
-                              struct brw_stage_prog_data *prog_data)
-{
-   const struct anv_pipeline_layout *layout = pipeline->layout;
-   if (!layout || !layout->stage[shader->stage].has_dynamic_offsets)
-      return;
-
-   const bool add_bounds_checks = pipeline->device->robust_buffer_access;
-
-   nir_foreach_function(function, shader) {
-      if (!function->impl)
-         continue;
-
-      nir_builder builder;
-      nir_builder_init(&builder, function->impl);
-
-      nir_foreach_block(block, function->impl) {
-         apply_dynamic_offsets_block(block, &builder, pipeline->layout,
-                                     add_bounds_checks, shader->num_uniforms);
-      }
-
-      nir_metadata_preserve(function->impl, nir_metadata_block_index |
-                                            nir_metadata_dominance);
-   }
-
-   struct anv_push_constants *null_data = NULL;
-   for (unsigned i = 0; i < MAX_DYNAMIC_BUFFERS; i++) {
-      prog_data->param[i * 2 + shader->num_uniforms / 4] =
-         (const union gl_constant_value *)&null_data->dynamic[i].offset;
-      prog_data->param[i * 2 + 1 + shader->num_uniforms / 4] =
-         (const union gl_constant_value *)&null_data->dynamic[i].range;
-   }
-
-   shader->num_uniforms += MAX_DYNAMIC_BUFFERS * 8;
-}
index 6362c2db6b15ddb2ab8ca7f88c0982c9e18d2dda..1e8e28dc3d3756aae8be02defed4b4a83337916c 100644 (file)
@@ -352,9 +352,6 @@ anv_pipeline_compile(struct anv_pipeline *pipeline,
       prog_data->nr_params += MAX_PUSH_CONSTANTS_SIZE / sizeof(float);
    }
 
-   if (pipeline->layout && pipeline->layout->stage[stage].has_dynamic_offsets)
-      prog_data->nr_params += MAX_DYNAMIC_BUFFERS * 2;
-
    if (nir->info->num_images > 0) {
       prog_data->nr_params += nir->info->num_images * BRW_IMAGE_PARAM_SIZE;
       pipeline->needs_data_cache = true;
@@ -386,9 +383,6 @@ anv_pipeline_compile(struct anv_pipeline *pipeline,
       }
    }
 
-   /* Set up dynamic offsets */
-   anv_nir_apply_dynamic_offsets(pipeline, nir, prog_data);
-
    /* Apply the actual pipeline layout to UBOs, SSBOs, and textures */
    if (pipeline->layout)
       anv_nir_apply_pipeline_layout(pipeline, nir, prog_data, map);
index 5fb0c264f6f968dfd544c54e9499be068b36c5e1..7682bfcc757791d21877004f47308f18b916b28e 100644 (file)
@@ -901,6 +901,12 @@ struct anv_descriptor {
          enum isl_aux_usage aux_usage;
       };
 
+      struct {
+         struct anv_buffer *buffer;
+         uint64_t offset;
+         uint64_t range;
+      };
+
       struct anv_buffer_view *buffer_view;
    };
 };
@@ -1172,12 +1178,6 @@ struct anv_push_constants {
    uint32_t base_vertex;
    uint32_t base_instance;
 
-   /* Offsets and ranges for dynamically bound buffers */
-   struct {
-      uint32_t offset;
-      uint32_t range;
-   } dynamic[MAX_DYNAMIC_BUFFERS];
-
    /* Image data for image_load_store on pre-SKL */
    struct brw_image_param images[MAX_IMAGES];
 };
@@ -1271,6 +1271,7 @@ struct anv_cmd_state {
    uint32_t                                     restart_index;
    struct anv_vertex_binding                    vertex_bindings[MAX_VBS];
    struct anv_descriptor_set *                  descriptors[MAX_SETS];
+   uint32_t                                     dynamic_offsets[MAX_DYNAMIC_BUFFERS];
    VkShaderStageFlags                           push_constant_stages;
    struct anv_push_constants *                  push_constants[MESA_SHADER_STAGES];
    struct anv_state                             binding_tables[MESA_SHADER_STAGES];
index 9fdc08be506a0681f458c29b6f9f2eae2499db94..a12bd67bc872e4f58a161d10fa656f3f1e13adb2 100644 (file)
@@ -1215,8 +1215,6 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
 
       case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
       case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
-      case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
-      case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
       case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
          surface_state = desc->buffer_view->surface_state;
          assert(surface_state.alloc_size);
@@ -1225,6 +1223,34 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
                                  desc->buffer_view->offset);
          break;
 
+      case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
+      case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: {
+         uint32_t dynamic_offset_idx =
+            pipeline->layout->set[binding->set].dynamic_offset_start +
+            set->layout->binding[binding->binding].dynamic_offset_index +
+            binding->index;
+
+         /* Compute the offset within the buffer */
+         uint64_t offset = desc->offset +
+            cmd_buffer->state.dynamic_offsets[dynamic_offset_idx];
+         /* Clamp to the buffer size */
+         offset = MIN2(offset, desc->buffer->size);
+         /* Clamp the range to the buffer size */
+         uint32_t range = MIN2(desc->range, desc->buffer->size - offset);
+
+         surface_state =
+            anv_state_stream_alloc(&cmd_buffer->surface_state_stream, 64, 64);
+         enum isl_format format =
+            anv_isl_format_for_descriptor_type(desc->type);
+
+         anv_fill_buffer_surface_state(cmd_buffer->device, surface_state,
+                                       format, offset, range, 1);
+         add_surface_state_reloc(cmd_buffer, surface_state,
+                                 desc->buffer->bo,
+                                 desc->buffer->offset + offset);
+         break;
+      }
+
       case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
          surface_state = (binding->write_only)
             ? desc->buffer_view->writeonly_storage_surface_state