anv: Implement SSBOs bindings with GPU addresses in the descriptor BO
author     Jason Ekstrand <jason.ekstrand@intel.com>
           Wed, 9 Jan 2019 22:04:22 +0000 (16:04 -0600)
committer  Jason Ekstrand <jason@jlekstrand.net>
           Fri, 19 Apr 2019 19:56:42 +0000 (19:56 +0000)
This commit adds a new way for ANV to do SSBO bindings: just pass a GPU
address in through the descriptor buffer and use the A64 messages to
access that address directly.  This means that our variable pointers are
now "real" pointers instead of a vec2(BTI, offset) pair (a sketch of the
descriptor record follows the list below).  This carries a few
advantages:

 1. It lets us support a virtually unbounded number of SSBO bindings.

 2. It lets us implement VK_KHR_shader_atomic_int64 which we couldn't
    implement before because those atomic messages are only available
    in the bindless A64 form.

 3. It's way better than messing around with bindless handles for SSBOs,
    which is the only other option for VK_EXT_descriptor_indexing.

 4. It's more future-looking, maybe?  At the very least, this is what
    NVIDIA does (they don't have binding-based SSBOs at all).  That
    doesn't a priori mean it's better; it just means it's probably not
    terrible.
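
As a rough sketch of the mechanism (condensed from the
anv_descriptor_set_write_buffer() and anv_private.h hunks below; the
helper name here is illustrative, not actual driver code), each SSBO
binding now just writes an address/range record into the descriptor BO
whose layout matches nir_address_format_64bit_bounded_global:

   #include <stdint.h>
   #include <string.h>

   /* Mirrors struct anv_address_range_descriptor below: loading this in
    * the shader yields a ready-made (address, range, offset = 0) triple.
    */
   struct ssbo_descriptor {
      uint64_t address;   /* GPU virtual address of the bound buffer */
      uint32_t range;     /* size in bytes of the bound range */
      uint32_t zero;      /* offset component; must start out as zero */
   };

   static void
   write_ssbo_descriptor(void *desc_map, uint64_t gpu_addr, uint32_t range)
   {
      struct ssbo_descriptor desc = {
         .address = gpu_addr,
         .range = range,
      };
      memcpy(desc_map, &desc, sizeof(desc));
   }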

The big disadvantage, of course, is that we have to start doing our own
bounds checking for robustBufferAccess again and that we have to push in
the dynamic offsets ourselves.
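
What that bounds check has to look like, conceptually, is sketched below.
This is illustrative C, not the actual lowering (which emits NIR against
the nir_address_format_64bit_bounded_global vec4); the point is just
that the range stored in the descriptor replaces the clamping the
surface state used to do for us:

   #include <stdint.h>
   #include <string.h>

   static uint32_t
   robust_load_dword(const uint8_t *base, uint32_t range, uint32_t offset)
   {
      /* robustBufferAccess: out-of-bounds reads must return zero */
      if (offset > range || range - offset < sizeof(uint32_t))
         return 0;

      uint32_t val;
      memcpy(&val, base + offset, sizeof(val));
      return val;
   }

The dynamic offsets, for their part, get pushed as uniforms (the new
ANV_PARAM_DYN_OFFSET params) and folded into the base address before any
such check.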

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com>
src/intel/vulkan/anv_cmd_buffer.c
src/intel/vulkan/anv_descriptor_set.c
src/intel/vulkan/anv_device.c
src/intel/vulkan/anv_nir_apply_pipeline_layout.c
src/intel/vulkan/anv_pipeline.c
src/intel/vulkan/anv_private.h

index 1b34644a4343697034a671a02a7883c96b2bdaf6..981c071fc237c6a7ceaf6c5dfd473bbab900bf76 100644 (file)
@@ -594,6 +594,14 @@ anv_cmd_buffer_bind_descriptor_set(struct anv_cmd_buffer *cmd_buffer,
 
          *dynamic_offsets += set_layout->dynamic_offset_count;
          *dynamic_offset_count -= set_layout->dynamic_offset_count;
+
+         if (bind_point == VK_PIPELINE_BIND_POINT_COMPUTE) {
+            cmd_buffer->state.push_constants_dirty |=
+               VK_SHADER_STAGE_COMPUTE_BIT;
+         } else {
+            cmd_buffer->state.push_constants_dirty |=
+               VK_SHADER_STAGE_ALL_GRAPHICS;
+         }
       }
    }
 
@@ -739,7 +747,8 @@ anv_cmd_buffer_merge_dynamic(struct anv_cmd_buffer *cmd_buffer,
 }
 
 static uint32_t
-anv_push_constant_value(struct anv_push_constants *data, uint32_t param)
+anv_push_constant_value(const struct anv_cmd_pipeline_state *state,
+                        const struct anv_push_constants *data, uint32_t param)
 {
    if (BRW_PARAM_IS_BUILTIN(param)) {
       switch (param) {
@@ -754,20 +763,28 @@ anv_push_constant_value(struct anv_push_constants *data, uint32_t param)
       default:
          unreachable("Invalid param builtin");
       }
-   } else {
+   } else if (ANV_PARAM_IS_PUSH(param)) {
       uint32_t offset = ANV_PARAM_PUSH_OFFSET(param);
       assert(offset % sizeof(uint32_t) == 0);
       if (offset < data->size)
          return *(uint32_t *)((uint8_t *)data + offset);
       else
          return 0;
+   } else if (ANV_PARAM_IS_DYN_OFFSET(param)) {
+      unsigned idx = ANV_PARAM_DYN_OFFSET_IDX(param);
+      assert(idx < MAX_DYNAMIC_BUFFERS);
+      return state->dynamic_offsets[idx];
    }
+
+   assert(!"Invalid param");
+   return 0;
 }
 
 struct anv_state
 anv_cmd_buffer_push_constants(struct anv_cmd_buffer *cmd_buffer,
                               gl_shader_stage stage)
 {
+   struct anv_cmd_pipeline_state *pipeline_state = &cmd_buffer->state.gfx.base;
    struct anv_pipeline *pipeline = cmd_buffer->state.gfx.base.pipeline;
 
    /* If we don't have this stage, bail. */
@@ -780,7 +797,7 @@ anv_cmd_buffer_push_constants(struct anv_cmd_buffer *cmd_buffer,
       pipeline->shaders[stage]->prog_data;
 
    /* If we don't actually have any push constants, bail. */
-   if (data == NULL || prog_data == NULL || prog_data->nr_params == 0)
+   if (prog_data == NULL || prog_data->nr_params == 0)
       return (struct anv_state) { .offset = 0 };
 
    struct anv_state state =
@@ -790,8 +807,10 @@ anv_cmd_buffer_push_constants(struct anv_cmd_buffer *cmd_buffer,
 
    /* Walk through the param array and fill the buffer with data */
    uint32_t *u32_map = state.map;
-   for (unsigned i = 0; i < prog_data->nr_params; i++)
-      u32_map[i] = anv_push_constant_value(data, prog_data->param[i]);
+   for (unsigned i = 0; i < prog_data->nr_params; i++) {
+      u32_map[i] = anv_push_constant_value(pipeline_state, data,
+                                           prog_data->param[i]);
+   }
 
    return state;
 }
@@ -799,6 +818,7 @@ anv_cmd_buffer_push_constants(struct anv_cmd_buffer *cmd_buffer,
 struct anv_state
 anv_cmd_buffer_cs_push_constants(struct anv_cmd_buffer *cmd_buffer)
 {
+   struct anv_cmd_pipeline_state *pipeline_state = &cmd_buffer->state.compute.base;
    struct anv_push_constants *data =
       cmd_buffer->state.push_constants[MESA_SHADER_COMPUTE];
    struct anv_pipeline *pipeline = cmd_buffer->state.compute.base.pipeline;
@@ -826,7 +846,8 @@ anv_cmd_buffer_cs_push_constants(struct anv_cmd_buffer *cmd_buffer)
            i < cs_prog_data->push.cross_thread.dwords;
            i++) {
          assert(prog_data->param[i] != BRW_PARAM_BUILTIN_SUBGROUP_ID);
-         u32_map[i] = anv_push_constant_value(data, prog_data->param[i]);
+         u32_map[i] = anv_push_constant_value(pipeline_state, data,
+                                              prog_data->param[i]);
       }
    }
 
@@ -840,8 +861,8 @@ anv_cmd_buffer_cs_push_constants(struct anv_cmd_buffer *cmd_buffer)
             if (prog_data->param[src] == BRW_PARAM_BUILTIN_SUBGROUP_ID) {
                u32_map[dst] = t;
             } else {
-               u32_map[dst] =
-                  anv_push_constant_value(data, prog_data->param[src]);
+               u32_map[dst] = anv_push_constant_value(pipeline_state, data,
+                                                      prog_data->param[src]);
             }
          }
       }
index 90a02997a8dcc00fa15047771e23f8b7a3da199c..85915cfb9d00e7c47b3d33c3258ded62ce88dec8 100644 (file)
@@ -84,6 +84,14 @@ anv_descriptor_data_for_type(const struct anv_physical_device *device,
       unreachable("Unsupported descriptor type");
    }
 
+   /* On gen8 and above when we have softpin enabled, we also need to push
+    * SSBO address ranges so that we can use A64 messages in the shader.
+    */
+   if (device->has_a64_buffer_access &&
+       (type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER ||
+        type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC))
+      data |= ANV_DESCRIPTOR_ADDRESS_RANGE;
+
    return data;
 }
 
@@ -95,6 +103,9 @@ anv_descriptor_data_size(enum anv_descriptor_data data)
    if (data & ANV_DESCRIPTOR_IMAGE_PARAM)
       size += BRW_IMAGE_PARAM_SIZE * 4;
 
+   if (data & ANV_DESCRIPTOR_ADDRESS_RANGE)
+      size += sizeof(struct anv_address_range_descriptor);
+
    return size;
 }
 
@@ -130,6 +141,11 @@ anv_descriptor_data_supports_bindless(const struct anv_physical_device *pdevice,
                                       enum anv_descriptor_data data,
                                       bool sampler)
 {
+   if (data & ANV_DESCRIPTOR_ADDRESS_RANGE) {
+      assert(pdevice->has_a64_buffer_access);
+      return true;
+   }
+
    return false;
 }
 
@@ -1077,6 +1093,9 @@ anv_descriptor_set_write_buffer(struct anv_device *device,
 
    assert(type == bind_layout->type);
 
+   struct anv_address bind_addr = anv_address_add(buffer->address, offset);
+   uint64_t bind_range = anv_buffer_get_range(buffer, offset, range);
+
    if (type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
        type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) {
       *desc = (struct anv_descriptor) {
@@ -1091,8 +1110,8 @@ anv_descriptor_set_write_buffer(struct anv_device *device,
          &set->buffer_views[bind_layout->buffer_view_index + element];
 
       bview->format = anv_isl_format_for_descriptor_type(type);
-      bview->range = anv_buffer_get_range(buffer, offset, range);
-      bview->address = anv_address_add(buffer->address, offset);
+      bview->range = bind_range;
+      bview->address = bind_addr;
 
       /* If we're writing descriptors through a push command, we need to
        * allocate the surface state from the command buffer. Otherwise it will
@@ -1102,14 +1121,24 @@ anv_descriptor_set_write_buffer(struct anv_device *device,
          bview->surface_state = anv_state_stream_alloc(alloc_stream, 64, 64);
 
       anv_fill_buffer_surface_state(device, bview->surface_state,
-                                    bview->format,
-                                    bview->address, bview->range, 1);
+                                    bview->format, bind_addr, bind_range, 1);
 
       *desc = (struct anv_descriptor) {
          .type = type,
          .buffer_view = bview,
       };
    }
+
+   void *desc_map = set->desc_mem.map + bind_layout->descriptor_offset +
+                    element * anv_descriptor_size(bind_layout);
+
+   if (bind_layout->data & ANV_DESCRIPTOR_ADDRESS_RANGE) {
+      struct anv_address_range_descriptor desc = {
+         .address = anv_address_physical(bind_addr),
+         .range = bind_range,
+      };
+      memcpy(desc_map, &desc, sizeof(desc));
+   }
 }
 
 void
index 8c60b917050c111f3f2e6c73ae5a6d48cf73b206..de56926d9356522b342c069390b73c4e01ec2e0b 100644 (file)
@@ -278,6 +278,8 @@ anv_physical_device_init_uuids(struct anv_physical_device *device)
                      sizeof(device->chipset_id));
    _mesa_sha1_update(&sha1_ctx, &device->always_use_bindless,
                      sizeof(device->always_use_bindless));
+   _mesa_sha1_update(&sha1_ctx, &device->has_a64_buffer_access,
+                     sizeof(device->has_a64_buffer_access));
    _mesa_sha1_final(&sha1_ctx, sha1);
    memcpy(device->pipeline_cache_uuid, sha1, VK_UUID_SIZE);
 
@@ -1103,9 +1105,15 @@ void anv_GetPhysicalDeviceProperties(
    const uint32_t max_raw_buffer_sz = devinfo->gen >= 7 ?
                                       (1ul << 30) : (1ul << 27);
 
+   const uint32_t max_ssbos = pdevice->has_a64_buffer_access ? UINT16_MAX : 64;
    const uint32_t max_samplers = (devinfo->gen >= 8 || devinfo->is_haswell) ?
                                  128 : 16;
 
+   /* The moment we have anything bindless, claim a high per-stage limit */
+   const uint32_t max_per_stage =
+      pdevice->has_a64_buffer_access ? UINT32_MAX :
+                                       MAX_BINDING_TABLE_SIZE - MAX_RTS;
+
    VkSampleCountFlags sample_counts =
       isl_device_get_sample_counts(&pdevice->isl_dev);
 
@@ -1127,15 +1135,15 @@ void anv_GetPhysicalDeviceProperties(
       .maxBoundDescriptorSets                   = MAX_SETS,
       .maxPerStageDescriptorSamplers            = max_samplers,
       .maxPerStageDescriptorUniformBuffers      = 64,
-      .maxPerStageDescriptorStorageBuffers      = 64,
+      .maxPerStageDescriptorStorageBuffers      = max_ssbos,
       .maxPerStageDescriptorSampledImages       = max_samplers,
       .maxPerStageDescriptorStorageImages       = MAX_IMAGES,
       .maxPerStageDescriptorInputAttachments    = 64,
-      .maxPerStageResources                     = MAX_BINDING_TABLE_SIZE - MAX_RTS,
+      .maxPerStageResources                     = max_per_stage,
       .maxDescriptorSetSamplers                 = 6 * max_samplers, /* number of stages * maxPerStageDescriptorSamplers */
       .maxDescriptorSetUniformBuffers           = 6 * 64,           /* number of stages * maxPerStageDescriptorUniformBuffers */
       .maxDescriptorSetUniformBuffersDynamic    = MAX_DYNAMIC_BUFFERS / 2,
-      .maxDescriptorSetStorageBuffers           = 6 * 64,           /* number of stages * maxPerStageDescriptorStorageBuffers */
+      .maxDescriptorSetStorageBuffers           = 6 * max_ssbos,    /* number of stages * maxPerStageDescriptorStorageBuffers */
       .maxDescriptorSetStorageBuffersDynamic    = MAX_DYNAMIC_BUFFERS / 2,
       .maxDescriptorSetSampledImages            = 6 * max_samplers, /* number of stages * maxPerStageDescriptorSampledImages */
       .maxDescriptorSetStorageImages            = 6 * MAX_IMAGES,   /* number of stages * maxPerStageDescriptorStorageImages */
index 7abc27be1032b69b4c15dcc1558f05c472435763..356a56e47bbd46d228ec6a1e0b2ae5d78f1697a9 100644 (file)
@@ -45,6 +45,8 @@ struct apply_pipeline_layout_state {
    /* Place to flag lowered instructions so we don't lower them twice */
    struct set *lowered_instrs;
 
+   int dynamic_offset_uniform_start;
+
    bool uses_constants;
    uint8_t constants_offset;
    struct {
@@ -159,7 +161,12 @@ find_descriptor_for_index_src(nir_src src,
    if (!intrin || intrin->intrinsic != nir_intrinsic_vulkan_resource_index)
       return false;
 
-   return true;
+   uint32_t set = nir_intrinsic_desc_set(intrin);
+   uint32_t binding = nir_intrinsic_binding(intrin);
+   uint32_t surface_index = state->set[set].surface_offsets[binding];
+
+   /* Only lower to a BTI message if we have a valid binding table index. */
+   return surface_index < MAX_BINDING_TABLE_SIZE;
 }
 
 static bool
@@ -327,6 +334,7 @@ lower_res_index_intrinsic(nir_intrinsic_instr *intrin,
 
    uint32_t set = nir_intrinsic_desc_set(intrin);
    uint32_t binding = nir_intrinsic_binding(intrin);
+   const VkDescriptorType desc_type = nir_intrinsic_desc_type(intrin);
 
    const struct anv_descriptor_set_binding_layout *bind_layout =
       &state->layout->set[set].layout->binding[binding];
@@ -339,14 +347,55 @@ lower_res_index_intrinsic(nir_intrinsic_instr *intrin,
       array_index = nir_umin(b, array_index, nir_imm_int(b, array_size - 1));
 
    nir_ssa_def *index;
-   if (bind_layout->data & ANV_DESCRIPTOR_INLINE_UNIFORM) {
+   if (state->pdevice->has_a64_buffer_access &&
+       (desc_type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER ||
+        desc_type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC)) {
+      /* We store the descriptor offset as 16.8.8 where the top 16 bits are
+       * the offset into the descriptor set, the next 8 are the binding table
+       * index of the descriptor buffer, and the bottom 8 bits are the offset
+       * (in bytes) into the dynamic offset table.
+       */
+      assert(bind_layout->dynamic_offset_index < MAX_DYNAMIC_BUFFERS);
+      uint32_t dynamic_offset_index = 0xff; /* No dynamic offset */
+      if (bind_layout->dynamic_offset_index >= 0) {
+         dynamic_offset_index =
+            state->layout->set[set].dynamic_offset_start +
+            bind_layout->dynamic_offset_index;
+      }
+
+      const uint32_t desc_offset =
+         bind_layout->descriptor_offset << 16 |
+         (uint32_t)state->set[set].desc_offset << 8 |
+         dynamic_offset_index;
+
+      if (state->add_bounds_checks) {
+         /* We're using nir_address_format_64bit_bounded_global */
+         assert(intrin->dest.ssa.num_components == 4);
+         assert(intrin->dest.ssa.bit_size == 32);
+         index = nir_vec4(b, nir_imm_int(b, desc_offset),
+                             nir_ssa_for_src(b, intrin->src[0], 1),
+                             nir_imm_int(b, array_size - 1),
+                             nir_ssa_undef(b, 1, 32));
+      } else {
+         /* We're using nir_address_format_64bit_global */
+         assert(intrin->dest.ssa.num_components == 1);
+         assert(intrin->dest.ssa.bit_size == 64);
+         index = nir_pack_64_2x32_split(b, nir_imm_int(b, desc_offset),
+                                           nir_ssa_for_src(b, intrin->src[0], 1));
+      }
+   } else if (bind_layout->data & ANV_DESCRIPTOR_INLINE_UNIFORM) {
       /* This is an inline uniform block.  Just reference the descriptor set
-       * and use the descriptor offset as the base.
+       * and use the descriptor offset as the base.  Inline uniforms always
+       * use  nir_address_format_32bit_index_offset
        */
+      assert(intrin->dest.ssa.num_components == 2);
+      assert(intrin->dest.ssa.bit_size == 32);
       index = nir_imm_ivec2(b, state->set[set].desc_offset,
                                bind_layout->descriptor_offset);
    } else {
       /* We're using nir_address_format_32bit_index_offset */
+      assert(intrin->dest.ssa.num_components == 2);
+      assert(intrin->dest.ssa.bit_size == 32);
       index = nir_vec2(b, nir_iadd_imm(b, array_index, surface_index),
                           nir_imm_int(b, 0));
    }
@@ -364,6 +413,8 @@ lower_res_reindex_intrinsic(nir_intrinsic_instr *intrin,
 
    b->cursor = nir_before_instr(&intrin->instr);
 
+   const VkDescriptorType desc_type = nir_intrinsic_desc_type(intrin);
+
    /* For us, the resource indices are just indices into the binding table and
     * array elements are sequential.  A resource_reindex just turns into an
     * add of the two indices.
@@ -372,15 +423,81 @@ lower_res_reindex_intrinsic(nir_intrinsic_instr *intrin,
    nir_ssa_def *old_index = intrin->src[0].ssa;
    nir_ssa_def *offset = intrin->src[1].ssa;
 
-   nir_ssa_def *new_index =
-      nir_vec2(b, nir_iadd(b, nir_channel(b, old_index, 0), offset),
-                  nir_channel(b, old_index, 1));
+   nir_ssa_def *new_index;
+   if (state->pdevice->has_a64_buffer_access &&
+       (desc_type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER ||
+        desc_type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC)) {
+      if (state->add_bounds_checks) {
+         /* We're using nir_address_format_64bit_bounded_global */
+         assert(intrin->dest.ssa.num_components == 4);
+         assert(intrin->dest.ssa.bit_size == 32);
+         new_index = nir_vec4(b, nir_channel(b, old_index, 0),
+                                 nir_iadd(b, nir_channel(b, old_index, 1),
+                                             offset),
+                                 nir_channel(b, old_index, 2),
+                                 nir_ssa_undef(b, 1, 32));
+      } else {
+         /* We're using nir_address_format_64bit_global */
+         assert(intrin->dest.ssa.num_components == 1);
+         assert(intrin->dest.ssa.bit_size == 64);
+         nir_ssa_def *base = nir_unpack_64_2x32_split_x(b, old_index);
+         nir_ssa_def *arr_idx = nir_unpack_64_2x32_split_y(b, old_index);
+         new_index = nir_pack_64_2x32_split(b, base, nir_iadd(b, arr_idx, offset));
+      }
+   } else {
+      /* We're using nir_address_format_32bit_index_offset */
+      assert(intrin->dest.ssa.num_components == 2);
+      assert(intrin->dest.ssa.bit_size == 32);
+      new_index = nir_vec2(b, nir_iadd(b, nir_channel(b, old_index, 0), offset),
+                              nir_channel(b, old_index, 1));
+   }
 
    assert(intrin->dest.is_ssa);
    nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(new_index));
    nir_instr_remove(&intrin->instr);
 }
 
+static nir_ssa_def *
+build_ssbo_descriptor_load(const VkDescriptorType desc_type,
+                           nir_ssa_def *index,
+                           struct apply_pipeline_layout_state *state)
+{
+   nir_builder *b = &state->builder;
+
+   nir_ssa_def *desc_offset, *array_index;
+   if (state->add_bounds_checks) {
+      /* We're using nir_address_format_64bit_bounded_global */
+      desc_offset = nir_channel(b, index, 0);
+      array_index = nir_umin(b, nir_channel(b, index, 1),
+                                nir_channel(b, index, 2));
+   } else {
+      desc_offset = nir_unpack_64_2x32_split_x(b, index);
+      array_index = nir_unpack_64_2x32_split_y(b, index);
+   }
+
+   /* The desc_offset is actually 16.8.8 */
+   nir_ssa_def *desc_buffer_index =
+      nir_extract_u8(b, desc_offset, nir_imm_int(b, 1));
+   nir_ssa_def *desc_offset_base =
+      nir_extract_u16(b, desc_offset, nir_imm_int(b, 1));
+
+   /* Compute the actual descriptor offset */
+   const unsigned descriptor_size =
+      anv_descriptor_type_size(state->pdevice, desc_type);
+   desc_offset = nir_iadd(b, desc_offset_base,
+                             nir_imul_imm(b, array_index, descriptor_size));
+
+   nir_intrinsic_instr *desc_load =
+      nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_ubo);
+   desc_load->src[0] = nir_src_for_ssa(desc_buffer_index);
+   desc_load->src[1] = nir_src_for_ssa(desc_offset);
+   desc_load->num_components = 4;
+   nir_ssa_dest_init(&desc_load->instr, &desc_load->dest, 4, 32, NULL);
+   nir_builder_instr_insert(b, &desc_load->instr);
+
+   return &desc_load->dest.ssa;
+}
+
 static void
 lower_load_vulkan_descriptor(nir_intrinsic_instr *intrin,
                              struct apply_pipeline_layout_state *state)
@@ -389,12 +506,84 @@ lower_load_vulkan_descriptor(nir_intrinsic_instr *intrin,
 
    b->cursor = nir_before_instr(&intrin->instr);
 
-   /* We follow the nir_address_format_32bit_index_offset model */
+   const VkDescriptorType desc_type = nir_intrinsic_desc_type(intrin);
+
    assert(intrin->src[0].is_ssa);
    nir_ssa_def *index = intrin->src[0].ssa;
 
+   nir_ssa_def *desc;
+   if (state->pdevice->has_a64_buffer_access &&
+       (desc_type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER ||
+        desc_type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC)) {
+      desc = build_ssbo_descriptor_load(desc_type, index, state);
+
+      /* We want nir_address_format_64bit_global */
+      if (!state->add_bounds_checks)
+         desc = nir_pack_64_2x32(b, nir_channels(b, desc, 0x3));
+
+      if (state->dynamic_offset_uniform_start >= 0) {
+         /* This shader has dynamic offsets and we have no way of knowing
+          * (save from the dynamic offset base index) if this buffer has a
+          * dynamic offset.
+          */
+         nir_ssa_def *desc_offset, *array_index;
+         if (state->add_bounds_checks) {
+            /* We're using nir_address_format_64bit_bounded_global */
+            desc_offset = nir_channel(b, index, 0);
+            array_index = nir_umin(b, nir_channel(b, index, 1),
+                                      nir_channel(b, index, 2));
+         } else {
+            desc_offset = nir_unpack_64_2x32_split_x(b, index);
+            array_index = nir_unpack_64_2x32_split_y(b, index);
+         }
+
+         nir_ssa_def *dyn_offset_base =
+            nir_extract_u8(b, desc_offset, nir_imm_int(b, 0));
+         nir_ssa_def *dyn_offset_idx =
+            nir_iadd(b, dyn_offset_base, array_index);
+         if (state->add_bounds_checks) {
+            dyn_offset_idx = nir_umin(b, dyn_offset_idx,
+                                         nir_imm_int(b, MAX_DYNAMIC_BUFFERS));
+         }
+
+         nir_intrinsic_instr *dyn_load =
+            nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_uniform);
+         nir_intrinsic_set_base(dyn_load, state->dynamic_offset_uniform_start);
+         nir_intrinsic_set_range(dyn_load, MAX_DYNAMIC_BUFFERS * 4);
+         dyn_load->src[0] = nir_src_for_ssa(nir_imul_imm(b, dyn_offset_idx, 4));
+         dyn_load->num_components = 1;
+         nir_ssa_dest_init(&dyn_load->instr, &dyn_load->dest, 1, 32, NULL);
+         nir_builder_instr_insert(b, &dyn_load->instr);
+
+         nir_ssa_def *dynamic_offset =
+            nir_bcsel(b, nir_ieq(b, dyn_offset_base, nir_imm_int(b, 0xff)),
+                         nir_imm_int(b, 0), &dyn_load->dest.ssa);
+
+         if (state->add_bounds_checks) {
+            /* The dynamic offset gets added to the base pointer so that we
+             * have a sliding window range.
+             *
+             * We're using nir_address_format_64bit_bounded_global.
+             */
+            nir_ssa_def *base_ptr =
+               nir_pack_64_2x32(b, nir_channels(b, desc, 0x3));
+            base_ptr = nir_iadd(b, base_ptr, nir_u2u64(b, dynamic_offset));
+            desc = nir_vec4(b, nir_unpack_64_2x32_split_x(b, base_ptr),
+                               nir_unpack_64_2x32_split_y(b, base_ptr),
+                               nir_channel(b, desc, 2),
+                               nir_channel(b, desc, 3));
+         } else {
+            /* We're using nir_address_format_64bit_global */
+            desc = nir_iadd(b, desc, nir_u2u64(b, dynamic_offset));
+         }
+      }
+   } else {
+      /* We follow the nir_address_format_32bit_index_offset model */
+      desc = index;
+   }
+
    assert(intrin->dest.is_ssa);
-   nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(index));
+   nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(desc));
    nir_instr_remove(&intrin->instr);
 }
 
@@ -409,15 +598,24 @@ lower_get_buffer_size(nir_intrinsic_instr *intrin,
 
    b->cursor = nir_before_instr(&intrin->instr);
 
+   const VkDescriptorType desc_type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
+
    assert(intrin->src[0].is_ssa);
    nir_ssa_def *index = intrin->src[0].ssa;
 
-   /* We're following the nir_address_format_32bit_index_offset model so the
-    * binding table index is the first component of the address.  The
-    * back-end wants a scalar binding table index source.
-    */
-   nir_instr_rewrite_src(&intrin->instr, &intrin->src[0],
-                         nir_src_for_ssa(nir_channel(b, index, 0)));
+   if (state->pdevice->has_a64_buffer_access) {
+      nir_ssa_def *desc = build_ssbo_descriptor_load(desc_type, index, state);
+      nir_ssa_def *size = nir_channel(b, desc, 2);
+      nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(size));
+      nir_instr_remove(&intrin->instr);
+   } else {
+      /* We're following the nir_address_format_32bit_index_offset model so
+       * the binding table index is the first component of the address.  The
+       * back-end wants a scalar binding table index source.
+       */
+      nir_instr_rewrite_src(&intrin->instr, &intrin->src[0],
+                            nir_src_for_ssa(nir_channel(b, index, 0)));
+   }
 }
 
 static nir_ssa_def *
@@ -724,6 +922,7 @@ anv_nir_apply_pipeline_layout(const struct anv_physical_device *pdevice,
       .layout = layout,
       .add_bounds_checks = robust_buffer_access,
       .lowered_instrs = _mesa_pointer_set_create(mem_ctx),
+      .dynamic_offset_uniform_start = -1,
    };
 
    for (unsigned s = 0; s < layout->num_sets; s++) {
@@ -813,11 +1012,16 @@ anv_nir_apply_pipeline_layout(const struct anv_physical_device *pdevice,
    qsort(infos, used_binding_count, sizeof(struct binding_info),
          compare_binding_infos);
 
+   bool have_dynamic_buffers = false;
+
    for (unsigned i = 0; i < used_binding_count; i++) {
       unsigned set = infos[i].set, b = infos[i].binding;
       struct anv_descriptor_set_binding_layout *binding =
             &layout->set[set].layout->binding[b];
 
+      if (binding->dynamic_offset_index >= 0)
+         have_dynamic_buffers = true;
+
       const uint32_t array_size = binding->array_size;
 
       if (binding->data & ANV_DESCRIPTOR_SURFACE_STATE) {
@@ -874,6 +1078,16 @@ anv_nir_apply_pipeline_layout(const struct anv_physical_device *pdevice,
       }
    }
 
+   if (have_dynamic_buffers) {
+      state.dynamic_offset_uniform_start = shader->num_uniforms;
+      uint32_t *param = brw_stage_prog_data_add_params(prog_data,
+                                                       MAX_DYNAMIC_BUFFERS);
+      for (unsigned i = 0; i < MAX_DYNAMIC_BUFFERS; i++)
+         param[i] = ANV_PARAM_DYN_OFFSET(i);
+      shader->num_uniforms += MAX_DYNAMIC_BUFFERS * 4;
+      assert(shader->num_uniforms == prog_data->nr_params * 4);
+   }
+
    nir_foreach_variable(var, &shader->uniforms) {
       const struct glsl_type *glsl_type = glsl_without_array(var->type);
 
index 2dd60f2dd2c6623d3b9ab8f06a387623595e1ef1..b0ed21873769205e4c19e1f1cd8985717c46a648 100644 (file)
@@ -166,12 +166,20 @@ anv_shader_compile_to_nir(struct anv_device *device,
          .variable_pointers = true,
       },
       .ubo_ptr_type = glsl_vector_type(GLSL_TYPE_UINT, 2),
-      .ssbo_ptr_type = glsl_vector_type(GLSL_TYPE_UINT, 2),
       .phys_ssbo_ptr_type = glsl_vector_type(GLSL_TYPE_UINT64, 1),
       .push_const_ptr_type = glsl_uint_type(),
       .shared_ptr_type = glsl_uint_type(),
    };
 
+   if (pdevice->has_a64_buffer_access) {
+      if (device->robust_buffer_access)
+         spirv_options.ssbo_ptr_type = glsl_vector_type(GLSL_TYPE_UINT, 4);
+      else
+         spirv_options.ssbo_ptr_type = glsl_vector_type(GLSL_TYPE_UINT64, 1);
+   } else {
+      spirv_options.ssbo_ptr_type = glsl_vector_type(GLSL_TYPE_UINT, 2);
+   }
+
    nir_function *entry_point =
       spirv_to_nir(spirv, module->size / 4,
                    spec_entries, num_spec_entries,
@@ -553,8 +561,9 @@ anv_pipeline_lower_nir(struct anv_pipeline *pipeline,
                        struct anv_pipeline_stage *stage,
                        struct anv_pipeline_layout *layout)
 {
-   const struct brw_compiler *compiler =
-      pipeline->device->instance->physicalDevice.compiler;
+   const struct anv_physical_device *pdevice =
+      &pipeline->device->instance->physicalDevice;
+   const struct brw_compiler *compiler = pdevice->compiler;
 
    struct brw_stage_prog_data *prog_data = &stage->prog_data.base;
    nir_shader *nir = stage->nir;
@@ -607,15 +616,26 @@ anv_pipeline_lower_nir(struct anv_pipeline *pipeline,
 
    /* Apply the actual pipeline layout to UBOs, SSBOs, and textures */
    if (layout) {
-      anv_nir_apply_pipeline_layout(&pipeline->device->instance->physicalDevice,
+      anv_nir_apply_pipeline_layout(pdevice,
                                     pipeline->device->robust_buffer_access,
                                     layout, nir, prog_data,
                                     &stage->bind_map);
 
-      NIR_PASS_V(nir, nir_lower_explicit_io,
-                 nir_var_mem_ubo | nir_var_mem_ssbo,
+      NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_ubo,
                  nir_address_format_32bit_index_offset);
 
+      nir_address_format ssbo_address_format;
+      if (pdevice->has_a64_buffer_access) {
+         if (pipeline->device->robust_buffer_access)
+            ssbo_address_format = nir_address_format_64bit_bounded_global;
+         else
+            ssbo_address_format = nir_address_format_64bit_global;
+      } else {
+         ssbo_address_format = nir_address_format_32bit_index_offset;
+      }
+      NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_ssbo,
+                 ssbo_address_format);
+
       NIR_PASS_V(nir, nir_opt_constant_folding);
    }
 
index ac63ab8b3befac408ae3cf3ec0381fe1be0504b0..9c747fa019c52ca5d21d4f017ce33d2da2f8fbd0 100644 (file)
@@ -1521,6 +1521,19 @@ struct anv_vue_header {
    float PointWidth;
 };
 
+/** Struct representing a address/range descriptor
+ *
+ * The fields of this struct correspond directly to the data layout of
+ * nir_address_format_64bit_bounded_global addresses.  The last field is the
+ * offset in the NIR address so it must be zero so that when you load the
+ * descriptor you get a pointer to the start of the range.
+ */
+struct anv_address_range_descriptor {
+   uint64_t address;
+   uint32_t range;
+   uint32_t zero;
+};
+
 enum anv_descriptor_data {
    /** The descriptor contains a BTI reference to a surface state */
    ANV_DESCRIPTOR_SURFACE_STATE  = (1 << 0),
@@ -1532,6 +1545,8 @@ enum anv_descriptor_data {
    ANV_DESCRIPTOR_IMAGE_PARAM    = (1 << 3),
    /** The descriptor contains auxiliary image layout data */
    ANV_DESCRIPTOR_INLINE_UNIFORM = (1 << 4),
+   /** anv_address_range_descriptor with a buffer address and range */
+   ANV_DESCRIPTOR_ADDRESS_RANGE  = (1 << 5),
 };
 
 struct anv_descriptor_set_binding_layout {
@@ -2086,8 +2101,13 @@ struct anv_xfb_binding {
 };
 
 #define ANV_PARAM_PUSH(offset)         ((1 << 16) | (uint32_t)(offset))
+#define ANV_PARAM_IS_PUSH(param)       ((uint32_t)(param) >> 16 == 1)
 #define ANV_PARAM_PUSH_OFFSET(param)   ((param) & 0xffff)
 
+#define ANV_PARAM_DYN_OFFSET(offset)      ((2 << 16) | (uint32_t)(offset))
+#define ANV_PARAM_IS_DYN_OFFSET(param)    ((uint32_t)(param) >> 16 == 2)
+#define ANV_PARAM_DYN_OFFSET_IDX(param)   ((param) & 0xffff)
+
 struct anv_push_constants {
    /* Current allocated size of this push constants data structure.
     * Because a decent chunk of it may not be used (images on SKL, for