anv: Use bindless textures and samplers
author Jason Ekstrand <jason.ekstrand@intel.com>
Thu, 7 Feb 2019 20:10:33 +0000 (14:10 -0600)
committer Jason Ekstrand <jason@jlekstrand.net>
Fri, 19 Apr 2019 19:56:42 +0000 (19:56 +0000)
This commit changes anv to put bindless handles and sampler pointers
into the descriptor buffer and use those instead of bindful when we run
out of binding table space.  This "spilling" of descriptors allows us to
advertise an almost unbounded number of images and samplers.

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com>
src/intel/vulkan/anv_descriptor_set.c
src/intel/vulkan/anv_device.c
src/intel/vulkan/anv_nir_apply_pipeline_layout.c
src/intel/vulkan/anv_private.h
src/intel/vulkan/genX_cmd_buffer.c
src/intel/vulkan/genX_state.c

index c7a901275114a367a8e9456bbc7544b388dd9ab4..6db6021822ad480a40370118886f6e2ed2d5eb58 100644 (file)
@@ -45,15 +45,24 @@ anv_descriptor_data_for_type(const struct anv_physical_device *device,
    switch (type) {
    case VK_DESCRIPTOR_TYPE_SAMPLER:
       data = ANV_DESCRIPTOR_SAMPLER_STATE;
+      if (device->has_bindless_samplers)
+         data |= ANV_DESCRIPTOR_SAMPLED_IMAGE;
       break;
 
    case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
       data = ANV_DESCRIPTOR_SURFACE_STATE |
              ANV_DESCRIPTOR_SAMPLER_STATE;
+      if (device->has_bindless_images || device->has_bindless_samplers)
+         data |= ANV_DESCRIPTOR_SAMPLED_IMAGE;
       break;
 
    case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
    case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
+      data = ANV_DESCRIPTOR_SURFACE_STATE;
+      if (device->has_bindless_images)
+         data |= ANV_DESCRIPTOR_SAMPLED_IMAGE;
+      break;
+
    case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
       data = ANV_DESCRIPTOR_SURFACE_STATE;
       break;
@@ -100,6 +109,9 @@ anv_descriptor_data_size(enum anv_descriptor_data data)
 {
    unsigned size = 0;
 
+   if (data & ANV_DESCRIPTOR_SAMPLED_IMAGE)
+      size += sizeof(struct anv_sampled_image_descriptor);
+
    if (data & ANV_DESCRIPTOR_IMAGE_PARAM)
       size += BRW_IMAGE_PARAM_SIZE * 4;
 
@@ -118,7 +130,17 @@ anv_descriptor_size(const struct anv_descriptor_set_binding_layout *layout)
       return layout->array_size;
    }
 
-   return anv_descriptor_data_size(layout->data);
+   unsigned size = anv_descriptor_data_size(layout->data);
+
+   /* For multi-planar bindings, we make every descriptor consume the maximum
+    * number of planes so we don't have to bother with walking arrays and
+    * adding things up every time.  Fortunately, YCbCr samplers aren't all
+    * that common and likely won't be in the middle of big arrays.
+    */
+   if (layout->max_plane_count > 1)
+      size *= layout->max_plane_count;
+
+   return size;
 }
 
 /** Returns the size in bytes of each descriptor of the given type
@@ -132,7 +154,11 @@ unsigned
 anv_descriptor_type_size(const struct anv_physical_device *pdevice,
                          VkDescriptorType type)
 {
-   assert(type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT);
+   assert(type != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT &&
+          type != VK_DESCRIPTOR_TYPE_SAMPLER &&
+          type != VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE &&
+          type != VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER);
+
    return anv_descriptor_data_size(anv_descriptor_data_for_type(pdevice, type));
 }
 
@@ -146,6 +172,12 @@ anv_descriptor_data_supports_bindless(const struct anv_physical_device *pdevice,
       return true;
    }
 
+   if (data & ANV_DESCRIPTOR_SAMPLED_IMAGE) {
+      assert(pdevice->has_bindless_images || pdevice->has_bindless_samplers);
+      return sampler ? pdevice->has_bindless_samplers :
+                       pdevice->has_bindless_images;
+   }
+
    return false;
 }
 
@@ -586,6 +618,13 @@ VkResult anv_CreateDescriptorPool(
       unsigned desc_data_size = anv_descriptor_data_size(desc_data) *
                                 pCreateInfo->pPoolSizes[i].descriptorCount;
 
+      /* Combined image sampler descriptors can take up to 3 slots if they
+       * hold a YCbCr image.
+       */
+      if (pCreateInfo->pPoolSizes[i].type ==
+          VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
+         desc_data_size *= 3;
+
       if (pCreateInfo->pPoolSizes[i].type ==
           VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
          /* Inline uniform blocks are specified to use the descriptor array
@@ -999,6 +1038,18 @@ anv_descriptor_set_write_image_param(uint32_t *param_desc_map,
 #undef WRITE_PARAM_FIELD
 }
 
+static uint32_t
+anv_surface_state_to_handle(struct anv_state state)
+{
+   /* Bits 31:12 of the bindless surface offset in the extended message
+    * descriptor are bits 25:6 of the byte-based address.
+    */
+   assert(state.offset >= 0);
+   uint32_t offset = state.offset;
+   assert((offset & 0x3f) == 0 && offset < (1 << 26));
+   return offset << 6;
+}
+
 void
 anv_descriptor_set_write_image_view(struct anv_device *device,
                                     struct anv_descriptor_set *set,
@@ -1057,6 +1108,33 @@ anv_descriptor_set_write_image_view(struct anv_device *device,
    void *desc_map = set->desc_mem.map + bind_layout->descriptor_offset +
                     element * anv_descriptor_size(bind_layout);
 
+   if (bind_layout->data & ANV_DESCRIPTOR_SAMPLED_IMAGE) {
+      struct anv_sampled_image_descriptor desc_data[3];
+      memset(desc_data, 0, sizeof(desc_data));
+
+      if (image_view) {
+         for (unsigned p = 0; p < image_view->n_planes; p++) {
+            struct anv_surface_state sstate =
+               (desc->layout == VK_IMAGE_LAYOUT_GENERAL) ?
+               image_view->planes[p].general_sampler_surface_state :
+               image_view->planes[p].optimal_sampler_surface_state;
+            desc_data[p].image = anv_surface_state_to_handle(sstate.state);
+         }
+      }
+
+      if (sampler) {
+         for (unsigned p = 0; p < sampler->n_planes; p++)
+            desc_data[p].sampler = sampler->bindless_state.offset + p * 32;
+      }
+
+      /* We may have max_plane_count < 1 if this isn't a sampled image but it
+       * can be no more than the size of our array of handles.
+       */
+      assert(bind_layout->max_plane_count <= ARRAY_SIZE(desc_data));
+      memcpy(desc_map, desc_data,
+             MAX2(1, bind_layout->max_plane_count) * sizeof(desc_data[0]));
+   }
+
    if (bind_layout->data & ANV_DESCRIPTOR_IMAGE_PARAM) {
       /* Storage images can only ever have one plane */
       assert(image_view->n_planes == 1);
@@ -1090,6 +1168,13 @@ anv_descriptor_set_write_buffer_view(struct anv_device *device,
    void *desc_map = set->desc_mem.map + bind_layout->descriptor_offset +
                     element * anv_descriptor_size(bind_layout);
 
+   if (bind_layout->data & ANV_DESCRIPTOR_SAMPLED_IMAGE) {
+      struct anv_sampled_image_descriptor desc_data = {
+         .image = anv_surface_state_to_handle(buffer_view->surface_state),
+      };
+      memcpy(desc_map, &desc_data, sizeof(desc_data));
+   }
+
    if (bind_layout->data & ANV_DESCRIPTOR_IMAGE_PARAM) {
       anv_descriptor_set_write_image_param(desc_map,
                                            &buffer_view->storage_image_param);
index d56e359335377a699fb9cad90f6c18e91d4055bd..44fea839f5215aa6f84cfa698f0a9be542766e15 100644 (file)
@@ -280,6 +280,10 @@ anv_physical_device_init_uuids(struct anv_physical_device *device)
                      sizeof(device->always_use_bindless));
    _mesa_sha1_update(&sha1_ctx, &device->has_a64_buffer_access,
                      sizeof(device->has_a64_buffer_access));
+   _mesa_sha1_update(&sha1_ctx, &device->has_bindless_images,
+                     sizeof(device->has_bindless_images));
+   _mesa_sha1_update(&sha1_ctx, &device->has_bindless_samplers,
+                     sizeof(device->has_bindless_samplers));
    _mesa_sha1_final(&sha1_ctx, sha1);
    memcpy(device->pipeline_cache_uuid, sha1, VK_UUID_SIZE);
 
@@ -464,6 +468,19 @@ anv_physical_device_init(struct anv_physical_device *device,
    device->has_a64_buffer_access = device->info.gen >= 8 &&
                                    device->use_softpin;
 
+   /* We first get bindless image access on Skylake and we can only really do
+    * it if we don't have any relocations so we need softpin.
+    */
+   device->has_bindless_images = device->info.gen >= 9 &&
+                                 device->use_softpin;
+
+   /* We've had bindless samplers since Ivy Bridge (forever in Vulkan terms)
+    * because it's just a matter of setting the sampler address in the sample
+    * message header.  However, we've not bothered to wire it up for vec4 so
+    * we leave it disabled on gen7.
+    */
+   device->has_bindless_samplers = device->info.gen >= 8;
+
    /* Starting with Gen10, the timestamp frequency of the command streamer may
     * vary from one part to another. We can query the value from the kernel.
     */
@@ -1114,8 +1131,11 @@ void anv_GetPhysicalDeviceProperties(
                                       (1ul << 30) : (1ul << 27);
 
    const uint32_t max_ssbos = pdevice->has_a64_buffer_access ? UINT16_MAX : 64;
-   const uint32_t max_samplers = (devinfo->gen >= 8 || devinfo->is_haswell) ?
-                                 128 : 16;
+   const uint32_t max_textures =
+      pdevice->has_bindless_images ? UINT16_MAX : 128;
+   const uint32_t max_samplers =
+      pdevice->has_bindless_samplers ? UINT16_MAX :
+      (devinfo->gen >= 8 || devinfo->is_haswell) ? 128 : 16;
 
    /* The moment we have anything bindless, claim a high per-stage limit */
    const uint32_t max_per_stage =
@@ -1144,7 +1164,7 @@ void anv_GetPhysicalDeviceProperties(
       .maxPerStageDescriptorSamplers            = max_samplers,
       .maxPerStageDescriptorUniformBuffers      = 64,
       .maxPerStageDescriptorStorageBuffers      = max_ssbos,
-      .maxPerStageDescriptorSampledImages       = max_samplers,
+      .maxPerStageDescriptorSampledImages       = max_textures,
       .maxPerStageDescriptorStorageImages       = MAX_IMAGES,
       .maxPerStageDescriptorInputAttachments    = 64,
       .maxPerStageResources                     = max_per_stage,
@@ -1153,7 +1173,7 @@ void anv_GetPhysicalDeviceProperties(
       .maxDescriptorSetUniformBuffersDynamic    = MAX_DYNAMIC_BUFFERS / 2,
       .maxDescriptorSetStorageBuffers           = 6 * max_ssbos,    /* number of stages * maxPerStageDescriptorStorageBuffers */
       .maxDescriptorSetStorageBuffersDynamic    = MAX_DYNAMIC_BUFFERS / 2,
-      .maxDescriptorSetSampledImages            = 6 * max_samplers, /* number of stages * maxPerStageDescriptorSampledImages */
+      .maxDescriptorSetSampledImages            = 6 * max_textures, /* number of stages * maxPerStageDescriptorSampledImages */
       .maxDescriptorSetStorageImages            = 6 * MAX_IMAGES,   /* number of stages * maxPerStageDescriptorStorageImages */
       .maxDescriptorSetInputAttachments         = 256,
       .maxVertexInputAttributes                 = MAX_VBS,
@@ -3408,6 +3428,11 @@ void anv_DestroySampler(
    if (!sampler)
       return;
 
+   if (sampler->bindless_state.map) {
+      anv_state_pool_free(&device->dynamic_state_pool,
+                          sampler->bindless_state);
+   }
+
    vk_free2(&device->alloc, pAllocator, sampler);
 }
 
index ab0103cfcd4f5e4608d9fe4d5e8733bfad6fd1e0..800ed2ef3e24b2d7739f2fa5e36e093b84c21bff 100644 (file)
@@ -760,39 +760,64 @@ lower_tex_deref(nir_tex_instr *tex, nir_tex_src_type deref_src_type,
    unsigned array_size =
       state->layout->set[set].layout->binding[binding].array_size;
 
-   nir_tex_src_type offset_src_type;
+   unsigned binding_offset;
    if (deref_src_type == nir_tex_src_texture_deref) {
-      offset_src_type = nir_tex_src_texture_offset;
-      *base_index = state->set[set].surface_offsets[binding] + plane;
+      binding_offset = state->set[set].surface_offsets[binding];
    } else {
       assert(deref_src_type == nir_tex_src_sampler_deref);
-      offset_src_type = nir_tex_src_sampler_offset;
-      *base_index = state->set[set].sampler_offsets[binding] + plane;
+      binding_offset = state->set[set].sampler_offsets[binding];
    }
 
+   nir_builder *b = &state->builder;
+
+   nir_tex_src_type offset_src_type;
    nir_ssa_def *index = NULL;
-   if (deref->deref_type != nir_deref_type_var) {
-      assert(deref->deref_type == nir_deref_type_array);
+   if (binding_offset > MAX_BINDING_TABLE_SIZE) {
+      const unsigned plane_offset =
+         plane * sizeof(struct anv_sampled_image_descriptor);
 
-      if (nir_src_is_const(deref->arr.index)) {
-         unsigned arr_index = nir_src_as_uint(deref->arr.index);
-         *base_index += MIN2(arr_index, array_size - 1);
+      nir_ssa_def *desc =
+         build_descriptor_load(deref, plane_offset, 2, 32, state);
+
+      if (deref_src_type == nir_tex_src_texture_deref) {
+         offset_src_type = nir_tex_src_texture_handle;
+         index = nir_channel(b, desc, 0);
       } else {
-         nir_builder *b = &state->builder;
-
-         /* From VK_KHR_sampler_ycbcr_conversion:
-          *
-          * If sampler Y’CBCR conversion is enabled, the combined image
-          * sampler must be indexed only by constant integral expressions when
-          * aggregated into arrays in shader code, irrespective of the
-          * shaderSampledImageArrayDynamicIndexing feature.
-          */
-         assert(nir_tex_instr_src_index(tex, nir_tex_src_plane) == -1);
+         assert(deref_src_type == nir_tex_src_sampler_deref);
+         offset_src_type = nir_tex_src_sampler_handle;
+         index = nir_channel(b, desc, 1);
+      }
+   } else {
+      if (deref_src_type == nir_tex_src_texture_deref) {
+         offset_src_type = nir_tex_src_texture_offset;
+      } else {
+         assert(deref_src_type == nir_tex_src_sampler_deref);
+         offset_src_type = nir_tex_src_sampler_offset;
+      }
 
-         index = nir_ssa_for_src(b, deref->arr.index, 1);
+      *base_index = binding_offset + plane;
 
-         if (state->add_bounds_checks)
-            index = nir_umin(b, index, nir_imm_int(b, array_size - 1));
+      if (deref->deref_type != nir_deref_type_var) {
+         assert(deref->deref_type == nir_deref_type_array);
+
+         if (nir_src_is_const(deref->arr.index)) {
+            unsigned arr_index = nir_src_as_uint(deref->arr.index);
+            *base_index += MIN2(arr_index, array_size - 1);
+         } else {
+            /* From VK_KHR_sampler_ycbcr_conversion:
+             *
+             * If sampler Y’CBCR conversion is enabled, the combined image
+             * sampler must be indexed only by constant integral expressions
+             * when aggregated into arrays in shader code, irrespective of
+             * the shaderSampledImageArrayDynamicIndexing feature.
+             */
+            assert(nir_tex_instr_src_index(tex, nir_tex_src_plane) == -1);
+
+            index = nir_ssa_for_src(b, deref->arr.index, 1);
+
+            if (state->add_bounds_checks)
+               index = nir_umin(b, index, nir_imm_int(b, array_size - 1));
+         }
       }
    }
 
@@ -1062,6 +1087,10 @@ anv_nir_apply_pipeline_layout(const struct anv_physical_device *pdevice,
              anv_descriptor_requires_bindless(pdevice, binding, true)) {
             /* If this descriptor doesn't fit in the binding table or if it
              * requires bindless for some reason, flag it as bindless.
+             *
+             * We also make large sampler arrays bindless because we can avoid
+             * using indirect sends thanks to bindless samplers being packed
+             * less tightly than the sampler table.
              */
             assert(anv_descriptor_supports_bindless(pdevice, binding, true));
             state.set[set].sampler_offsets[b] = BINDLESS_OFFSET;
index bb24ff1ae824dabcd0adf2bc2ee6d7ef16dc1bc2..9f525d1e21af59d0f2416b3bc1fa969520a24b3c 100644 (file)
@@ -953,6 +953,10 @@ struct anv_physical_device {
 
     /** True if we can access buffers using A64 messages */
     bool                                        has_a64_buffer_access;
+    /** True if we can use bindless access for images */
+    bool                                        has_bindless_images;
+    /** True if we can use bindless access for samplers */
+    bool                                        has_bindless_samplers;
 
     struct anv_device_extension_table           supported_extensions;
 
@@ -1521,6 +1525,27 @@ struct anv_vue_header {
    float PointWidth;
 };
 
+/** Struct representing a sampled image descriptor
+ *
+ * This descriptor layout is used for sampled images, bare samplers, and
+ * combined image/sampler descriptors.
+ */
+struct anv_sampled_image_descriptor {
+   /** Bindless image handle
+    *
+    * This is expected to already be shifted such that the 20-bit
+    * SURFACE_STATE table index is in the top 20 bits.
+    */
+   uint32_t image;
+
+   /** Bindless sampler handle
+    *
+    * This is assumed to be a 32B-aligned SAMPLER_STATE pointer relative
+    * to the dynamic state base address.
+    */
+   uint32_t sampler;
+};
+
 /** Struct representing a address/range descriptor
  *
  * The fields of this struct correspond directly to the data layout of
@@ -1547,6 +1572,8 @@ enum anv_descriptor_data {
    ANV_DESCRIPTOR_INLINE_UNIFORM = (1 << 4),
    /** anv_address_range_descriptor with a buffer address and range */
    ANV_DESCRIPTOR_ADDRESS_RANGE  = (1 << 5),
+   /** anv_sampled_image_descriptor with bindless image and sampler handles */
+   ANV_DESCRIPTOR_SAMPLED_IMAGE  = (1 << 6),
 };
 
 struct anv_descriptor_set_binding_layout {
@@ -3454,6 +3481,11 @@ struct anv_sampler {
    uint32_t                     state[3][4];
    uint32_t                     n_planes;
    struct anv_ycbcr_conversion *conversion;
+
+   /* Blob of sampler state data which is guaranteed to be 32-byte aligned
+    * and with a 32-byte stride for use as bindless samplers.
+    */
+   struct anv_state             bindless_state;
 };
 
 struct anv_framebuffer {
index 3189585cbd3d15cc3d39b8747d29d01f1af40320..1af36bced24b1cb359bb9465681ac5485b2cefd3 100644 (file)
@@ -110,10 +110,18 @@ genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
       sba.InstructionBuffersizeModifyEnable     = true;
 #  endif
 #  if (GEN_GEN >= 9)
-      sba.BindlessSurfaceStateBaseAddress = (struct anv_address) { NULL, 0 };
+      if (cmd_buffer->device->instance->physicalDevice.use_softpin) {
+         sba.BindlessSurfaceStateBaseAddress = (struct anv_address) {
+            .bo = device->surface_state_pool.block_pool.bo,
+            .offset = 0,
+         };
+         sba.BindlessSurfaceStateSize = (1 << 20) - 1;
+      } else {
+         sba.BindlessSurfaceStateBaseAddress = ANV_NULL_ADDRESS;
+         sba.BindlessSurfaceStateSize = 0;
+      }
       sba.BindlessSurfaceStateMOCS = GENX(MOCS);
       sba.BindlessSurfaceStateBaseAddressModifyEnable = true;
-      sba.BindlessSurfaceStateSize = 0;
 #  endif
 #  if (GEN_GEN >= 10)
       sba.BindlessSamplerStateBaseAddress = (struct anv_address) { NULL, 0 };
index 283cd8c501a7e23a12355fa8bbfbbe4281ecd57b..9276dc9470b7254405eabe5be6adc1d098af6c00 100644 (file)
@@ -328,6 +328,8 @@ VkResult genX(CreateSampler)(
     VkSampler*                                  pSampler)
 {
    ANV_FROM_HANDLE(anv_device, device, _device);
+   const struct anv_physical_device *pdevice =
+      &device->instance->physicalDevice;
    struct anv_sampler *sampler;
 
    assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO);
@@ -383,6 +385,17 @@ VkResult genX(CreateSampler)(
       }
    }
 
+   if (pdevice->has_bindless_samplers) {
+      /* If we have bindless, allocate enough samplers.  We allocate 32 bytes
+       * for each sampler instead of 16 bytes because we want all bindless
+       * samplers to be 32-byte aligned so we don't have to use indirect
+       * sampler messages on them.
+       */
+      sampler->bindless_state =
+         anv_state_pool_alloc(&device->dynamic_state_pool,
+                              sampler->n_planes * 32, 32);
+   }
+
    for (unsigned p = 0; p < sampler->n_planes; p++) {
       const bool plane_has_chroma =
          sampler->conversion && sampler->conversion->format->planes[p].has_chroma;
@@ -452,6 +465,11 @@ VkResult genX(CreateSampler)(
       };
 
       GENX(SAMPLER_STATE_pack)(NULL, sampler->state[p], &sampler_state);
+
+      if (sampler->bindless_state.map) {
+         memcpy(sampler->bindless_state.map + p * 32,
+                sampler->state[p], GENX(SAMPLER_STATE_length) * 4);
+      }
    }
 
    *pSampler = anv_sampler_to_handle(sampler);