tu: Switch to the bindless descriptor model
author     Connor Abbott <cwabbott0@gmail.com>
Mon, 16 Mar 2020 10:49:19 +0000 (11:49 +0100)
committer  Marge Bot <eric+marge@anholt.net>
Thu, 9 Apr 2020 15:56:55 +0000 (15:56 +0000)
Under the bindless model, there are 5 "base" registers, each programmed
with a 64-bit address, and sam/ldib/ldc and so on each specify a base
register and an offset, in units of 16 dwords. The base registers
correspond to descriptor sets in Vulkan. We allocate a buffer at
descriptor set creation time, hopefully outside the main rendering
loop, and then switching descriptor sets is just a matter of
programming the base registers differently. Note, however, that some
kinds of descriptors still have to be patched at command recording
time: dynamic UBOs and SSBOs, which are patched at
CmdBindDescriptorSets time, and input attachments, which are patched
at draw time based on the pipeline that's bound. We reserve the fifth
base register (which seems to be unused by the blob driver) for these,
creating a descriptor set on the fly and combining all the dynamic
descriptors from all the different descriptor sets. This way, we never
have to copy the rest of the descriptor set at draw time like the blob
seems to do. I mostly chose to do this because the infrastructure was
already there in the form of dynamic_descriptors, and other drivers
(at least radv) don't cheat either when implementing this.
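
To make the addressing model concrete, here is a minimal sketch (not
driver code; the helper and the base-register array are hypothetical,
while the 16-dword slot size matches A6XX_TEX_CONST_DWORDS used in the
diff below) of how a bindless access resolves to a descriptor address:

#include <stdint.h>

#define A6XX_TEX_CONST_DWORDS 16  /* one descriptor slot is 16 dwords */

/* base_va[] models the five BINDLESS_BASE registers; 'set' and 'idx'
 * are what an instruction like sam or ldc encodes.
 */
static uint64_t
bindless_descriptor_va(const uint64_t base_va[5], unsigned set, unsigned idx)
{
   return base_va[set] + (uint64_t)idx * A6XX_TEX_CONST_DWORDS * 4;
}

Binding a different descriptor set then amounts to rewriting
base_va[set]; the descriptors themselves never have to be copied,
except for the patched ones described above.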

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4358>

src/freedreno/vulkan/tu_cmd_buffer.c
src/freedreno/vulkan/tu_descriptor_set.c
src/freedreno/vulkan/tu_descriptor_set.h
src/freedreno/vulkan/tu_device.c
src/freedreno/vulkan/tu_pipeline.c
src/freedreno/vulkan/tu_private.h
src/freedreno/vulkan/tu_shader.c

diff --git a/src/freedreno/vulkan/tu_cmd_buffer.c b/src/freedreno/vulkan/tu_cmd_buffer.c
index 3de6fb42afb807a4721a3c4cee0ff85ce6a523b6..1c9c43f35e920620c604cef3accb84cc39886734 100644
@@ -1875,16 +1875,62 @@ tu_CmdBindDescriptorSets(VkCommandBuffer commandBuffer,
       descriptors_state->sets[idx] = set;
       descriptors_state->valid |= (1u << idx);
 
+      /* Note: the actual input attachment indices come from the shader
+       * itself, so we can't generate the patched versions of these until
+       * draw time when both the pipeline and descriptors are bound and
+       * we're inside the render pass.
+       */
+      unsigned dst_idx = layout->set[idx].input_attachment_start;
+      memcpy(&descriptors_state->input_attachments[dst_idx * A6XX_TEX_CONST_DWORDS],
+             set->dynamic_descriptors,
+             set->layout->input_attachment_count * A6XX_TEX_CONST_DWORDS * 4);
+
       for(unsigned j = 0; j < set->layout->dynamic_offset_count; ++j, ++dyn_idx) {
-         unsigned idx = j + layout->set[i + firstSet].dynamic_offset_start;
+         /* Dynamic buffers come after input attachments in the descriptor set
+          * itself, but due to how the Vulkan descriptor set binding works, we
+          * have to put input attachments and dynamic buffers in separate
+          * buffers in the descriptor_state and then combine them at draw
+          * time. Binding a descriptor set only invalidates the descriptor
+          * sets after it, but if we try to tightly pack the descriptors after
+          * the input attachments then we could corrupt dynamic buffers in the
+          * descriptor set before it, or we'd have to move all the dynamic
+          * buffers over. We just put them into separate buffers to make
+          * binding as well as the later patching of input attachments easy.
+          */
+         unsigned src_idx = j + set->layout->input_attachment_count;
+         unsigned dst_idx = j + layout->set[idx].dynamic_offset_start;
          assert(dyn_idx < dynamicOffsetCount);
 
-         descriptors_state->dynamic_buffers[idx] =
-         set->dynamic_descriptors[j].va + pDynamicOffsets[dyn_idx];
+         uint32_t *dst =
+            &descriptors_state->dynamic_descriptors[dst_idx * A6XX_TEX_CONST_DWORDS];
+         uint32_t *src =
+            &set->dynamic_descriptors[src_idx * A6XX_TEX_CONST_DWORDS];
+         uint32_t offset = pDynamicOffsets[dyn_idx];
+
+         /* Patch the storage/uniform descriptors right away. */
+         if (layout->set[idx].layout->dynamic_ubo & (1 << j)) {
+            /* Note: we can assume here that the addition won't roll over and
+             * change the SIZE field.
+             */
+            uint64_t va = src[0] | ((uint64_t)src[1] << 32);
+            va += offset;
+            dst[0] = va;
+            dst[1] = va >> 32;
+         } else {
+            memcpy(dst, src, A6XX_TEX_CONST_DWORDS * 4);
+            /* Note: A6XX_IBO_5_DEPTH is always 0 */
+            uint64_t va = dst[4] | ((uint64_t)dst[5] << 32);
+            va += offset;
+            dst[4] = va;
+            dst[5] = va >> 32;
+         }
       }
    }
 
-   cmd_buffer->state.dirty |= TU_CMD_DIRTY_DESCRIPTOR_SETS;
+   if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_COMPUTE)
+      cmd_buffer->state.dirty |= TU_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS;
+   else
+      cmd_buffer->state.dirty |= TU_CMD_DIRTY_DESCRIPTOR_SETS;
 }
 
 void tu_CmdBindTransformFeedbackBuffersEXT(VkCommandBuffer commandBuffer,
@@ -2316,6 +2362,9 @@ tu_CmdBeginRenderPass(VkCommandBuffer commandBuffer,
       tu_bo_list_add(&cmd->bo_list, iview->image->bo,
                      MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
    }
+
+   /* Flag input attachment descriptors for re-emission if necessary */
+   cmd->state.dirty |= TU_CMD_DIRTY_INPUT_ATTACHMENTS;
 }
 
 void
@@ -2395,6 +2444,9 @@ tu_CmdNextSubpass(VkCommandBuffer commandBuffer, VkSubpassContents contents)
    tu6_emit_mrt(cmd, cmd->state.subpass, cs);
    tu6_emit_msaa(cs, cmd->state.subpass->samples);
    tu6_emit_render_cntl(cmd, cmd->state.subpass, cs, false);
+
+   /* Flag input attachment descriptors for re-emission if necessary */
+   cmd->state.dirty |= TU_CMD_DIRTY_INPUT_ATTACHMENTS;
 }
 
 void
@@ -2459,6 +2511,7 @@ struct tu_draw_info
 
 #define ENABLE_ALL (CP_SET_DRAW_STATE__0_BINNING | CP_SET_DRAW_STATE__0_GMEM | CP_SET_DRAW_STATE__0_SYSMEM)
 #define ENABLE_DRAW (CP_SET_DRAW_STATE__0_GMEM | CP_SET_DRAW_STATE__0_SYSMEM)
+#define ENABLE_NON_GMEM (CP_SET_DRAW_STATE__0_BINNING | CP_SET_DRAW_STATE__0_SYSMEM)
 
 enum tu_draw_state_group_id
 {
@@ -2472,10 +2525,8 @@ enum tu_draw_state_group_id
    TU_DRAW_STATE_BLEND,
    TU_DRAW_STATE_VS_CONST,
    TU_DRAW_STATE_FS_CONST,
-   TU_DRAW_STATE_VS_TEX,
-   TU_DRAW_STATE_FS_TEX_SYSMEM,
-   TU_DRAW_STATE_FS_TEX_GMEM,
-   TU_DRAW_STATE_FS_IBO,
+   TU_DRAW_STATE_DESC_SETS,
+   TU_DRAW_STATE_DESC_SETS_GMEM,
    TU_DRAW_STATE_VS_PARAMS,
 
    TU_DRAW_STATE_COUNT,
@@ -2488,149 +2539,6 @@ struct tu_draw_state_group
    struct tu_cs_entry ib;
 };
 
-const static void *
-sampler_ptr(struct tu_descriptor_state *descriptors_state,
-            const struct tu_descriptor_map *map, unsigned i,
-            unsigned array_index)
-{
-   assert(descriptors_state->valid & (1 << map->set[i]));
-
-   struct tu_descriptor_set *set = descriptors_state->sets[map->set[i]];
-   assert(map->binding[i] < set->layout->binding_count);
-
-   const struct tu_descriptor_set_binding_layout *layout =
-      &set->layout->binding[map->binding[i]];
-
-   if (layout->immutable_samplers_offset) {
-      const uint32_t *immutable_samplers =
-         tu_immutable_samplers(set->layout, layout);
-
-      return &immutable_samplers[array_index * A6XX_TEX_SAMP_DWORDS];
-   }
-
-   switch (layout->type) {
-   case VK_DESCRIPTOR_TYPE_SAMPLER:
-      return &set->mapped_ptr[layout->offset / 4];
-   case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
-      return &set->mapped_ptr[layout->offset / 4 + A6XX_TEX_CONST_DWORDS +
-                              array_index * (A6XX_TEX_CONST_DWORDS + A6XX_TEX_SAMP_DWORDS)];
-   default:
-      unreachable("unimplemented descriptor type");
-      break;
-   }
-}
-
-static void
-write_tex_const(struct tu_cmd_buffer *cmd,
-                uint32_t *dst,
-                struct tu_descriptor_state *descriptors_state,
-                const struct tu_descriptor_map *map,
-                unsigned i, unsigned array_index, bool is_sysmem)
-{
-   assert(descriptors_state->valid & (1 << map->set[i]));
-
-   struct tu_descriptor_set *set = descriptors_state->sets[map->set[i]];
-   assert(map->binding[i] < set->layout->binding_count);
-
-   const struct tu_descriptor_set_binding_layout *layout =
-      &set->layout->binding[map->binding[i]];
-
-   switch (layout->type) {
-   case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
-   case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
-   case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
-   case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
-      memcpy(dst, &set->mapped_ptr[layout->offset / 4 +
-                                   array_index * A6XX_TEX_CONST_DWORDS],
-             A6XX_TEX_CONST_DWORDS * 4);
-      break;
-   case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
-      memcpy(dst, &set->mapped_ptr[layout->offset / 4 +
-                                   array_index *
-                                   (A6XX_TEX_CONST_DWORDS +
-                                    A6XX_TEX_SAMP_DWORDS)],
-             A6XX_TEX_CONST_DWORDS * 4);
-      break;
-   default:
-      unreachable("unimplemented descriptor type");
-      break;
-   }
-
-   if (layout->type == VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT && !is_sysmem) {
-      const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
-      uint32_t a = cmd->state.subpass->input_attachments[map->value[i] +
-                                                         array_index].attachment;
-      const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
-
-      assert(att->gmem_offset >= 0);
-
-      dst[0] &= ~(A6XX_TEX_CONST_0_SWAP__MASK | A6XX_TEX_CONST_0_TILE_MODE__MASK);
-      dst[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2);
-      dst[2] &= ~(A6XX_TEX_CONST_2_TYPE__MASK | A6XX_TEX_CONST_2_PITCH__MASK);
-      dst[2] |=
-         A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) |
-         A6XX_TEX_CONST_2_PITCH(tiling->tile0.extent.width * att->cpp);
-      dst[3] = 0;
-      dst[4] = cmd->device->physical_device->gmem_base + att->gmem_offset;
-      dst[5] = A6XX_TEX_CONST_5_DEPTH(1);
-      for (unsigned i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
-         dst[i] = 0;
-
-      if (cmd->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY)
-         tu_finishme("patch input attachment pitch for secondary cmd buffer");
-   }
-}
-
-static void
-write_image_ibo(struct tu_cmd_buffer *cmd,
-                uint32_t *dst,
-                struct tu_descriptor_state *descriptors_state,
-                const struct tu_descriptor_map *map,
-                unsigned i, unsigned array_index)
-{
-   assert(descriptors_state->valid & (1 << map->set[i]));
-
-   struct tu_descriptor_set *set = descriptors_state->sets[map->set[i]];
-   assert(map->binding[i] < set->layout->binding_count);
-
-   const struct tu_descriptor_set_binding_layout *layout =
-      &set->layout->binding[map->binding[i]];
-
-   assert(layout->type == VK_DESCRIPTOR_TYPE_STORAGE_IMAGE);
-
-   memcpy(dst, &set->mapped_ptr[layout->offset / 4 +
-                                (array_index * 2 + 1) * A6XX_TEX_CONST_DWORDS],
-          A6XX_TEX_CONST_DWORDS * 4);
-}
-
-static uint64_t
-buffer_ptr(struct tu_descriptor_state *descriptors_state,
-           const struct tu_descriptor_map *map,
-           unsigned i, unsigned array_index)
-{
-   assert(descriptors_state->valid & (1 << map->set[i]));
-
-   struct tu_descriptor_set *set = descriptors_state->sets[map->set[i]];
-   assert(map->binding[i] < set->layout->binding_count);
-
-   const struct tu_descriptor_set_binding_layout *layout =
-      &set->layout->binding[map->binding[i]];
-
-   switch (layout->type) {
-   case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
-   case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
-      return descriptors_state->dynamic_buffers[layout->dynamic_offset_offset +
-                                                array_index];
-   case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
-   case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
-      return (uint64_t) set->mapped_ptr[layout->offset / 4 + array_index * 2 + 1] << 32 |
-                        set->mapped_ptr[layout->offset / 4 + array_index * 2];
-   default:
-      unreachable("unimplemented descriptor type");
-      break;
-   }
-}
-
 static inline uint32_t
 tu6_stage2opcode(gl_shader_stage type)
 {
@@ -2708,21 +2616,24 @@ tu6_emit_user_consts(struct tu_cs *cs, const struct tu_pipeline *pipeline,
       debug_assert((size % 16) == 0);
       debug_assert((offset % 16) == 0);
 
-      /* Look through the UBO map to find our UBO index, and get the VA for
-       * that UBO.
+      /* Dig out the descriptor from the descriptor state and read the VA from
+       * it.
        */
-      uint64_t va = 0;
-      uint32_t ubo_idx = state->range[i].block - 1;
-      uint32_t ubo_map_base = 0;
-      for (int j = 0; j < link->ubo_map.num; j++) {
-         if (ubo_idx >= ubo_map_base &&
-             ubo_idx < ubo_map_base + link->ubo_map.array_size[j]) {
-            va = buffer_ptr(descriptors_state, &link->ubo_map, j,
-                            ubo_idx - ubo_map_base);
-            break;
-         }
-         ubo_map_base += link->ubo_map.array_size[j];
-      }
+      assert(state->range[i].bindless);
+      uint32_t *base = state->range[i].bindless_base == MAX_SETS ?
+         descriptors_state->dynamic_descriptors :
+         descriptors_state->sets[state->range[i].bindless_base]->mapped_ptr;
+      unsigned block = state->range[i].block;
+      /* If the block in the shader lives in the dynamic descriptor set, its
+       * index refers to the combined set (input attachments followed by
+       * dynamic descriptors), which is assembled on the fly and isn't
+       * available here. Instead we work backwards to get the index into
+       * dynamic_descriptors.
+       */
+      if (state->range[i].bindless_base == MAX_SETS)
+         block -= pipeline->layout->input_attachment_count;
+      uint32_t *desc = base + block * A6XX_TEX_CONST_DWORDS;
+      uint64_t va = desc[0] | ((uint64_t)(desc[1] & A6XX_UBO_1_BASE_HI__MASK) << 32);
       assert(va);
 
       tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3);
@@ -2735,43 +2646,6 @@ tu6_emit_user_consts(struct tu_cs *cs, const struct tu_pipeline *pipeline,
    }
 }
 
-static void
-tu6_emit_ubos(struct tu_cs *cs, const struct tu_pipeline *pipeline,
-              struct tu_descriptor_state *descriptors_state,
-              gl_shader_stage type)
-{
-   const struct tu_program_descriptor_linkage *link =
-      &pipeline->program.link[type];
-
-   uint32_t num = MIN2(link->ubo_map.num_desc, link->const_state.num_ubos);
-   uint32_t anum = align(num, 2);
-
-   if (!num)
-      return;
-
-   tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + (2 * anum));
-   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(link->const_state.offsets.ubo) |
-         CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
-         CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
-         CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
-         CP_LOAD_STATE6_0_NUM_UNIT(anum/2));
-   tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
-   tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
-
-   unsigned emitted = 0;
-   for (unsigned i = 0; emitted < num && i < link->ubo_map.num; i++) {
-      for (unsigned j = 0; emitted < num && j < link->ubo_map.array_size[i]; j++) {
-         tu_cs_emit_qw(cs, buffer_ptr(descriptors_state, &link->ubo_map, i, j));
-         emitted++;
-      }
-   }
-
-   for (; emitted < anum; emitted++) {
-      tu_cs_emit(cs, 0xffffffff);
-      tu_cs_emit(cs, 0xffffffff);
-   }
-}
-
 static struct tu_cs_entry
 tu6_emit_consts(struct tu_cmd_buffer *cmd,
                 const struct tu_pipeline *pipeline,
@@ -2782,7 +2656,6 @@ tu6_emit_consts(struct tu_cmd_buffer *cmd,
    tu_cs_begin_sub_stream(&cmd->sub_cs, 512, &cs); /* TODO: maximum size? */
 
    tu6_emit_user_consts(&cs, pipeline, descriptors_state, type, cmd->push_constants);
-   tu6_emit_ubos(&cs, pipeline, descriptors_state, type);
 
    return tu_cs_end_sub_stream(&cmd->sub_cs, &cs);
 }
@@ -2828,225 +2701,138 @@ tu6_emit_vs_params(struct tu_cmd_buffer *cmd,
 }
 
 static VkResult
-tu6_emit_textures(struct tu_cmd_buffer *cmd,
-                  const struct tu_pipeline *pipeline,
-                  struct tu_descriptor_state *descriptors_state,
-                  gl_shader_stage type,
-                  struct tu_cs_entry *entry,
-                  bool is_sysmem)
+tu6_emit_descriptor_sets(struct tu_cmd_buffer *cmd,
+                         const struct tu_pipeline *pipeline,
+                         VkPipelineBindPoint bind_point,
+                         struct tu_cs_entry *entry,
+                         bool gmem)
 {
    struct tu_cs *draw_state = &cmd->sub_cs;
-   const struct tu_program_descriptor_linkage *link =
-      &pipeline->program.link[type];
+   struct tu_pipeline_layout *layout = pipeline->layout;
+   struct tu_descriptor_state *descriptors_state =
+      tu_get_descriptors_state(cmd, bind_point);
+   const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
+   const uint32_t *input_attachment_idx =
+      pipeline->program.input_attachment_idx;
+   uint32_t num_dynamic_descs = layout->dynamic_offset_count +
+      layout->input_attachment_count;
+   struct ts_cs_memory dynamic_desc_set;
    VkResult result;
 
-   if (link->texture_map.num_desc == 0 && link->sampler_map.num_desc == 0) {
-      *entry = (struct tu_cs_entry) {};
-      return VK_SUCCESS;
-   }
-
-   /* allocate and fill texture state */
-   struct ts_cs_memory tex_const;
-   result = tu_cs_alloc(draw_state, link->texture_map.num_desc,
-                        A6XX_TEX_CONST_DWORDS, &tex_const);
-   if (result != VK_SUCCESS)
-      return result;
-
-   int tex_index = 0;
-   for (unsigned i = 0; i < link->texture_map.num; i++) {
-      for (int j = 0; j < link->texture_map.array_size[i]; j++) {
-         write_tex_const(cmd,
-                         &tex_const.map[A6XX_TEX_CONST_DWORDS * tex_index++],
-                         descriptors_state, &link->texture_map, i, j,
-                         is_sysmem);
-      }
-   }
-
-   /* allocate and fill sampler state */
-   struct ts_cs_memory tex_samp = { 0 };
-   if (link->sampler_map.num_desc) {
-      result = tu_cs_alloc(draw_state, link->sampler_map.num_desc,
-                           A6XX_TEX_SAMP_DWORDS, &tex_samp);
+   if (num_dynamic_descs > 0) {
+      /* allocate and fill out dynamic descriptor set */
+      result = tu_cs_alloc(draw_state, num_dynamic_descs,
+                           A6XX_TEX_CONST_DWORDS, &dynamic_desc_set);
       if (result != VK_SUCCESS)
          return result;
 
-      int sampler_index = 0;
-      for (unsigned i = 0; i < link->sampler_map.num; i++) {
-         for (int j = 0; j < link->sampler_map.array_size[i]; j++) {
-            const uint32_t *sampler = sampler_ptr(descriptors_state,
-                                                  &link->sampler_map,
-                                                  i, j);
-            memcpy(&tex_samp.map[A6XX_TEX_SAMP_DWORDS * sampler_index++],
-                   sampler, A6XX_TEX_SAMP_DWORDS * 4);
+      memcpy(dynamic_desc_set.map, descriptors_state->input_attachments,
+             layout->input_attachment_count * A6XX_TEX_CONST_DWORDS * 4);
+
+      if (gmem) {
+         /* Patch input attachments to refer to GMEM instead */
+         for (unsigned i = 0; i < layout->input_attachment_count; i++) {
+            uint32_t *dst =
+               &dynamic_desc_set.map[A6XX_TEX_CONST_DWORDS * i];
+
+            /* The compiler has already laid out input_attachment_idx in the
+             * final order of input attachments, so there's no need to go
+             * through the pipeline layout finding input attachments.
+             */
+            unsigned attachment_idx = input_attachment_idx[i];
+
+            /* It's possible for the pipeline layout to include an input
+             * attachment which doesn't actually exist for the current
+             * subpass. Of course, this is only valid so long as the pipeline
+             * doesn't try to actually load that attachment. Just skip
+             * patching in that scenario to avoid out-of-bounds accesses.
+             */
+            if (attachment_idx >= cmd->state.subpass->input_count)
+               continue;
+
+            uint32_t a = cmd->state.subpass->input_attachments[attachment_idx].attachment;
+            const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
+
+            assert(att->gmem_offset >= 0);
+
+            dst[0] &= ~(A6XX_TEX_CONST_0_SWAP__MASK | A6XX_TEX_CONST_0_TILE_MODE__MASK);
+            dst[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2);
+            dst[2] &= ~(A6XX_TEX_CONST_2_TYPE__MASK | A6XX_TEX_CONST_2_PITCH__MASK);
+            dst[2] |=
+               A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) |
+               A6XX_TEX_CONST_2_PITCH(tiling->tile0.extent.width * att->cpp);
+            dst[3] = 0;
+            dst[4] = cmd->device->physical_device->gmem_base + att->gmem_offset;
+            dst[5] = A6XX_TEX_CONST_5_DEPTH(1);
+            for (unsigned i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
+               dst[i] = 0;
+
+            if (cmd->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY)
+               tu_finishme("patch input attachment pitch for secondary cmd buffer");
          }
       }
-   }
 
-   unsigned tex_samp_reg, tex_const_reg, tex_count_reg;
-   enum a6xx_state_block sb;
+      memcpy(dynamic_desc_set.map + layout->input_attachment_count * A6XX_TEX_CONST_DWORDS,
+             descriptors_state->dynamic_descriptors,
+             layout->dynamic_offset_count * A6XX_TEX_CONST_DWORDS * 4);
+   }
 
-   switch (type) {
-   case MESA_SHADER_VERTEX:
-      sb = SB6_VS_TEX;
-      tex_samp_reg = REG_A6XX_SP_VS_TEX_SAMP_LO;
-      tex_const_reg = REG_A6XX_SP_VS_TEX_CONST_LO;
-      tex_count_reg = REG_A6XX_SP_VS_TEX_COUNT;
-      break;
-   case MESA_SHADER_FRAGMENT:
-      sb = SB6_FS_TEX;
-      tex_samp_reg = REG_A6XX_SP_FS_TEX_SAMP_LO;
-      tex_const_reg = REG_A6XX_SP_FS_TEX_CONST_LO;
-      tex_count_reg = REG_A6XX_SP_FS_TEX_COUNT;
+   uint32_t sp_bindless_base_reg, hlsq_bindless_base_reg;
+   uint32_t hlsq_update_value;
+   switch (bind_point) {
+   case VK_PIPELINE_BIND_POINT_GRAPHICS:
+      sp_bindless_base_reg = REG_A6XX_SP_BINDLESS_BASE(0);
+      hlsq_bindless_base_reg = REG_A6XX_HLSQ_BINDLESS_BASE(0);
+      hlsq_update_value = 0x7c000;
       break;
-   case MESA_SHADER_COMPUTE:
-      sb = SB6_CS_TEX;
-      tex_samp_reg = REG_A6XX_SP_CS_TEX_SAMP_LO;
-      tex_const_reg = REG_A6XX_SP_CS_TEX_CONST_LO;
-      tex_count_reg = REG_A6XX_SP_CS_TEX_COUNT;
+   case VK_PIPELINE_BIND_POINT_COMPUTE:
+      sp_bindless_base_reg = REG_A6XX_SP_CS_BINDLESS_BASE(0);
+      hlsq_bindless_base_reg = REG_A6XX_HLSQ_CS_BINDLESS_BASE(0);
+      hlsq_update_value = 0x3e00;
       break;
    default:
-      unreachable("bad state block");
-   }
-
-   struct tu_cs cs;
-   result = tu_cs_begin_sub_stream(draw_state, 16, &cs);
-   if (result != VK_SUCCESS)
-      return result;
-
-   if (link->sampler_map.num_desc) {
-      /* output sampler state: */
-      tu_cs_emit_pkt7(&cs, tu6_stage2opcode(type), 3);
-      tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(0) |
-                 CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
-                 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
-                 CP_LOAD_STATE6_0_STATE_BLOCK(sb) |
-                 CP_LOAD_STATE6_0_NUM_UNIT(link->sampler_map.num_desc));
-      tu_cs_emit_qw(&cs, tex_samp.iova); /* SRC_ADDR_LO/HI */
-
-      tu_cs_emit_pkt4(&cs, tex_samp_reg, 2);
-      tu_cs_emit_qw(&cs, tex_samp.iova); /* SRC_ADDR_LO/HI */
-   }
-
-   /* emit texture state: */
-   tu_cs_emit_pkt7(&cs, tu6_stage2opcode(type), 3);
-   tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(0) |
-      CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
-      CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
-      CP_LOAD_STATE6_0_STATE_BLOCK(sb) |
-      CP_LOAD_STATE6_0_NUM_UNIT(link->texture_map.num_desc));
-   tu_cs_emit_qw(&cs, tex_const.iova); /* SRC_ADDR_LO/HI */
-
-   tu_cs_emit_pkt4(&cs, tex_const_reg, 2);
-   tu_cs_emit_qw(&cs, tex_const.iova); /* SRC_ADDR_LO/HI */
-
-   tu_cs_emit_pkt4(&cs, tex_count_reg, 1);
-   tu_cs_emit(&cs, link->texture_map.num_desc);
-
-   *entry = tu_cs_end_sub_stream(draw_state, &cs);
-   return VK_SUCCESS;
-}
-
-static VkResult
-tu6_emit_ibo(struct tu_cmd_buffer *cmd,
-             const struct tu_pipeline *pipeline,
-             struct tu_descriptor_state *descriptors_state,
-             gl_shader_stage type,
-             struct tu_cs_entry *entry)
-{
-   struct tu_cs *draw_state = &cmd->sub_cs;
-   const struct tu_program_descriptor_linkage *link =
-      &pipeline->program.link[type];
-   VkResult result;
-
-   unsigned num_desc = link->ssbo_map.num_desc + link->image_map.num_desc;
-
-   if (num_desc == 0) {
-      *entry = (struct tu_cs_entry) {};
-      return VK_SUCCESS;
-   }
-
-   struct ts_cs_memory ibo_const;
-   result = tu_cs_alloc(draw_state, num_desc,
-                        A6XX_TEX_CONST_DWORDS, &ibo_const);
-   if (result != VK_SUCCESS)
-      return result;
-
-   int ssbo_index = 0;
-   for (unsigned i = 0; i < link->ssbo_map.num; i++) {
-      for (int j = 0; j < link->ssbo_map.array_size[i]; j++) {
-         uint32_t *dst = &ibo_const.map[A6XX_TEX_CONST_DWORDS * ssbo_index];
-
-         uint64_t va = buffer_ptr(descriptors_state, &link->ssbo_map, i, j);
-         /* We don't expose robustBufferAccess, so leave the size unlimited. */
-         uint32_t sz = MAX_STORAGE_BUFFER_RANGE / 4;
-
-         dst[0] = A6XX_IBO_0_FMT(FMT6_32_UINT);
-         dst[1] = A6XX_IBO_1_WIDTH(sz & MASK(15)) |
-                  A6XX_IBO_1_HEIGHT(sz >> 15);
-         dst[2] = A6XX_IBO_2_UNK4 |
-                  A6XX_IBO_2_UNK31 |
-                  A6XX_IBO_2_TYPE(A6XX_TEX_1D);
-         dst[3] = 0;
-         dst[4] = va;
-         dst[5] = va >> 32;
-         for (int i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
-            dst[i] = 0;
-
-         ssbo_index++;
-      }
+      unreachable("bad bind point");
    }
 
-   for (unsigned i = 0; i < link->image_map.num; i++) {
-      for (int j = 0; j < link->image_map.array_size[i]; j++) {
-         uint32_t *dst = &ibo_const.map[A6XX_TEX_CONST_DWORDS * ssbo_index];
-
-         write_image_ibo(cmd, dst,
-                         descriptors_state, &link->image_map, i, j);
-
-         ssbo_index++;
-      }
+   /* Be careful here to *not* refer to the pipeline, so that if only the
+    * pipeline changes we don't have to emit this again (except if there are
+    * dynamic descriptors in the pipeline layout). This means always emitting
+    * all the valid descriptors, which means that we always have to put the
+    * dynamic descriptor set in the driver-only slot at the end.
+    */
+   uint32_t num_user_sets = util_last_bit(descriptors_state->valid);
+   uint32_t num_sets = num_user_sets;
+   if (num_dynamic_descs > 0) {
+      num_user_sets = MAX_SETS;
+      num_sets = num_user_sets + 1;
    }
 
-   assert(ssbo_index == num_desc);
+   unsigned regs[2] = { sp_bindless_base_reg, hlsq_bindless_base_reg };
 
    struct tu_cs cs;
-   result = tu_cs_begin_sub_stream(draw_state, 7, &cs);
+   result = tu_cs_begin_sub_stream(draw_state, ARRAY_SIZE(regs) * (1 + num_sets * 2) + 2, &cs);
    if (result != VK_SUCCESS)
       return result;
 
-   uint32_t opcode, ibo_addr_reg;
-   enum a6xx_state_block sb;
-   enum a6xx_state_type st;
+   if (num_sets > 0) {
+      for (unsigned i = 0; i < ARRAY_SIZE(regs); i++) {
+         tu_cs_emit_pkt4(&cs, regs[i], num_sets * 2);
+         for (unsigned j = 0; j < num_user_sets; j++) {
+            if (descriptors_state->valid & (1 << j)) {
+               /* magic | 3 copied from the blob */
+               tu_cs_emit_qw(&cs, descriptors_state->sets[j]->va | 3);
+            } else {
+               tu_cs_emit_qw(&cs, 0 | 3);
+            }
+         }
+         if (num_dynamic_descs > 0) {
+            tu_cs_emit_qw(&cs, dynamic_desc_set.iova | 3);
+         }
+      }
 
-   switch (type) {
-   case MESA_SHADER_FRAGMENT:
-      opcode = CP_LOAD_STATE6;
-      st = ST6_SHADER;
-      sb = SB6_IBO;
-      ibo_addr_reg = REG_A6XX_SP_IBO_LO;
-      break;
-   case MESA_SHADER_COMPUTE:
-      opcode = CP_LOAD_STATE6_FRAG;
-      st = ST6_IBO;
-      sb = SB6_CS_SHADER;
-      ibo_addr_reg = REG_A6XX_SP_CS_IBO_LO;
-      break;
-   default:
-      unreachable("unsupported stage for ibos");
+      tu_cs_emit_regs(&cs, A6XX_HLSQ_UPDATE_CNTL(hlsq_update_value));
    }
 
-   /* emit texture state: */
-   tu_cs_emit_pkt7(&cs, opcode, 3);
-   tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(0) |
-              CP_LOAD_STATE6_0_STATE_TYPE(st) |
-              CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
-              CP_LOAD_STATE6_0_STATE_BLOCK(sb) |
-              CP_LOAD_STATE6_0_NUM_UNIT(num_desc));
-   tu_cs_emit_qw(&cs, ibo_const.iova); /* SRC_ADDR_LO/HI */
-
-   tu_cs_emit_pkt4(&cs, ibo_addr_reg, 2);
-   tu_cs_emit_qw(&cs, ibo_const.iova); /* SRC_ADDR_LO/HI */
-
    *entry = tu_cs_end_sub_stream(draw_state, &cs);
    return VK_SUCCESS;
 }
@@ -3255,59 +3041,54 @@ tu6_bind_draw_states(struct tu_cmd_buffer *cmd,
    if (cmd->state.dirty & TU_CMD_DIRTY_STREAMOUT_BUFFERS)
       tu6_emit_streamout(cmd, cs);
 
-   if (cmd->state.dirty &
-         (TU_CMD_DIRTY_PIPELINE | TU_CMD_DIRTY_DESCRIPTOR_SETS)) {
-      struct tu_cs_entry vs_tex, fs_tex_sysmem, fs_tex_gmem, fs_ibo;
-
-      result = tu6_emit_textures(cmd, pipeline, descriptors_state,
-                                 MESA_SHADER_VERTEX, &vs_tex, false);
-      if (result != VK_SUCCESS)
-         return result;
-
-      /* TODO: we could emit just one texture descriptor draw state when there
-       * are no input attachments, which is the most common case. We could
-       * also split out the sampler state, which doesn't change even for input
-       * attachments.
-       */
-      result = tu6_emit_textures(cmd, pipeline, descriptors_state,
-                                 MESA_SHADER_FRAGMENT, &fs_tex_sysmem, true);
-      if (result != VK_SUCCESS)
-         return result;
-
-      result = tu6_emit_textures(cmd, pipeline, descriptors_state,
-                                 MESA_SHADER_FRAGMENT, &fs_tex_gmem, false);
-      if (result != VK_SUCCESS)
-         return result;
-
-      result = tu6_emit_ibo(cmd, pipeline, descriptors_state,
-                            MESA_SHADER_FRAGMENT, &fs_ibo);
+   /* If there are any dynamic descriptors, then we may need to re-emit
+    * them after every pipeline change in case the number of input attachments
+    * changes. We also always need to re-emit after a pipeline change if there
+    * are any input attachments, because the input attachment index comes from
+    * the pipeline. Finally, it can also happen that the subpass changes
+    * without the pipeline changing, in which case the GMEM descriptors need
+    * to be patched differently.
+    *
+    * TODO: We could probably be clever and avoid re-emitting state on
+    * pipeline changes if the number of input attachments is always 0. We
+    * could also only re-emit dynamic state.
+    */
+   if (cmd->state.dirty & TU_CMD_DIRTY_DESCRIPTOR_SETS ||
+       ((pipeline->layout->dynamic_offset_count +
+         pipeline->layout->input_attachment_count > 0) &&
+        cmd->state.dirty & TU_CMD_DIRTY_PIPELINE) ||
+       (pipeline->layout->input_attachment_count > 0 &&
+        cmd->state.dirty & TU_CMD_DIRTY_INPUT_ATTACHMENTS)) {
+      struct tu_cs_entry desc_sets, desc_sets_gmem;
+      bool need_gmem_desc_set = pipeline->layout->input_attachment_count > 0;
+
+      result = tu6_emit_descriptor_sets(cmd, pipeline,
+                                        VK_PIPELINE_BIND_POINT_GRAPHICS,
+                                        &desc_sets, false);
       if (result != VK_SUCCESS)
          return result;
 
       draw_state_groups[draw_state_group_count++] =
          (struct tu_draw_state_group) {
-            .id = TU_DRAW_STATE_VS_TEX,
-            .enable_mask = ENABLE_ALL,
-            .ib = vs_tex,
-         };
-      draw_state_groups[draw_state_group_count++] =
-         (struct tu_draw_state_group) {
-            .id = TU_DRAW_STATE_FS_TEX_GMEM,
-            .enable_mask = CP_SET_DRAW_STATE__0_GMEM,
-            .ib = fs_tex_gmem,
-         };
-      draw_state_groups[draw_state_group_count++] =
-         (struct tu_draw_state_group) {
-            .id = TU_DRAW_STATE_FS_TEX_SYSMEM,
-            .enable_mask = CP_SET_DRAW_STATE__0_SYSMEM,
-            .ib = fs_tex_sysmem,
-         };
-      draw_state_groups[draw_state_group_count++] =
-         (struct tu_draw_state_group) {
-            .id = TU_DRAW_STATE_FS_IBO,
-            .enable_mask = ENABLE_DRAW,
-            .ib = fs_ibo,
+            .id = TU_DRAW_STATE_DESC_SETS,
+            .enable_mask = need_gmem_desc_set ? ENABLE_NON_GMEM : ENABLE_ALL,
+            .ib = desc_sets,
          };
+
+      if (need_gmem_desc_set) {
+         result = tu6_emit_descriptor_sets(cmd, pipeline,
+                                           VK_PIPELINE_BIND_POINT_GRAPHICS,
+                                           &desc_sets_gmem, true);
+         if (result != VK_SUCCESS)
+            return result;
+
+         draw_state_groups[draw_state_group_count++] =
+            (struct tu_draw_state_group) {
+               .id = TU_DRAW_STATE_DESC_SETS_GMEM,
+               .enable_mask = CP_SET_DRAW_STATE__0_GMEM,
+               .ib = desc_sets_gmem,
+            };
+      }
    }
 
    struct tu_cs_entry vs_params;
@@ -3356,11 +3137,16 @@ tu6_bind_draw_states(struct tu_cmd_buffer *cmd,
       unsigned i;
       for_each_bit(i, descriptors_state->valid) {
          struct tu_descriptor_set *set = descriptors_state->sets[i];
-         for (unsigned j = 0; j < set->layout->buffer_count; ++j)
-            if (set->descriptors[j]) {
-               tu_bo_list_add(&cmd->bo_list, set->descriptors[j],
+         for (unsigned j = 0; j < set->layout->buffer_count; ++j) {
+            if (set->buffers[j]) {
+               tu_bo_list_add(&cmd->bo_list, set->buffers[j],
                               MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
             }
+         }
+         if (set->size > 0) {
+            tu_bo_list_add(&cmd->bo_list, &set->pool->bo,
+                           MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
+         }
       }
    }
    if (cmd->state.dirty & TU_CMD_DIRTY_STREAMOUT_BUFFERS) {
@@ -3373,10 +3159,16 @@ tu6_bind_draw_states(struct tu_cmd_buffer *cmd,
       }
    }
 
+   /* There are too many graphics dirty bits to list here, so just list the
+    * bits to preserve instead. The only things not emitted here are
+    * compute-related state.
+    */
+   cmd->state.dirty &= TU_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS;
+
    /* Fragment shader state overwrites compute shader state, so flag the
     * compute pipeline for re-emit.
     */
-   cmd->state.dirty = TU_CMD_DIRTY_COMPUTE_PIPELINE;
+   cmd->state.dirty |= TU_CMD_DIRTY_COMPUTE_PIPELINE;
    return VK_SUCCESS;
 }
 
@@ -3698,42 +3490,43 @@ tu_dispatch(struct tu_cmd_buffer *cmd,
 
    tu_emit_compute_driver_params(cs, pipeline, info);
 
-   result = tu6_emit_textures(cmd, pipeline, descriptors_state,
-                              MESA_SHADER_COMPUTE, &ib, false);
-   if (result != VK_SUCCESS) {
-      cmd->record_result = result;
-      return;
-   }
-
-   if (ib.size)
-      tu_cs_emit_ib(cs, &ib);
-
-   result = tu6_emit_ibo(cmd, pipeline, descriptors_state, MESA_SHADER_COMPUTE, &ib);
-   if (result != VK_SUCCESS) {
-      cmd->record_result = result;
-      return;
-   }
-
-   if (ib.size)
-      tu_cs_emit_ib(cs, &ib);
+   if (cmd->state.dirty & TU_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS) {
+      result = tu6_emit_descriptor_sets(cmd, pipeline,
+                                        VK_PIPELINE_BIND_POINT_COMPUTE, &ib,
+                                        false);
+      if (result != VK_SUCCESS) {
+         cmd->record_result = result;
+         return;
+      }
 
-   /* track BOs */
-   if (cmd->state.dirty & TU_CMD_DIRTY_DESCRIPTOR_SETS) {
+      /* track BOs */
       unsigned i;
       for_each_bit(i, descriptors_state->valid) {
          struct tu_descriptor_set *set = descriptors_state->sets[i];
-         for (unsigned j = 0; j < set->layout->buffer_count; ++j)
-            if (set->descriptors[j]) {
-               tu_bo_list_add(&cmd->bo_list, set->descriptors[j],
+         for (unsigned j = 0; j < set->layout->buffer_count; ++j) {
+            if (set->buffers[j]) {
+               tu_bo_list_add(&cmd->bo_list, set->buffers[j],
                               MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
             }
+         }
+
+         if (set->size > 0) {
+            tu_bo_list_add(&cmd->bo_list, &set->pool->bo,
+                           MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
+         }
       }
    }
 
+   if (ib.size)
+      tu_cs_emit_ib(cs, &ib);
+
+   cmd->state.dirty &=
+      ~(TU_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS | TU_CMD_DIRTY_COMPUTE_PIPELINE);
+
    /* Compute shader state overwrites fragment shader state, so we flag the
     * graphics pipeline for re-emit.
     */
-   cmd->state.dirty = TU_CMD_DIRTY_PIPELINE;
+   cmd->state.dirty |= TU_CMD_DIRTY_PIPELINE;
 
    tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
    tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_COMPUTE));
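
Taken together, the draw-time path above builds the driver-reserved
set with two copies: input attachments first, then the pre-patched
dynamic buffer descriptors. A condensed sketch of that step (only
A6XX_TEX_CONST_DWORDS is taken from the driver; the helper and its
parameters are hypothetical):

#include <stdint.h>
#include <string.h>

#define A6XX_TEX_CONST_DWORDS 16  /* one descriptor slot is 16 dwords */

/* Combine the command buffer's input attachment descriptors and its
 * already-patched dynamic buffer descriptors into one contiguous set,
 * in the same order the shader indexes the combined set.
 */
static void
build_dynamic_set(uint32_t *out,
                  const uint32_t *input_attachments, unsigned ia_count,
                  const uint32_t *dynamic_descriptors, unsigned dyn_count)
{
   memcpy(out, input_attachments,
          ia_count * A6XX_TEX_CONST_DWORDS * 4);
   memcpy(out + ia_count * A6XX_TEX_CONST_DWORDS,
          dynamic_descriptors,
          dyn_count * A6XX_TEX_CONST_DWORDS * 4);
}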
diff --git a/src/freedreno/vulkan/tu_descriptor_set.c b/src/freedreno/vulkan/tu_descriptor_set.c
index 08562a3e5bb3a06ef3e556ead2531535f942614a..de1683c6bd7eeecdb8c85c473d5f9416a1186333 100644
 /**
  * @file
  *
- * The texture and sampler descriptors are laid out in a single global space
- * across all shader stages, for both simplicity of implementation and because
- * that seems to be how things have to be structured for border color
- * handling.
- *
- * Each shader stage will declare its texture/sampler count based on the last
- * descriptor set it uses.  At draw emit time (though it really should be
- * CmdBind time), we upload the descriptor sets used by each shader stage to
- * their stage.
+ * We use the bindless descriptor model, which maps fairly closely to how
+ * Vulkan descriptor sets work. The two exceptions are input attachments and
+ * dynamic descriptors, which have to be patched when recording command
+ * buffers. We reserve an extra descriptor set for these. This descriptor set
+ * contains all the input attachments in the pipeline, in order, and then all
+ * the dynamic descriptors. The dynamic descriptors are stored in the CPU-side
+ * data structure for each tu_descriptor_set, and then combined into one big
+ * descriptor set at CmdBindDescriptorSets time / draw time.
  */
 
 #include "tu_private.h"
@@ -77,32 +76,27 @@ create_sorted_bindings(const VkDescriptorSetLayoutBinding *bindings,
 }
 
 static uint32_t
-descriptor_size(enum VkDescriptorType type)
+descriptor_size(VkDescriptorType type)
 {
    switch (type) {
    case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
    case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
-      return 0;
-   case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
-   case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
-      /* 64bit pointer */
-      return 8;
-   case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
    case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
-   case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
-   case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
-      return A6XX_TEX_CONST_DWORDS * 4;
-   case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
-      /* We may need the IBO or the TEX representation, or both. */
-      return A6XX_TEX_CONST_DWORDS * 4 * 2;
+      /* These are remapped to the special driver-managed descriptor set,
+       * hence they don't take up any space in the original descriptor set:
+       */
+      return 0;
    case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
-      /* texture const + texture sampler */
-      return (A6XX_TEX_CONST_DWORDS + A6XX_TEX_SAMP_DWORDS) * 4;
-   case VK_DESCRIPTOR_TYPE_SAMPLER:
-      return A6XX_TEX_SAMP_DWORDS * 4;
+      /* We make offsets and sizes all 16 dwords, to match how the hardware
+       * interprets indices passed to sample/load/store instructions in
+       * multiples of 16 dwords.  This means that "normal" descriptors are all
+       * of size 16, with padding for smaller descriptors like uniform
+       * buffer descriptors which are less than 16 dwords. However, combined
+       * image/samplers are actually two descriptors, so they take two
+       * 16-dword slots.
+       */
+      return A6XX_TEX_CONST_DWORDS * 4 * 2;
    default:
-      unreachable("unknown descriptor type\n");
-      return 0;
+      return A6XX_TEX_CONST_DWORDS * 4;
    }
 }
 
@@ -145,7 +139,7 @@ tu_CreateDescriptorSetLayout(
 
    set_layout->flags = pCreateInfo->flags;
 
-   /* We just allocate all the samplers at the end of the struct */
+   /* We just allocate all the immutable samplers at the end of the struct */
    struct tu_sampler *samplers = (void*) &set_layout->binding[max_binding + 1];
 
    VkDescriptorSetLayoutBinding *bindings = create_sorted_bindings(
@@ -157,41 +151,27 @@ tu_CreateDescriptorSetLayout(
 
    set_layout->binding_count = max_binding + 1;
    set_layout->shader_stages = 0;
-   set_layout->dynamic_shader_stages = 0;
    set_layout->has_immutable_samplers = false;
    set_layout->size = 0;
+   set_layout->dynamic_ubo = 0;
 
    memset(set_layout->binding, 0,
           size - sizeof(struct tu_descriptor_set_layout));
 
-   uint32_t buffer_count = 0;
    uint32_t dynamic_offset_count = 0;
+   uint32_t input_attachment_count = 0;
+   uint32_t buffer_count = 0;
 
    for (uint32_t j = 0; j < pCreateInfo->bindingCount; j++) {
       const VkDescriptorSetLayoutBinding *binding = bindings + j;
       uint32_t b = binding->binding;
-      uint32_t alignment = 4;
-      unsigned binding_buffer_count = 1;
-
-      switch (binding->descriptorType) {
-      case VK_DESCRIPTOR_TYPE_SAMPLER:
-         binding_buffer_count = 0;
-         break;
-      case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
-      case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
-         assert(!(pCreateInfo->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR));
-         set_layout->binding[b].dynamic_offset_count = 1;
-         break;
-      default:
-         break;
-      }
 
-      set_layout->size = align(set_layout->size, alignment);
       set_layout->binding[b].type = binding->descriptorType;
       set_layout->binding[b].array_size = binding->descriptorCount;
       set_layout->binding[b].offset = set_layout->size;
       set_layout->binding[b].buffer_offset = buffer_count;
       set_layout->binding[b].dynamic_offset_offset = dynamic_offset_count;
+      set_layout->binding[b].input_attachment_offset = input_attachment_count;
       set_layout->binding[b].size = descriptor_size(binding->descriptorType);
 
       if (variable_flags && binding->binding < variable_flags->bindingCount &&
@@ -219,16 +199,29 @@ tu_CreateDescriptorSetLayout(
 
       set_layout->size +=
          binding->descriptorCount * set_layout->binding[b].size;
-      buffer_count += binding->descriptorCount * binding_buffer_count;
-      dynamic_offset_count += binding->descriptorCount *
-                              set_layout->binding[b].dynamic_offset_count;
+      if (binding->descriptorType != VK_DESCRIPTOR_TYPE_SAMPLER &&
+          binding->descriptorType != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT)
+         buffer_count += binding->descriptorCount;
+      if (binding->descriptorType == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC ||
+          binding->descriptorType == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC) {
+         if (binding->descriptorType == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC) {
+            STATIC_ASSERT(MAX_DYNAMIC_BUFFERS <= 8 * sizeof(set_layout->dynamic_ubo));
+            set_layout->dynamic_ubo |=
+               ((1u << binding->descriptorCount) - 1) << dynamic_offset_count;
+         }
+
+         dynamic_offset_count += binding->descriptorCount;
+      }
+      if (binding->descriptorType == VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT)
+         input_attachment_count += binding->descriptorCount;
       set_layout->shader_stages |= binding->stageFlags;
    }
 
    free(bindings);
 
-   set_layout->buffer_count = buffer_count;
    set_layout->dynamic_offset_count = dynamic_offset_count;
+   set_layout->input_attachment_count = input_attachment_count;
+   set_layout->buffer_count = buffer_count;
 
    *pSetLayout = tu_descriptor_set_layout_to_handle(set_layout);
 
@@ -333,19 +326,22 @@ tu_CreatePipelineLayout(VkDevice _device,
       return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
    layout->num_sets = pCreateInfo->setLayoutCount;
+   layout->input_attachment_count = 0;
+   layout->dynamic_offset_count = 0;
 
-   unsigned dynamic_offset_count = 0;
+   unsigned dynamic_offset_count = 0, input_attachment_count = 0;
 
    _mesa_sha1_init(&ctx);
    for (uint32_t set = 0; set < pCreateInfo->setLayoutCount; set++) {
       TU_FROM_HANDLE(tu_descriptor_set_layout, set_layout,
                      pCreateInfo->pSetLayouts[set]);
       layout->set[set].layout = set_layout;
-
       layout->set[set].dynamic_offset_start = dynamic_offset_count;
+      layout->set[set].input_attachment_start = input_attachment_count;
+      dynamic_offset_count += set_layout->dynamic_offset_count;
+      input_attachment_count += set_layout->input_attachment_count;
+
       for (uint32_t b = 0; b < set_layout->binding_count; b++) {
-         dynamic_offset_count += set_layout->binding[b].array_size *
-                                 set_layout->binding[b].dynamic_offset_count;
          if (set_layout->binding[b].immutable_samplers_offset)
             _mesa_sha1_update(
                &ctx,
@@ -358,6 +354,7 @@ tu_CreatePipelineLayout(VkDevice _device,
    }
 
    layout->dynamic_offset_count = dynamic_offset_count;
+   layout->input_attachment_count = input_attachment_count;
    layout->push_constant_size = 0;
 
    for (unsigned i = 0; i < pCreateInfo->pushConstantRangeCount; ++i) {
@@ -407,10 +404,11 @@ tu_descriptor_set_create(struct tu_device *device,
       buffer_count = layout->binding[layout->binding_count - 1].buffer_offset +
                      *variable_count * stride;
    }
-   unsigned range_offset = sizeof(struct tu_descriptor_set) +
+   unsigned dynamic_offset = sizeof(struct tu_descriptor_set) +
       sizeof(struct tu_bo *) * buffer_count;
-   unsigned mem_size = range_offset +
-      sizeof(struct tu_descriptor_range) * layout->dynamic_offset_count;
+   unsigned mem_size = dynamic_offset +
+      A6XX_TEX_CONST_DWORDS * 4 * (layout->dynamic_offset_count +
+                                   layout->input_attachment_count);
 
    if (pool->host_memory_base) {
       if (pool->host_memory_end - pool->host_memory_ptr < mem_size)
@@ -428,18 +426,16 @@ tu_descriptor_set_create(struct tu_device *device,
 
    memset(set, 0, mem_size);
 
-   if (layout->dynamic_offset_count) {
-      set->dynamic_descriptors = (struct tu_descriptor_range*)((uint8_t*)set + range_offset);
+   if (layout->dynamic_offset_count + layout->input_attachment_count > 0) {
+      set->dynamic_descriptors = (uint32_t *)((uint8_t*)set + dynamic_offset);
    }
 
    set->layout = layout;
+   set->pool = pool;
    uint32_t layout_size = layout->size;
    if (variable_count) {
       assert(layout->has_variable_descriptors);
       uint32_t stride = layout->binding[layout->binding_count - 1].size;
-      if (layout->binding[layout->binding_count - 1].type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT)
-         stride = 1;
-
       layout_size = layout->binding[layout->binding_count - 1].offset +
                     *variable_count * stride;
    }
@@ -527,7 +523,7 @@ tu_CreateDescriptorPool(VkDevice _device,
    TU_FROM_HANDLE(tu_device, device, _device);
    struct tu_descriptor_pool *pool;
    uint64_t size = sizeof(struct tu_descriptor_pool);
-   uint64_t bo_size = 0, bo_count = 0, range_count = 0;
+   uint64_t bo_size = 0, bo_count = 0, dynamic_count = 0;
 
    for (unsigned i = 0; i < pCreateInfo->poolSizeCount; ++i) {
       if (pCreateInfo->pPoolSizes[i].type != VK_DESCRIPTOR_TYPE_SAMPLER)
@@ -536,7 +532,8 @@ tu_CreateDescriptorPool(VkDevice _device,
       switch(pCreateInfo->pPoolSizes[i].type) {
       case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
       case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
-         range_count += pCreateInfo->pPoolSizes[i].descriptorCount;
+      case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
+         dynamic_count += pCreateInfo->pPoolSizes[i].descriptorCount;
       default:
          break;
       }
@@ -548,7 +545,7 @@ tu_CreateDescriptorPool(VkDevice _device,
    if (!(pCreateInfo->flags & VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT)) {
       uint64_t host_size = pCreateInfo->maxSets * sizeof(struct tu_descriptor_set);
       host_size += sizeof(struct tu_bo*) * bo_count;
-      host_size += sizeof(struct tu_descriptor_range) * range_count;
+      host_size += A6XX_TEX_CONST_DWORDS * 4 * dynamic_count;
       size += host_size;
    } else {
       size += sizeof(struct tu_descriptor_pool_entry) * pCreateInfo->maxSets;
@@ -708,6 +705,16 @@ static void write_texel_buffer_descriptor(struct tu_device *device,
       *buffer_list = view->buffer->bo;
 }
 
+static uint32_t get_range(struct tu_buffer *buf, VkDeviceSize offset,
+                          VkDeviceSize range)
+{
+   if (range == VK_WHOLE_SIZE) {
+      return buf->size - offset;
+   } else {
+      return range;
+   }
+}
+
 static void write_buffer_descriptor(struct tu_device *device,
                                     struct tu_cmd_buffer *cmd_buffer,
                                     unsigned *dst,
@@ -717,8 +724,18 @@ static void write_buffer_descriptor(struct tu_device *device,
    TU_FROM_HANDLE(tu_buffer, buffer, buffer_info->buffer);
 
    uint64_t va = tu_buffer_iova(buffer) + buffer_info->offset;
-   dst[0] = va;
-   dst[1] = va >> 32;
+   uint32_t range = get_range(buffer, buffer_info->offset, buffer_info->range);
+   range = ALIGN_POT(range, 4) / 4;
+   dst[0] =
+      A6XX_IBO_0_TILE_MODE(TILE6_LINEAR) | A6XX_IBO_0_FMT(FMT6_32_UINT);
+   dst[1] = range;
+   dst[2] =
+      A6XX_IBO_2_UNK4 | A6XX_IBO_2_TYPE(A6XX_TEX_1D) | A6XX_IBO_2_UNK31;
+   dst[3] = 0;
+   dst[4] = A6XX_IBO_4_BASE_LO(va);
+   dst[5] = A6XX_IBO_5_BASE_HI(va >> 32);
+   for (int i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
+      dst[i] = 0;
 
    if (cmd_buffer)
       tu_bo_list_add(&cmd_buffer->bo_list, buffer->bo, MSM_SUBMIT_BO_READ);
@@ -726,22 +743,25 @@ static void write_buffer_descriptor(struct tu_device *device,
       *buffer_list = buffer->bo;
 }
 
-static void write_dynamic_buffer_descriptor(struct tu_device *device,
-                                            struct tu_descriptor_range *range,
-                                            struct tu_bo **buffer_list,
-                                            const VkDescriptorBufferInfo *buffer_info)
+static void write_ubo_descriptor(struct tu_device *device,
+                                 struct tu_cmd_buffer *cmd_buffer,
+                                 unsigned *dst,
+                                 struct tu_bo **buffer_list,
+                                 const VkDescriptorBufferInfo *buffer_info)
 {
    TU_FROM_HANDLE(tu_buffer, buffer, buffer_info->buffer);
-   uint64_t va = tu_buffer_iova(buffer) + buffer_info->offset;
-   unsigned size = buffer_info->range;
 
-   if (buffer_info->range == VK_WHOLE_SIZE)
-      size = buffer->size - buffer_info->offset;
-
-   range->va = va;
-   range->size = size;
+   uint32_t range = get_range(buffer, buffer_info->offset, buffer_info->range);
+   /* The HW range is in vec4 units */
+   range = ALIGN_POT(range, 16) / 16;
+   uint64_t va = tu_buffer_iova(buffer) + buffer_info->offset;
+   dst[0] = A6XX_UBO_0_BASE_LO(va);
+   dst[1] = A6XX_UBO_1_BASE_HI(va >> 32) | A6XX_UBO_1_SIZE(range);
 
-   *buffer_list = buffer->bo;
+   if (cmd_buffer)
+      tu_bo_list_add(&cmd_buffer->bo_list, buffer->bo, MSM_SUBMIT_BO_READ);
+   else
+      *buffer_list = buffer->bo;
 }
 
 static void
@@ -754,10 +774,10 @@ write_image_descriptor(struct tu_device *device,
 {
    TU_FROM_HANDLE(tu_image_view, iview, image_info->imageView);
 
-   memcpy(dst, iview->descriptor, sizeof(iview->descriptor));
    if (descriptor_type == VK_DESCRIPTOR_TYPE_STORAGE_IMAGE) {
-      memcpy(&dst[A6XX_TEX_CONST_DWORDS], iview->storage_descriptor,
-             sizeof(iview->storage_descriptor));
+      memcpy(dst, iview->storage_descriptor, sizeof(iview->storage_descriptor));
+   } else {
+      memcpy(dst, iview->descriptor, sizeof(iview->descriptor));
    }
 
    if (cmd_buffer)
@@ -782,7 +802,7 @@ write_combined_image_sampler_descriptor(struct tu_device *device,
                           descriptor_type, image_info);
    /* copy over sampler state */
    if (has_sampler) {
-      memcpy(dst + sampler_offset / sizeof(*dst), sampler, sizeof(*sampler));
+      memcpy(dst + A6XX_TEX_CONST_DWORDS, sampler, sizeof(*sampler));
    }
 }
 
@@ -813,26 +833,37 @@ tu_update_descriptor_sets(struct tu_device *device,
       const struct tu_descriptor_set_binding_layout *binding_layout =
          set->layout->binding + writeset->dstBinding;
       uint32_t *ptr = set->mapped_ptr;
-      struct tu_bo **buffer_list = set->descriptors;
+      struct tu_bo **buffer_list = set->buffers;
 
       ptr += binding_layout->offset / 4;
 
-      ptr += binding_layout->size * writeset->dstArrayElement / 4;
+      ptr += (binding_layout->size / 4) * writeset->dstArrayElement;
       buffer_list += binding_layout->buffer_offset;
       buffer_list += writeset->dstArrayElement;
       for (j = 0; j < writeset->descriptorCount; ++j) {
          switch(writeset->descriptorType) {
-         case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
-         case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: {
-            unsigned idx = writeset->dstArrayElement + j;
-            idx += binding_layout->dynamic_offset_offset;
+         case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: {
             assert(!(set->layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR));
-            write_dynamic_buffer_descriptor(device, set->dynamic_descriptors + idx,
-                        buffer_list, writeset->pBufferInfo + j);
+            unsigned idx = writeset->dstArrayElement + j;
+            idx += set->layout->input_attachment_count + binding_layout->dynamic_offset_offset;
+            write_ubo_descriptor(device, cmd_buffer,
+                                 set->dynamic_descriptors + A6XX_TEX_CONST_DWORDS * idx,
+                                 buffer_list, writeset->pBufferInfo + j);
             break;
          }
-
          case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
+            write_ubo_descriptor(device, cmd_buffer, ptr, buffer_list,
+                     writeset->pBufferInfo + j);
+            break;
+         case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: {
+            assert(!(set->layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR));
+            unsigned idx = writeset->dstArrayElement + j;
+            idx += set->layout->input_attachment_count + binding_layout->dynamic_offset_offset;
+            write_buffer_descriptor(device, cmd_buffer,
+                                    set->dynamic_descriptors + A6XX_TEX_CONST_DWORDS * idx,
+                                    buffer_list, writeset->pBufferInfo + j);
+            break;
+         }
          case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
             write_buffer_descriptor(device, cmd_buffer, ptr, buffer_list,
                      writeset->pBufferInfo + j);
@@ -844,11 +875,19 @@ tu_update_descriptor_sets(struct tu_device *device,
             break;
          case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
          case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
-         case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
             write_image_descriptor(device, cmd_buffer, ptr, buffer_list,
                                    writeset->descriptorType,
                                    writeset->pImageInfo + j);
             break;
+         case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: {
+            unsigned idx = writeset->dstArrayElement + j;
+            idx += binding_layout->input_attachment_offset;
+            write_image_descriptor(device, cmd_buffer,
+                                    set->dynamic_descriptors + A6XX_TEX_CONST_DWORDS * idx,
+                                    buffer_list, writeset->descriptorType,
+                                    writeset->pImageInfo + j);
+            break;
+         }
          case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
             write_combined_image_sampler_descriptor(device, cmd_buffer,
                                                     A6XX_TEX_CONST_DWORDS * 4,
@@ -881,8 +920,8 @@ tu_update_descriptor_sets(struct tu_device *device,
          dst_set->layout->binding + copyset->dstBinding;
       uint32_t *src_ptr = src_set->mapped_ptr;
       uint32_t *dst_ptr = dst_set->mapped_ptr;
-      struct tu_bo **src_buffer_list = src_set->descriptors;
-      struct tu_bo **dst_buffer_list = dst_set->descriptors;
+      struct tu_bo **src_buffer_list = src_set->buffers;
+      struct tu_bo **dst_buffer_list = dst_set->buffers;
 
       src_ptr += src_binding_layout->offset / 4;
       dst_ptr += dst_binding_layout->offset / 4;
@@ -902,18 +941,33 @@ tu_update_descriptor_sets(struct tu_device *device,
          case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: {
             unsigned src_idx = copyset->srcArrayElement + j;
             unsigned dst_idx = copyset->dstArrayElement + j;
-            struct tu_descriptor_range *src_range, *dst_range;
+            src_idx += src_set->layout->input_attachment_count;
+            dst_idx += dst_set->layout->input_attachment_count;
             src_idx += src_binding_layout->dynamic_offset_offset;
             dst_idx += dst_binding_layout->dynamic_offset_offset;
 
-            src_range = src_set->dynamic_descriptors + src_idx;
-            dst_range = dst_set->dynamic_descriptors + dst_idx;
-            *dst_range = *src_range;
+            uint32_t *src_dynamic, *dst_dynamic;
+            src_dynamic = src_set->dynamic_descriptors + src_idx * A6XX_TEX_CONST_DWORDS;
+            dst_dynamic = dst_set->dynamic_descriptors + dst_idx * A6XX_TEX_CONST_DWORDS;
+            memcpy(dst_dynamic, src_dynamic, A6XX_TEX_CONST_DWORDS * 4);
+            break;
+         }
+         case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: {
+            unsigned src_idx = copyset->srcArrayElement + j;
+            unsigned dst_idx = copyset->dstArrayElement + j;
+            src_idx += src_binding_layout->input_attachment_offset;
+            dst_idx += dst_binding_layout->input_attachment_offset;
+
+            uint32_t *src_dynamic, *dst_dynamic;
+            src_dynamic = src_set->dynamic_descriptors + src_idx * A6XX_TEX_CONST_DWORDS;
+            dst_dynamic = dst_set->dynamic_descriptors + dst_idx * A6XX_TEX_CONST_DWORDS;
+            memcpy(dst_dynamic, src_dynamic, A6XX_TEX_CONST_DWORDS * 4);
             break;
          }
          default:
             memcpy(dst_ptr, src_ptr, src_binding_layout->size);
          }
+
          src_ptr += src_binding_layout->size / 4;
          dst_ptr += dst_binding_layout->size / 4;
 
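Both the write and copy paths above use the same indexing convention for the per-set dynamic_descriptors array: input attachments occupy the first input_attachment_count slots, dynamic UBOs/SSBOs follow, and each slot is A6XX_TEX_CONST_DWORDS dwords wide. A minimal sketch of that addressing (tu_dynamic_desc_ptr is a hypothetical helper, not part of the patch):

    static uint32_t *
    tu_dynamic_desc_ptr(struct tu_descriptor_set *set,
                        const struct tu_descriptor_set_binding_layout *binding,
                        unsigned array_element)
    {
       /* Input attachments fill slots [0, input_attachment_count); dynamic
        * buffers follow at input_attachment_count + dynamic_offset_offset.
        */
       unsigned idx;
       if (binding->type == VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT)
          idx = binding->input_attachment_offset + array_element;
       else
          idx = set->layout->input_attachment_count +
                binding->dynamic_offset_offset + array_element;
       return set->dynamic_descriptors + idx * A6XX_TEX_CONST_DWORDS;
    }
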
index 282d75895c912934c29d28fb03917f520ffa6a5d..3a24822eb6736ce8d43b7cd8acc83d50a969fa29 100644 (file)
 
 #include <vulkan/vulkan.h>
 
-#define MAX_SETS 32
+/* The hardware supports 5 descriptor sets, but we reserve 1 for dynamic
+ * descriptors and input attachments.
+ */
+#define MAX_SETS 4
 
 struct tu_descriptor_set_binding_layout
 {
@@ -35,13 +38,24 @@ struct tu_descriptor_set_binding_layout
    /* Number of array elements in this binding */
    uint32_t array_size;
 
+   /* The size in bytes of each Vulkan descriptor. */
+   uint32_t size;
+
    uint32_t offset;
+
+   /* For descriptors that point to a buffer, index into the array of BOs to
+    * be added to the command buffer's used-BO list.
+    */
    uint32_t buffer_offset;
-   uint16_t dynamic_offset_offset;
 
-   uint16_t dynamic_offset_count;
-   /* redundant with the type, each for a single array element */
-   uint32_t size;
+   /* Index into the pDynamicOffsets array for dynamic descriptors, as well
+    * as the array of dynamic descriptors (offset by
+    * tu_pipeline_layout::set::dynamic_offset_start).
+    */
+   uint32_t dynamic_offset_offset;
+
+   /* Index into the array of dynamic input attachment descriptors */
+   uint32_t input_attachment_offset;
 
    /* Offset in the tu_descriptor_set_layout of the immutable samplers, or 0
     * if there are no immutable samplers. */
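For non-dynamic descriptors, offset and size locate a binding inside the set's mapped BO, and buffer_offset locates its BOs, exactly as consumed at the top of tu_update_descriptor_sets() above. A sketch of that addressing (hypothetical helper, same assumptions as the code above):

    static uint32_t *
    tu_desc_ptr(struct tu_descriptor_set *set,
                const struct tu_descriptor_set_binding_layout *binding,
                unsigned array_element, struct tu_bo ***buffer_list)
    {
       /* BO list entry for this element, for the cmdbuf's used-BO list */
       *buffer_list = set->buffers + binding->buffer_offset + array_element;
       /* Descriptor words inside the set's mapped BO */
       return set->mapped_ptr + binding->offset / 4 +
              (binding->size / 4) * array_element;
    }
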
@@ -61,14 +75,20 @@ struct tu_descriptor_set_layout
 
    /* Shader stages affected by this descriptor set */
    uint16_t shader_stages;
-   uint16_t dynamic_shader_stages;
-
-   /* Number of buffers in this descriptor set */
-   uint32_t buffer_count;
 
    /* Number of dynamic offsets used by this descriptor set */
    uint16_t dynamic_offset_count;
 
+   /* Number of input attachments used by the descriptor set */
+   uint16_t input_attachment_count;
+
+   /* A bitfield of which dynamic buffers are UBOs, to make
+    * descriptor-binding-time patching easier.
+    */
+   uint32_t dynamic_ubo;
+
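+   /* Number of buffers in this descriptor set */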
+   uint32_t buffer_count;
+
    bool has_immutable_samplers;
    bool has_variable_descriptors;
 
@@ -83,11 +103,13 @@ struct tu_pipeline_layout
       struct tu_descriptor_set_layout *layout;
       uint32_t size;
       uint32_t dynamic_offset_start;
+      uint32_t input_attachment_start;
    } set[MAX_SETS];
 
    uint32_t num_sets;
    uint32_t push_constant_size;
    uint32_t dynamic_offset_count;
+   uint32_t input_attachment_count;
 
    unsigned char sha1[20];
 };
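Given these fields, the per-set starts are presumably accumulated at pipeline-layout creation time roughly as follows (a sketch, not the patch's code), so that every set's input attachments and dynamic buffers land at unique slots in the reserved fifth set:

    unsigned dyn_count = 0, ia_count = 0;
    for (uint32_t set = 0; set < layout->num_sets; set++) {
       layout->set[set].dynamic_offset_start = dyn_count;
       layout->set[set].input_attachment_start = ia_count;
       dyn_count += layout->set[set].layout->dynamic_offset_count;
       ia_count += layout->set[set].layout->input_attachment_count;
    }
    layout->dynamic_offset_count = dyn_count;
    layout->input_attachment_count = ia_count;
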
index 394c3ce18009d18f06352dfb703ed0c9a39e4d43..d71a4809bc7b69e30ba225e263f534a41198da2f 100644 (file)
@@ -732,18 +732,13 @@ tu_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice,
    VkSampleCountFlags sample_counts =
       VK_SAMPLE_COUNT_1_BIT | VK_SAMPLE_COUNT_2_BIT | VK_SAMPLE_COUNT_4_BIT;
 
-   /* make sure that the entire descriptor set is addressable with a signed
-    * 32-bit int. So the sum of all limits scaled by descriptor size has to
-    * be at most 2 GiB. the combined image & samples object count as one of
-    * both. This limit is for the pipeline layout, not for the set layout, but
-    * there is no set limit, so we just set a pipeline limit. I don't think
-    * any app is going to hit this soon. */
-   size_t max_descriptor_set_size =
-      ((1ull << 31) - 16 * MAX_DYNAMIC_BUFFERS) /
-      (32 /* uniform buffer, 32 due to potential space wasted on alignment */ +
-       32 /* storage buffer, 32 due to potential space wasted on alignment */ +
-       32 /* sampler, largest when combined with image */ +
-       64 /* sampled image */ + 64 /* storage image */);
+   /* The true maximum size is unknown, but the hardware supports very large
+    * numbers of descriptors (at least 2^16). This limit is based on
+    * CP_LOAD_STATE6, which has a 28-bit field for the DWORD offset, so that
+    * we never have to worry about that field overflowing; nothing is likely
+    * to come close to this anyway.
+    */
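+   /* Assuming A6XX_TEX_CONST_DWORDS == 16, this works out to 2^24 (about
+    * 16.8 million) descriptors per set.
+    */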
+   const size_t max_descriptor_set_size = (1 << 28) / A6XX_TEX_CONST_DWORDS;
 
    VkPhysicalDeviceLimits limits = {
       .maxImageDimension1D = (1 << 14),
@@ -752,7 +747,7 @@ tu_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice,
       .maxImageDimensionCube = (1 << 14),
       .maxImageArrayLayers = (1 << 11),
       .maxTexelBufferElements = 128 * 1024 * 1024,
-      .maxUniformBufferRange = UINT32_MAX,
+      .maxUniformBufferRange = MAX_UNIFORM_BUFFER_RANGE,
       .maxStorageBufferRange = MAX_STORAGE_BUFFER_RANGE,
       .maxPushConstantsSize = MAX_PUSH_CONSTANTS_SIZE,
       .maxMemoryAllocationCount = UINT32_MAX,
@@ -765,7 +760,7 @@ tu_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice,
       .maxPerStageDescriptorStorageBuffers = max_descriptor_set_size,
       .maxPerStageDescriptorSampledImages = max_descriptor_set_size,
       .maxPerStageDescriptorStorageImages = max_descriptor_set_size,
-      .maxPerStageDescriptorInputAttachments = max_descriptor_set_size,
+      .maxPerStageDescriptorInputAttachments = MAX_RTS,
       .maxPerStageResources = max_descriptor_set_size,
       .maxDescriptorSetSamplers = max_descriptor_set_size,
       .maxDescriptorSetUniformBuffers = max_descriptor_set_size,
@@ -774,7 +769,7 @@ tu_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice,
       .maxDescriptorSetStorageBuffersDynamic = MAX_DYNAMIC_STORAGE_BUFFERS,
       .maxDescriptorSetSampledImages = max_descriptor_set_size,
       .maxDescriptorSetStorageImages = max_descriptor_set_size,
-      .maxDescriptorSetInputAttachments = max_descriptor_set_size,
+      .maxDescriptorSetInputAttachments = MAX_RTS,
       .maxVertexInputAttributes = 32,
       .maxVertexInputBindings = 32,
       .maxVertexInputAttributeOffset = 4095,
@@ -814,8 +809,8 @@ tu_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice,
       .viewportSubPixelBits = 8,
       .minMemoryMapAlignment = 4096, /* A page */
       .minTexelBufferOffsetAlignment = 64,
-      .minUniformBufferOffsetAlignment = 4,
-      .minStorageBufferOffsetAlignment = 4,
+      .minUniformBufferOffsetAlignment = 64,
+      .minStorageBufferOffsetAlignment = 64,
       .minTexelOffset = -32,
       .maxTexelOffset = 31,
       .minTexelGatherOffset = -32,
@@ -1715,7 +1710,7 @@ tu_GetBufferMemoryRequirements(VkDevice _device,
    TU_FROM_HANDLE(tu_buffer, buffer, _buffer);
 
    pMemoryRequirements->memoryTypeBits = 1;
-   pMemoryRequirements->alignment = 16;
+   pMemoryRequirements->alignment = 64;
    pMemoryRequirements->size =
       align64(buffer->size, pMemoryRequirements->alignment);
 }
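The three alignments bumped to 64 above (uniform and storage buffer offsets, and buffer memory) appear to line up with the descriptor slot size: one slot is A6XX_TEX_CONST_DWORDS dwords, i.e. 64 bytes under the usual A6XX_TEX_CONST_DWORDS == 16. A one-line sanity check under that assumption:

    _Static_assert(A6XX_TEX_CONST_DWORDS * 4 == 64,
                   "one descriptor slot per 64 bytes");
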
index 5d36dfcaf3fc41fab1bb8a72968bd7dc9400d72e..dc2a568a59c5f0bc0564fd75a4695182e6621d43 100644 (file)
@@ -333,13 +333,18 @@ tu6_blend_op(VkBlendOp op)
    }
 }
 
-static unsigned
-tu_shader_nibo(const struct tu_shader *shader)
-{
-   /* Don't use ir3_shader_nibo(), because that would include declared but
-    * unused storage images and SSBOs.
-    */
-   return shader->ssbo_map.num_desc + shader->image_map.num_desc;
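+/* The ENABLED and BINDLESS_* bits appear to be at the same positions in
+ * every stage's SP_xS_CONFIG register, so the A6XX_SP_VS_CONFIG_* defines
+ * can serve all of VS/HS/DS/GS/FS/CS here.
+ */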
+static uint32_t
+emit_xs_config(const struct ir3_shader_variant *sh)
+{
+   if (sh->instrlen) {
+      return A6XX_SP_VS_CONFIG_ENABLED |
+         COND(sh->bindless_tex, A6XX_SP_VS_CONFIG_BINDLESS_TEX) |
+         COND(sh->bindless_samp, A6XX_SP_VS_CONFIG_BINDLESS_SAMP) |
+         COND(sh->bindless_ibo, A6XX_SP_VS_CONFIG_BINDLESS_IBO) |
+         COND(sh->bindless_ubo, A6XX_SP_VS_CONFIG_BINDLESS_UBO);
+   } else {
+      return 0;
+   }
 }
 
 static void
@@ -356,16 +361,11 @@ tu6_emit_vs_config(struct tu_cs *cs, struct tu_shader *shader,
    if (vs->need_fine_derivatives)
       sp_vs_ctrl |= A6XX_SP_VS_CTRL_REG0_DIFF_FINE;
 
-   uint32_t sp_vs_config = A6XX_SP_VS_CONFIG_NTEX(shader->texture_map.num_desc) |
-                           A6XX_SP_VS_CONFIG_NSAMP(shader->sampler_map.num_desc);
-   if (vs->instrlen)
-      sp_vs_config |= A6XX_SP_VS_CONFIG_ENABLED;
-
    tu_cs_emit_pkt4(cs, REG_A6XX_SP_VS_CTRL_REG0, 1);
    tu_cs_emit(cs, sp_vs_ctrl);
 
    tu_cs_emit_pkt4(cs, REG_A6XX_SP_VS_CONFIG, 2);
-   tu_cs_emit(cs, sp_vs_config);
+   tu_cs_emit(cs, emit_xs_config(vs));
    tu_cs_emit(cs, vs->instrlen);
 
    tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_VS_CNTL, 1);
@@ -377,15 +377,11 @@ static void
 tu6_emit_hs_config(struct tu_cs *cs, struct tu_shader *shader,
                    const struct ir3_shader_variant *hs)
 {
-   uint32_t sp_hs_config = 0;
-   if (hs->instrlen)
-      sp_hs_config |= A6XX_SP_HS_CONFIG_ENABLED;
-
    tu_cs_emit_pkt4(cs, REG_A6XX_SP_HS_UNKNOWN_A831, 1);
    tu_cs_emit(cs, 0);
 
    tu_cs_emit_pkt4(cs, REG_A6XX_SP_HS_CONFIG, 2);
-   tu_cs_emit(cs, sp_hs_config);
+   tu_cs_emit(cs, emit_xs_config(hs));
    tu_cs_emit(cs, hs->instrlen);
 
    tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_HS_CNTL, 1);
@@ -396,12 +392,8 @@ static void
 tu6_emit_ds_config(struct tu_cs *cs, struct tu_shader *shader,
                    const struct ir3_shader_variant *ds)
 {
-   uint32_t sp_ds_config = 0;
-   if (ds->instrlen)
-      sp_ds_config |= A6XX_SP_DS_CONFIG_ENABLED;
-
    tu_cs_emit_pkt4(cs, REG_A6XX_SP_DS_CONFIG, 2);
-   tu_cs_emit(cs, sp_ds_config);
+   tu_cs_emit(cs, emit_xs_config(ds));
    tu_cs_emit(cs, ds->instrlen);
 
    tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_DS_CNTL, 1);
@@ -417,11 +409,7 @@ tu6_emit_gs_config(struct tu_cs *cs, struct tu_shader *shader,
    tu_cs_emit(cs, 0);
 
    tu_cs_emit_pkt4(cs, REG_A6XX_SP_GS_CONFIG, 2);
-   tu_cs_emit(cs, COND(has_gs,
-                       A6XX_SP_GS_CONFIG_ENABLED |
-                       A6XX_SP_GS_CONFIG_NIBO(ir3_shader_nibo(gs)) |
-                       A6XX_SP_GS_CONFIG_NTEX(gs->num_samp) |
-                       A6XX_SP_GS_CONFIG_NSAMP(gs->num_samp)));
+   tu_cs_emit(cs, emit_xs_config(gs));
    tu_cs_emit(cs, gs->instrlen);
 
    tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_GS_CNTL, 1);
@@ -445,31 +433,16 @@ tu6_emit_fs_config(struct tu_cs *cs, struct tu_shader *shader,
    if (fs->need_fine_derivatives)
       sp_fs_ctrl |= A6XX_SP_FS_CTRL_REG0_DIFF_FINE;
 
-   uint32_t sp_fs_config = 0;
-   unsigned shader_nibo = 0;
-   if (shader) {
-      shader_nibo = tu_shader_nibo(shader);
-      sp_fs_config = A6XX_SP_FS_CONFIG_NTEX(shader->texture_map.num_desc) |
-                     A6XX_SP_FS_CONFIG_NSAMP(shader->sampler_map.num_desc) |
-                     A6XX_SP_FS_CONFIG_NIBO(shader_nibo);
-   }
-
-   if (fs->instrlen)
-      sp_fs_config |= A6XX_SP_FS_CONFIG_ENABLED;
-
    tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_CTRL_REG0, 1);
    tu_cs_emit(cs, sp_fs_ctrl);
 
    tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_CONFIG, 2);
-   tu_cs_emit(cs, sp_fs_config);
+   tu_cs_emit(cs, emit_xs_config(fs));
    tu_cs_emit(cs, fs->instrlen);
 
    tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_FS_CNTL, 1);
    tu_cs_emit(cs, A6XX_HLSQ_FS_CNTL_CONSTLEN(align(fs->constlen, 4)) |
                   A6XX_HLSQ_FS_CNTL_ENABLED);
-
-   tu_cs_emit_pkt4(cs, REG_A6XX_SP_IBO_COUNT, 1);
-   tu_cs_emit(cs, shader_nibo);
 }
 
 static void
@@ -485,10 +458,7 @@ tu6_emit_cs_config(struct tu_cs *cs, const struct tu_shader *shader,
               A6XX_HLSQ_CS_CNTL_ENABLED);
 
    tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_CONFIG, 2);
-   tu_cs_emit(cs, A6XX_SP_CS_CONFIG_ENABLED |
-              A6XX_SP_CS_CONFIG_NIBO(tu_shader_nibo(shader)) |
-              A6XX_SP_CS_CONFIG_NTEX(shader->texture_map.num_desc) |
-              A6XX_SP_CS_CONFIG_NSAMP(shader->sampler_map.num_desc));
+   tu_cs_emit(cs, emit_xs_config(v));
    tu_cs_emit(cs, v->instrlen);
 
    tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_CTRL_REG0, 1);
@@ -514,9 +484,6 @@ tu6_emit_cs_config(struct tu_cs *cs, const struct tu_shader *shader,
               A6XX_HLSQ_CS_CNTL_0_UNK1(regid(63, 0)) |
               A6XX_HLSQ_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
    tu_cs_emit(cs, 0x2fc);             /* HLSQ_CS_UNKNOWN_B998 */
-
-   tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_IBO_COUNT, 1);
-   tu_cs_emit(cs, tu_shader_nibo(shader));
 }
 
 static void
@@ -1013,6 +980,16 @@ tu6_emit_fs_inputs(struct tu_cs *cs, const struct ir3_shader_variant *fs)
                      A6XX_SP_FS_PREFETCH_CMD_CMD(prefetch->cmd));
    }
 
+   if (fs->num_sampler_prefetch > 0) {
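+      /* With bindless samplers, each prefetch also carries the bindless IDs
+       * of its sampler and texture descriptors.
+       */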
+      tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_BINDLESS_PREFETCH_CMD(0), fs->num_sampler_prefetch);
+      for (int i = 0; i < fs->num_sampler_prefetch; i++) {
+         const struct ir3_sampler_prefetch *prefetch = &fs->sampler_prefetch[i];
+         tu_cs_emit(cs,
+                    A6XX_SP_FS_BINDLESS_PREFETCH_CMD_SAMP_ID(prefetch->samp_bindless_id) |
+                    A6XX_SP_FS_BINDLESS_PREFETCH_CMD_TEX_ID(prefetch->tex_bindless_id));
+      }
+   }
+
    tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CONTROL_1_REG, 5);
    tu_cs_emit(cs, 0x7);
    tu_cs_emit(cs, A6XX_HLSQ_CONTROL_2_REG_FACEREGID(face_regid) |
@@ -1949,11 +1926,6 @@ tu_pipeline_set_linkage(struct tu_program_descriptor_linkage *link,
    link->const_state = v->shader->const_state;
    link->constlen = v->constlen;
    link->push_consts = shader->push_consts;
-   link->texture_map = shader->texture_map;
-   link->sampler_map = shader->sampler_map;
-   link->ubo_map = shader->ubo_map;
-   link->ssbo_map = shader->ssbo_map;
-   link->image_map = shader->image_map;
 }
 
 static void
@@ -1984,6 +1956,12 @@ tu_pipeline_builder_parse_shader_stages(struct tu_pipeline_builder *builder,
                               builder->shaders[i],
                               &builder->shaders[i]->variants[0]);
    }
+
+   if (builder->shaders[MESA_SHADER_FRAGMENT]) {
+      memcpy(pipeline->program.input_attachment_idx,
+             builder->shaders[MESA_SHADER_FRAGMENT]->attachment_idx,
+             sizeof(pipeline->program.input_attachment_idx));
+   }
 }
 
 static void
@@ -2209,6 +2187,8 @@ tu_pipeline_builder_build(struct tu_pipeline_builder *builder,
    if (result != VK_SUCCESS)
       return result;
 
+   (*pipeline)->layout = builder->layout;
+
    /* compile and upload shaders */
    result = tu_pipeline_builder_compile_shaders(builder);
    if (result == VK_SUCCESS)
index 3c50b2ec0194b387be19ee19e8ff9b0b1fd5bbcf..87e4df85ff5ea27f8c2c8f60fdf7c9c458531761 100644 (file)
@@ -98,6 +98,12 @@ typedef uint32_t xcb_window_t;
 #define MAX_VIEWS 8
 /* The Qualcomm driver exposes 0x20000058 */
 #define MAX_STORAGE_BUFFER_RANGE 0x20000000
+/* We use ldc for uniform buffer loads, just like the Qualcomm driver, so
+ * expose the same maximum range.
+ * TODO: The SIZE bitfield is 15 bits, in units of 4 dwords (16 bytes), so
+ * the actual limit may be as high as 2^15 * 16 = 512 KiB.
+ */
+#define MAX_UNIFORM_BUFFER_RANGE 0x10000
 
 #define NUM_DEPTH_CLEAR_PIPELINES 3
 
@@ -615,13 +621,15 @@ struct tu_descriptor_range
 struct tu_descriptor_set
 {
    const struct tu_descriptor_set_layout *layout;
+   struct tu_descriptor_pool *pool;
    uint32_t size;
 
    uint64_t va;
    uint32_t *mapped_ptr;
-   struct tu_descriptor_range *dynamic_descriptors;
 
-   struct tu_bo *descriptors[0];
+   uint32_t *dynamic_descriptors;
+
+   struct tu_bo *buffers[0];
 };
 
 struct tu_push_descriptor_set
@@ -806,7 +814,8 @@ struct tu_descriptor_state
    uint32_t valid;
    struct tu_push_descriptor_set push_set;
    bool push_dirty;
-   uint64_t dynamic_buffers[MAX_DYNAMIC_BUFFERS];
+   uint32_t dynamic_descriptors[MAX_DYNAMIC_BUFFERS * A6XX_TEX_CONST_DWORDS];
+   uint32_t input_attachments[MAX_RTS * A6XX_TEX_CONST_DWORDS];
 };
 
 struct tu_tile
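At draw time, the two staging arrays above get combined into a single on-the-fly allocation whose address is loaded into the reserved fifth base register; roughly (the helper and allocation details are assumptions, not the patch's code):

    static void
    fill_dynamic_set(uint32_t *out /* allocated per draw */,
                     const struct tu_descriptor_state *state,
                     const struct tu_pipeline_layout *layout)
    {
       unsigned ia_dwords = layout->input_attachment_count * A6XX_TEX_CONST_DWORDS;
       unsigned dyn_dwords = layout->dynamic_offset_count * A6XX_TEX_CONST_DWORDS;

       /* Input attachments first, dynamic buffers after, matching the
        * indexing convention in tu_update_descriptor_sets(). */
       memcpy(out, state->input_attachments, ia_dwords * 4);
       memcpy(out + ia_dwords, state->dynamic_descriptors, dyn_dwords * 4);
    }
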
@@ -845,8 +854,10 @@ enum tu_cmd_dirty_bits
    TU_CMD_DIRTY_COMPUTE_PIPELINE = 1 << 1,
    TU_CMD_DIRTY_VERTEX_BUFFERS = 1 << 2,
    TU_CMD_DIRTY_DESCRIPTOR_SETS = 1 << 3,
-   TU_CMD_DIRTY_PUSH_CONSTANTS = 1 << 4,
-   TU_CMD_DIRTY_STREAMOUT_BUFFERS = 1 << 5,
+   TU_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS = 1 << 4,
+   TU_CMD_DIRTY_PUSH_CONSTANTS = 1 << 5,
+   TU_CMD_DIRTY_STREAMOUT_BUFFERS = 1 << 6,
+   TU_CMD_DIRTY_INPUT_ATTACHMENTS = 1 << 7,
 
    TU_CMD_DIRTY_DYNAMIC_LINE_WIDTH = 1 << 16,
    TU_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK = 1 << 17,
@@ -1118,17 +1129,6 @@ struct tu_shader_compile_options
    bool include_binning_pass;
 };
 
-struct tu_descriptor_map
-{
-   /* TODO: avoid fixed size array/justify the size */
-   unsigned num; /* number of array entries */
-   unsigned num_desc; /* Number of descriptors (sum of array_size[]) */
-   int set[128];
-   int binding[128];
-   int value[128];
-   int array_size[128];
-};
-
 struct tu_push_constant_range
 {
    uint32_t lo;
@@ -1140,11 +1140,7 @@ struct tu_shader
    struct ir3_shader ir3_shader;
 
    struct tu_push_constant_range push_consts;
-   struct tu_descriptor_map texture_map;
-   struct tu_descriptor_map sampler_map;
-   struct tu_descriptor_map ubo_map;
-   struct tu_descriptor_map ssbo_map;
-   struct tu_descriptor_map image_map;
+   unsigned attachment_idx[MAX_RTS];
 
    /* This may be true for vertex shaders.  When true, variants[1] is the
     * binning variant and binning_binary is non-NULL.
@@ -1189,11 +1185,6 @@ struct tu_program_descriptor_linkage
    uint32_t constlen;
 
    struct tu_push_constant_range push_consts;
-   struct tu_descriptor_map texture_map;
-   struct tu_descriptor_map sampler_map;
-   struct tu_descriptor_map ubo_map;
-   struct tu_descriptor_map ssbo_map;
-   struct tu_descriptor_map image_map;
 };
 
 struct tu_pipeline
@@ -1216,6 +1207,7 @@ struct tu_pipeline
       struct tu_cs_entry binning_state_ib;
 
       struct tu_program_descriptor_linkage link[MESA_SHADER_STAGES];
+      unsigned input_attachment_idx[MAX_RTS];
    } program;
 
    struct
index deb6d895feb49c067e87535716b4b5e5eadeff41..85bf6bbc50ffa269213f5342b8fea34f49c7bc8a 100644 (file)
@@ -83,129 +83,6 @@ tu_spirv_to_nir(struct ir3_compiler *compiler,
    return nir;
 }
 
-static unsigned
-map_add(struct tu_descriptor_map *map, int set, int binding, int value,
-        int array_size)
-{
-   unsigned index = 0;
-   for (unsigned i = 0; i < map->num; i++) {
-      if (set == map->set[i] && binding == map->binding[i]) {
-         assert(value == map->value[i]);
-         assert(array_size == map->array_size[i]);
-         return index;
-      }
-      index += map->array_size[i];
-   }
-
-   assert(index == map->num_desc);
-
-   map->set[map->num] = set;
-   map->binding[map->num] = binding;
-   map->value[map->num] = value;
-   map->array_size[map->num] = array_size;
-   map->num++;
-   map->num_desc += array_size;
-
-   return index;
-}
-
-static void
-lower_tex_src_to_offset(nir_builder *b, nir_tex_instr *instr, unsigned src_idx,
-                        struct tu_shader *shader,
-                        const struct tu_pipeline_layout *layout)
-{
-   nir_ssa_def *index = NULL;
-   unsigned base_index = 0;
-   unsigned array_elements = 1;
-   nir_tex_src *src = &instr->src[src_idx];
-   bool is_sampler = src->src_type == nir_tex_src_sampler_deref;
-
-   /* We compute first the offsets */
-   nir_deref_instr *deref = nir_instr_as_deref(src->src.ssa->parent_instr);
-   while (deref->deref_type != nir_deref_type_var) {
-      assert(deref->parent.is_ssa);
-      nir_deref_instr *parent =
-         nir_instr_as_deref(deref->parent.ssa->parent_instr);
-
-      assert(deref->deref_type == nir_deref_type_array);
-
-      if (nir_src_is_const(deref->arr.index) && index == NULL) {
-         /* We're still building a direct index */
-         base_index += nir_src_as_uint(deref->arr.index) * array_elements;
-      } else {
-         if (index == NULL) {
-            /* We used to be direct but not anymore */
-            index = nir_imm_int(b, base_index);
-            base_index = 0;
-         }
-
-         index = nir_iadd(b, index,
-                          nir_imul(b, nir_imm_int(b, array_elements),
-                                   nir_ssa_for_src(b, deref->arr.index, 1)));
-      }
-
-      array_elements *= glsl_get_length(parent->type);
-
-      deref = parent;
-   }
-
-   if (index)
-      index = nir_umin(b, index, nir_imm_int(b, array_elements - 1));
-
-   /* We have the offsets, we apply them, rewriting the source or removing
-    * instr if needed
-    */
-   if (index) {
-      nir_instr_rewrite_src(&instr->instr, &src->src,
-                            nir_src_for_ssa(index));
-
-      src->src_type = is_sampler ?
-         nir_tex_src_sampler_offset :
-         nir_tex_src_texture_offset;
-   } else {
-      nir_tex_instr_remove_src(instr, src_idx);
-   }
-
-   uint32_t set = deref->var->data.descriptor_set;
-   uint32_t binding = deref->var->data.binding;
-   struct tu_descriptor_set_layout *set_layout = layout->set[set].layout;
-   struct tu_descriptor_set_binding_layout *binding_layout =
-      &set_layout->binding[binding];
-
-   int desc_index = map_add(is_sampler ?
-                            &shader->sampler_map : &shader->texture_map,
-                            deref->var->data.descriptor_set,
-                            deref->var->data.binding,
-                            deref->var->data.index,
-                            binding_layout->array_size) + base_index;
-   if (is_sampler)
-      instr->sampler_index = desc_index;
-   else
-      instr->texture_index = desc_index;
-}
-
-static bool
-lower_sampler(nir_builder *b, nir_tex_instr *instr, struct tu_shader *shader,
-                const struct tu_pipeline_layout *layout)
-{
-   int texture_idx =
-      nir_tex_instr_src_index(instr, nir_tex_src_texture_deref);
-
-   if (texture_idx >= 0)
-      lower_tex_src_to_offset(b, instr, texture_idx, shader, layout);
-
-   int sampler_idx =
-      nir_tex_instr_src_index(instr, nir_tex_src_sampler_deref);
-
-   if (sampler_idx >= 0)
-      lower_tex_src_to_offset(b, instr, sampler_idx, shader, layout);
-
-   if (texture_idx < 0 && sampler_idx < 0)
-      return false;
-
-   return true;
-}
-
 static void
 lower_load_push_constant(nir_builder *b, nir_intrinsic_instr *instr,
                          struct tu_shader *shader)
@@ -234,66 +111,108 @@ lower_vulkan_resource_index(nir_builder *b, nir_intrinsic_instr *instr,
                             struct tu_shader *shader,
                             const struct tu_pipeline_layout *layout)
 {
-   nir_const_value *const_val = nir_src_as_const_value(instr->src[0]);
+   nir_ssa_def *vulkan_idx = instr->src[0].ssa;
 
    unsigned set = nir_intrinsic_desc_set(instr);
    unsigned binding = nir_intrinsic_binding(instr);
    struct tu_descriptor_set_layout *set_layout = layout->set[set].layout;
    struct tu_descriptor_set_binding_layout *binding_layout =
       &set_layout->binding[binding];
-   unsigned index = 0;
+   uint32_t base;
 
-   switch (nir_intrinsic_desc_type(instr)) {
-   case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
+   switch (binding_layout->type) {
    case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
-      if (!const_val)
-         tu_finishme("non-constant vulkan_resource_index array index");
-      /* skip index 0 which is used for push constants */
-      index = map_add(&shader->ubo_map, set, binding, 0,
-                      binding_layout->array_size) + 1;
-      index += const_val->u32;
-      break;
-   case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
    case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
-      if (!const_val)
-         tu_finishme("non-constant vulkan_resource_index array index");
-      index = map_add(&shader->ssbo_map, set, binding, 0,
-                      binding_layout->array_size);
-      index += const_val->u32;
+      base = layout->set[set].dynamic_offset_start +
+         binding_layout->dynamic_offset_offset +
+         layout->input_attachment_count;
+      set = MAX_SETS;
       break;
    default:
-      tu_finishme("unsupported desc_type for vulkan_resource_index");
+      base = binding_layout->offset / (4 * A6XX_TEX_CONST_DWORDS);
       break;
    }
 
+   nir_intrinsic_instr *bindless =
+      nir_intrinsic_instr_create(b->shader,
+                                 nir_intrinsic_bindless_resource_ir3);
+   bindless->num_components = 1;
+   nir_ssa_dest_init(&bindless->instr, &bindless->dest,
+                     1, 32, NULL);
+   nir_intrinsic_set_desc_set(bindless, set);
+   bindless->src[0] = nir_src_for_ssa(nir_iadd(b, nir_imm_int(b, base), vulkan_idx));
+   nir_builder_instr_insert(b, &bindless->instr);
+
    nir_ssa_def_rewrite_uses(&instr->dest.ssa,
-                            nir_src_for_ssa(nir_imm_int(b, index)));
+                            nir_src_for_ssa(&bindless->dest.ssa));
    nir_instr_remove(&instr->instr);
 }
 
-static void
-lower_image_deref(nir_builder *b,
-                  nir_intrinsic_instr *instr, struct tu_shader *shader,
-                  const struct tu_pipeline_layout *layout)
+static nir_ssa_def *
+build_bindless(nir_builder *b, nir_deref_instr *deref, bool is_sampler,
+               struct tu_shader *shader,
+               const struct tu_pipeline_layout *layout)
 {
-   nir_deref_instr *deref = nir_src_as_deref(instr->src[0]);
    nir_variable *var = nir_deref_instr_get_variable(deref);
 
-   uint32_t set = var->data.descriptor_set;
-   uint32_t binding = var->data.binding;
-   struct tu_descriptor_set_layout *set_layout = layout->set[set].layout;
-   struct tu_descriptor_set_binding_layout *binding_layout =
-      &set_layout->binding[binding];
+   unsigned set = var->data.descriptor_set;
+   unsigned binding = var->data.binding;
+   const struct tu_descriptor_set_binding_layout *bind_layout =
+      &layout->set[set].layout->binding[binding];
+
+   nir_ssa_def *desc_offset;
+   unsigned descriptor_stride;
+   if (bind_layout->type == VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT) {
+      unsigned offset =
+         layout->set[set].input_attachment_start +
+         bind_layout->input_attachment_offset;
+      desc_offset = nir_imm_int(b, offset);
+      set = MAX_SETS;
+      descriptor_stride = 1;
+   } else {
+      unsigned offset = 0;
+      /* Samplers come second in combined image/sampler descriptors, see
+       * write_combined_image_sampler_descriptor().
+       */
+      if (is_sampler && bind_layout->type ==
+          VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) {
+         offset = 1;
+      }
+      desc_offset =
+         nir_imm_int(b, (bind_layout->offset / (4 * A6XX_TEX_CONST_DWORDS)) +
+                     offset);
+      descriptor_stride = bind_layout->size / (4 * A6XX_TEX_CONST_DWORDS);
+   }
 
-   nir_ssa_def *index = nir_imm_int(b,
-                                    map_add(&shader->image_map,
-                                            set, binding, var->data.index,
-                                            binding_layout->array_size));
    if (deref->deref_type != nir_deref_type_var) {
       assert(deref->deref_type == nir_deref_type_array);
-      index = nir_iadd(b, index, nir_ssa_for_src(b, deref->arr.index, 1));
+
+      nir_ssa_def *arr_index = nir_ssa_for_src(b, deref->arr.index, 1);
+      desc_offset = nir_iadd(b, desc_offset,
+                             nir_imul_imm(b, arr_index, descriptor_stride));
    }
-   nir_rewrite_image_intrinsic(instr, index, false);
+
+   nir_intrinsic_instr *bindless =
+      nir_intrinsic_instr_create(b->shader,
+                                 nir_intrinsic_bindless_resource_ir3);
+   bindless->num_components = 1;
+   nir_ssa_dest_init(&bindless->instr, &bindless->dest,
+                     1, 32, NULL);
+   nir_intrinsic_set_desc_set(bindless, set);
+   bindless->src[0] = nir_src_for_ssa(desc_offset);
+   nir_builder_instr_insert(b, &bindless->instr);
+
+   return &bindless->dest.ssa;
+}
+
+static void
+lower_image_deref(nir_builder *b,
+                  nir_intrinsic_instr *instr, struct tu_shader *shader,
+                  const struct tu_pipeline_layout *layout)
+{
+   nir_deref_instr *deref = nir_src_as_deref(instr->src[0]);
+   nir_ssa_def *bindless = build_bindless(b, deref, false, shader, layout);
+   nir_rewrite_image_intrinsic(instr, bindless, true);
 }
 
 static bool
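As a worked example of the offset math in build_bindless(): for a combined image/sampler binding at byte offset 128 within its set, with A6XX_TEX_CONST_DWORDS assumed to be 16 (64-byte slots) and bind_layout->size == 128 (two slots per array element):

    unsigned i = 1;                             /* array element        */
    unsigned base   = 128 / (4 * 16);           /* offset in slots: 2   */
    unsigned stride = 128 / (4 * 16);           /* size in slots:   2   */
    unsigned tex_slot  = base + stride * i;     /* 4                    */
    unsigned samp_slot = base + stride * i + 1; /* 5, sampler is second */
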
@@ -331,9 +250,6 @@ lower_intrinsic(nir_builder *b, nir_intrinsic_instr *instr,
    case nir_intrinsic_image_deref_atomic_comp_swap:
    case nir_intrinsic_image_deref_size:
    case nir_intrinsic_image_deref_samples:
-   case nir_intrinsic_image_deref_load_param_intel:
-   case nir_intrinsic_image_deref_load_raw_intel:
-   case nir_intrinsic_image_deref_store_raw_intel:
       lower_image_deref(b, instr, shader, layout);
       return true;
 
@@ -342,6 +258,59 @@ lower_intrinsic(nir_builder *b, nir_intrinsic_instr *instr,
    }
 }
 
+static bool
+lower_tex(nir_builder *b, nir_tex_instr *tex,
+          struct tu_shader *shader, const struct tu_pipeline_layout *layout)
+{
+   int sampler_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_sampler_deref);
+   if (sampler_src_idx >= 0) {
+      nir_deref_instr *deref = nir_src_as_deref(tex->src[sampler_src_idx].src);
+      nir_ssa_def *bindless = build_bindless(b, deref, true, shader, layout);
+      nir_instr_rewrite_src(&tex->instr, &tex->src[sampler_src_idx].src,
+                            nir_src_for_ssa(bindless));
+      tex->src[sampler_src_idx].src_type = nir_tex_src_sampler_handle;
+   }
+
+   int tex_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_deref);
+   if (tex_src_idx >= 0) {
+      nir_deref_instr *deref = nir_src_as_deref(tex->src[tex_src_idx].src);
+      nir_ssa_def *bindless = build_bindless(b, deref, false, shader, layout);
+      nir_instr_rewrite_src(&tex->instr, &tex->src[tex_src_idx].src,
+                            nir_src_for_ssa(bindless));
+      tex->src[tex_src_idx].src_type = nir_tex_src_texture_handle;
+   }
+
+   return true;
+}
+
+static bool
+lower_impl(nir_function_impl *impl, struct tu_shader *shader,
+            const struct tu_pipeline_layout *layout)
+{
+   nir_builder b;
+   nir_builder_init(&b, impl);
+   bool progress = false;
+
+   nir_foreach_block(block, impl) {
+      nir_foreach_instr_safe(instr, block) {
+         b.cursor = nir_before_instr(instr);
+         switch (instr->type) {
+         case nir_instr_type_tex:
+            progress |= lower_tex(&b, nir_instr_as_tex(instr), shader, layout);
+            break;
+         case nir_instr_type_intrinsic:
+            progress |= lower_intrinsic(&b, nir_instr_as_intrinsic(instr), shader, layout);
+            break;
+         default:
+            break;
+         }
+      }
+   }
+
+   return progress;
+}
+
 /* Figure out the range of push constants that we're actually going to push to
  * the shader, and tell the backend to reserve this range when pushing UBO
  * constants.
@@ -391,31 +360,36 @@ gather_push_constants(nir_shader *shader, struct tu_shader *tu_shader)
       align(tu_shader->push_consts.count, 4);
 }
 
-static bool
-lower_impl(nir_function_impl *impl, struct tu_shader *shader,
-            const struct tu_pipeline_layout *layout)
+/* Gather the InputAttachmentIndex for each input attachment from the NIR
+ * shader and organize it so that draw-time patching is easy.
+ */
+static void
+gather_input_attachments(nir_shader *shader, struct tu_shader *tu_shader,
+                         const struct tu_pipeline_layout *layout)
 {
-   nir_builder b;
-   nir_builder_init(&b, impl);
-   bool progress = false;
+   nir_foreach_variable(var, &shader->uniforms) {
+      const struct glsl_type *glsl_type = glsl_without_array(var->type);
 
-   nir_foreach_block(block, impl) {
-      nir_foreach_instr_safe(instr, block) {
-         b.cursor = nir_before_instr(instr);
-         switch (instr->type) {
-         case nir_instr_type_tex:
-            progress |= lower_sampler(&b, nir_instr_as_tex(instr), shader, layout);
-            break;
-         case nir_instr_type_intrinsic:
-            progress |= lower_intrinsic(&b, nir_instr_as_intrinsic(instr), shader, layout);
-            break;
-         default:
-            break;
-         }
+      if (!glsl_type_is_image(glsl_type))
+         continue;
+
+      enum glsl_sampler_dim dim = glsl_get_sampler_dim(glsl_type);
+
+      const uint32_t set = var->data.descriptor_set;
+      const uint32_t binding = var->data.binding;
+      const struct tu_descriptor_set_binding_layout *bind_layout =
+            &layout->set[set].layout->binding[binding];
+      const uint32_t array_size = bind_layout->array_size;
+
+      if (dim == GLSL_SAMPLER_DIM_SUBPASS ||
+          dim == GLSL_SAMPLER_DIM_SUBPASS_MS) {
+         unsigned offset =
+            layout->set[set].input_attachment_start +
+            bind_layout->input_attachment_offset;
+         for (unsigned i = 0; i < array_size; i++)
+            tu_shader->attachment_idx[offset + i] = var->data.index + i;
       }
    }
-
-   return progress;
 }
 
 static bool
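With attachment_idx filled in, draw-time patching of input attachments reduces to a table lookup per slot of the reserved set; a rough sketch (the patching helper is hypothetical):

    for (unsigned i = 0; i < layout->input_attachment_count; i++) {
       unsigned att = pipeline->program.input_attachment_idx[i];
       uint32_t *dst = descriptors_state->input_attachments +
                       i * A6XX_TEX_CONST_DWORDS;
       /* Write the texture descriptor for subpass input attachment `att`
        * into dst, based on the current render pass state. */
       patch_input_attachment(cmd, dst, att); /* hypothetical */
    }
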
@@ -425,18 +399,13 @@ tu_lower_io(nir_shader *shader, struct tu_shader *tu_shader,
    bool progress = false;
 
    gather_push_constants(shader, tu_shader);
+   gather_input_attachments(shader, tu_shader, layout);
 
    nir_foreach_function(function, shader) {
       if (function->impl)
          progress |= lower_impl(function->impl, tu_shader, layout);
    }
 
-   /* spirv_to_nir produces num_ssbos equal to the number of SSBO-containing
-    * variables, while ir3 wants the number of descriptors (like the gallium
-    * path).
-    */
-   shader->info.num_ssbos = tu_shader->ssbo_map.num_desc;
-
    return progress;
 }
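
For reference, a sketch (approximate NIR, not actual compiler output) of what tu_lower_io now produces for a non-dynamic UBO at set 1, binding 2:

    /* before:
     *   ssa_1 = intrinsic vulkan_resource_index (ssa_0) (set=1, binding=2, UBO)
     * after:
     *   ssa_2 = iadd base, ssa_0   // base = binding offset in 64-byte slots
     *   ssa_3 = intrinsic bindless_resource_ir3 (ssa_2) (desc_set=1)
     */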