descriptors_state->sets[idx] = set;
descriptors_state->valid |= (1u << idx);
+ /* Note: the actual input attachment indices come from the shader
+ * itself, so we can't generate the patched versions of these until
+ * draw time when both the pipeline and descriptors are bound and
+ * we're inside the render pass.
+ */
+ unsigned dst_idx = layout->set[idx].input_attachment_start;
+ memcpy(&descriptors_state->input_attachments[dst_idx * A6XX_TEX_CONST_DWORDS],
+ set->dynamic_descriptors,
+ set->layout->input_attachment_count * A6XX_TEX_CONST_DWORDS * 4);
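+ /* Illustrative example (not part of the original change): with
+ * A6XX_TEX_CONST_DWORDS = 16, if the bound set's input_attachment_start is
+ * 2 and its layout declares 3 input attachments, then 3 * 16 dwords are
+ * copied into input_attachments[] starting at dword 2 * 16 = 32.
+ */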
+
for(unsigned j = 0; j < set->layout->dynamic_offset_count; ++j, ++dyn_idx) {
- unsigned idx = j + layout->set[i + firstSet].dynamic_offset_start;
+ /* Dynamic buffers come after input attachments in the descriptor set
+ * itself, but due to how the Vulkan descriptor set binding works, we
+ * have to put input attachments and dynamic buffers in separate
+ * buffers in the descriptor_state and then combine them at draw
+ * time. Binding a descriptor set only invalidates the descriptor
+ * sets after it, but if we try to tightly pack the descriptors after
+ * the input attachments then we could corrupt dynamic buffers in the
+ * descriptor set before it, or we'd have to move all the dynamic
+ * buffers over. We just put them into separate buffers to make
+ * binding as well as the later patching of input attachments easy.
+ */
+ unsigned src_idx = j + set->layout->input_attachment_count;
+ unsigned dst_idx = j + layout->set[idx].dynamic_offset_start;
assert(dyn_idx < dynamicOffsetCount);
- descriptors_state->dynamic_buffers[idx] =
- set->dynamic_descriptors[j].va + pDynamicOffsets[dyn_idx];
+ uint32_t *dst =
+ &descriptors_state->dynamic_descriptors[dst_idx * A6XX_TEX_CONST_DWORDS];
+ uint32_t *src =
+ &set->dynamic_descriptors[src_idx * A6XX_TEX_CONST_DWORDS];
+ uint32_t offset = pDynamicOffsets[dyn_idx];
+
+ /* Patch the storage/uniform descriptors right away. */
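+ /* (Sketch of the descriptor layouts assumed here: a dynamic UBO descriptor
+ * keeps its base address in dwords 0-1, as written by write_ubo_descriptor,
+ * while a dynamic storage buffer descriptor keeps it in dwords 4-5, as
+ * written by write_buffer_descriptor, which is why the two branches below
+ * patch different dwords.)
+ */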
+ if (layout->set[idx].layout->dynamic_ubo & (1 << j)) {
+ /* Note: we can assume here that the addition won't roll over and
+ * change the SIZE field.
+ */
+ uint64_t va = src[0] | ((uint64_t)src[1] << 32);
+ va += offset;
+ dst[0] = va;
+ dst[1] = va >> 32;
+ } else {
+ memcpy(dst, src, A6XX_TEX_CONST_DWORDS * 4);
+ /* Note: A6XX_IBO_5_DEPTH is always 0 */
+ uint64_t va = dst[4] | ((uint64_t)dst[5] << 32);
+ va += offset;
+ dst[4] = va;
+ dst[5] = va >> 32;
+ }
}
}
- cmd_buffer->state.dirty |= TU_CMD_DIRTY_DESCRIPTOR_SETS;
+ if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_COMPUTE)
+ cmd_buffer->state.dirty |= TU_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS;
+ else
+ cmd_buffer->state.dirty |= TU_CMD_DIRTY_DESCRIPTOR_SETS;
}
void tu_CmdBindTransformFeedbackBuffersEXT(VkCommandBuffer commandBuffer,
tu_bo_list_add(&cmd->bo_list, iview->image->bo,
MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
}
+
+ /* Flag input attachment descriptors for re-emission if necessary */
+ cmd->state.dirty |= TU_CMD_DIRTY_INPUT_ATTACHMENTS;
}
void
tu6_emit_mrt(cmd, cmd->state.subpass, cs);
tu6_emit_msaa(cs, cmd->state.subpass->samples);
tu6_emit_render_cntl(cmd, cmd->state.subpass, cs, false);
+
+ /* Flag input attachment descriptors for re-emission if necessary */
+ cmd->state.dirty |= TU_CMD_DIRTY_INPUT_ATTACHMENTS;
}
void
#define ENABLE_ALL (CP_SET_DRAW_STATE__0_BINNING | CP_SET_DRAW_STATE__0_GMEM | CP_SET_DRAW_STATE__0_SYSMEM)
#define ENABLE_DRAW (CP_SET_DRAW_STATE__0_GMEM | CP_SET_DRAW_STATE__0_SYSMEM)
+#define ENABLE_NON_GMEM (CP_SET_DRAW_STATE__0_BINNING | CP_SET_DRAW_STATE__0_SYSMEM)
enum tu_draw_state_group_id
{
TU_DRAW_STATE_BLEND,
TU_DRAW_STATE_VS_CONST,
TU_DRAW_STATE_FS_CONST,
- TU_DRAW_STATE_VS_TEX,
- TU_DRAW_STATE_FS_TEX_SYSMEM,
- TU_DRAW_STATE_FS_TEX_GMEM,
- TU_DRAW_STATE_FS_IBO,
+ TU_DRAW_STATE_DESC_SETS,
+ TU_DRAW_STATE_DESC_SETS_GMEM,
TU_DRAW_STATE_VS_PARAMS,
TU_DRAW_STATE_COUNT,
struct tu_cs_entry ib;
};
-const static void *
-sampler_ptr(struct tu_descriptor_state *descriptors_state,
- const struct tu_descriptor_map *map, unsigned i,
- unsigned array_index)
-{
- assert(descriptors_state->valid & (1 << map->set[i]));
-
- struct tu_descriptor_set *set = descriptors_state->sets[map->set[i]];
- assert(map->binding[i] < set->layout->binding_count);
-
- const struct tu_descriptor_set_binding_layout *layout =
- &set->layout->binding[map->binding[i]];
-
- if (layout->immutable_samplers_offset) {
- const uint32_t *immutable_samplers =
- tu_immutable_samplers(set->layout, layout);
-
- return &immutable_samplers[array_index * A6XX_TEX_SAMP_DWORDS];
- }
-
- switch (layout->type) {
- case VK_DESCRIPTOR_TYPE_SAMPLER:
- return &set->mapped_ptr[layout->offset / 4];
- case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
- return &set->mapped_ptr[layout->offset / 4 + A6XX_TEX_CONST_DWORDS +
- array_index * (A6XX_TEX_CONST_DWORDS + A6XX_TEX_SAMP_DWORDS)];
- default:
- unreachable("unimplemented descriptor type");
- break;
- }
-}
-
-static void
-write_tex_const(struct tu_cmd_buffer *cmd,
- uint32_t *dst,
- struct tu_descriptor_state *descriptors_state,
- const struct tu_descriptor_map *map,
- unsigned i, unsigned array_index, bool is_sysmem)
-{
- assert(descriptors_state->valid & (1 << map->set[i]));
-
- struct tu_descriptor_set *set = descriptors_state->sets[map->set[i]];
- assert(map->binding[i] < set->layout->binding_count);
-
- const struct tu_descriptor_set_binding_layout *layout =
- &set->layout->binding[map->binding[i]];
-
- switch (layout->type) {
- case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
- case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
- case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
- case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
- memcpy(dst, &set->mapped_ptr[layout->offset / 4 +
- array_index * A6XX_TEX_CONST_DWORDS],
- A6XX_TEX_CONST_DWORDS * 4);
- break;
- case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
- memcpy(dst, &set->mapped_ptr[layout->offset / 4 +
- array_index *
- (A6XX_TEX_CONST_DWORDS +
- A6XX_TEX_SAMP_DWORDS)],
- A6XX_TEX_CONST_DWORDS * 4);
- break;
- default:
- unreachable("unimplemented descriptor type");
- break;
- }
-
- if (layout->type == VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT && !is_sysmem) {
- const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
- uint32_t a = cmd->state.subpass->input_attachments[map->value[i] +
- array_index].attachment;
- const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
-
- assert(att->gmem_offset >= 0);
-
- dst[0] &= ~(A6XX_TEX_CONST_0_SWAP__MASK | A6XX_TEX_CONST_0_TILE_MODE__MASK);
- dst[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2);
- dst[2] &= ~(A6XX_TEX_CONST_2_TYPE__MASK | A6XX_TEX_CONST_2_PITCH__MASK);
- dst[2] |=
- A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) |
- A6XX_TEX_CONST_2_PITCH(tiling->tile0.extent.width * att->cpp);
- dst[3] = 0;
- dst[4] = cmd->device->physical_device->gmem_base + att->gmem_offset;
- dst[5] = A6XX_TEX_CONST_5_DEPTH(1);
- for (unsigned i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
- dst[i] = 0;
-
- if (cmd->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY)
- tu_finishme("patch input attachment pitch for secondary cmd buffer");
- }
-}
-
-static void
-write_image_ibo(struct tu_cmd_buffer *cmd,
- uint32_t *dst,
- struct tu_descriptor_state *descriptors_state,
- const struct tu_descriptor_map *map,
- unsigned i, unsigned array_index)
-{
- assert(descriptors_state->valid & (1 << map->set[i]));
-
- struct tu_descriptor_set *set = descriptors_state->sets[map->set[i]];
- assert(map->binding[i] < set->layout->binding_count);
-
- const struct tu_descriptor_set_binding_layout *layout =
- &set->layout->binding[map->binding[i]];
-
- assert(layout->type == VK_DESCRIPTOR_TYPE_STORAGE_IMAGE);
-
- memcpy(dst, &set->mapped_ptr[layout->offset / 4 +
- (array_index * 2 + 1) * A6XX_TEX_CONST_DWORDS],
- A6XX_TEX_CONST_DWORDS * 4);
-}
-
-static uint64_t
-buffer_ptr(struct tu_descriptor_state *descriptors_state,
- const struct tu_descriptor_map *map,
- unsigned i, unsigned array_index)
-{
- assert(descriptors_state->valid & (1 << map->set[i]));
-
- struct tu_descriptor_set *set = descriptors_state->sets[map->set[i]];
- assert(map->binding[i] < set->layout->binding_count);
-
- const struct tu_descriptor_set_binding_layout *layout =
- &set->layout->binding[map->binding[i]];
-
- switch (layout->type) {
- case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
- case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
- return descriptors_state->dynamic_buffers[layout->dynamic_offset_offset +
- array_index];
- case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
- case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
- return (uint64_t) set->mapped_ptr[layout->offset / 4 + array_index * 2 + 1] << 32 |
- set->mapped_ptr[layout->offset / 4 + array_index * 2];
- default:
- unreachable("unimplemented descriptor type");
- break;
- }
-}
-
static inline uint32_t
tu6_stage2opcode(gl_shader_stage type)
{
debug_assert((size % 16) == 0);
debug_assert((offset % 16) == 0);
- /* Look through the UBO map to find our UBO index, and get the VA for
- * that UBO.
+ /* Dig out the descriptor from the descriptor state and read the VA from
+ * it.
*/
- uint64_t va = 0;
- uint32_t ubo_idx = state->range[i].block - 1;
- uint32_t ubo_map_base = 0;
- for (int j = 0; j < link->ubo_map.num; j++) {
- if (ubo_idx >= ubo_map_base &&
- ubo_idx < ubo_map_base + link->ubo_map.array_size[j]) {
- va = buffer_ptr(descriptors_state, &link->ubo_map, j,
- ubo_idx - ubo_map_base);
- break;
- }
- ubo_map_base += link->ubo_map.array_size[j];
- }
+ assert(state->range[i].bindless);
+ uint32_t *base = state->range[i].bindless_base == MAX_SETS ?
+ descriptors_state->dynamic_descriptors :
+ descriptors_state->sets[state->range[i].bindless_base]->mapped_ptr;
+ unsigned block = state->range[i].block;
+ /* If the block in the shader here is in the dynamic descriptor set, it
+ * is an index into the dynamic descriptor set which is combined from
+ * dynamic descriptors and input attachments on-the-fly, and we don't
+ * have access to it here. Instead we work backwards to get the index
+ * into dynamic_descriptors.
+ */
+ if (state->range[i].bindless_base == MAX_SETS)
+ block -= pipeline->layout->input_attachment_count;
+ uint32_t *desc = base + block * A6XX_TEX_CONST_DWORDS;
+ uint64_t va = desc[0] | ((uint64_t)(desc[1] & A6XX_UBO_1_BASE_HI__MASK) << 32);
assert(va);
tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3);
}
}
-static void
-tu6_emit_ubos(struct tu_cs *cs, const struct tu_pipeline *pipeline,
- struct tu_descriptor_state *descriptors_state,
- gl_shader_stage type)
-{
- const struct tu_program_descriptor_linkage *link =
- &pipeline->program.link[type];
-
- uint32_t num = MIN2(link->ubo_map.num_desc, link->const_state.num_ubos);
- uint32_t anum = align(num, 2);
-
- if (!num)
- return;
-
- tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + (2 * anum));
- tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(link->const_state.offsets.ubo) |
- CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
- CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
- CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
- CP_LOAD_STATE6_0_NUM_UNIT(anum/2));
- tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
- tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
-
- unsigned emitted = 0;
- for (unsigned i = 0; emitted < num && i < link->ubo_map.num; i++) {
- for (unsigned j = 0; emitted < num && j < link->ubo_map.array_size[i]; j++) {
- tu_cs_emit_qw(cs, buffer_ptr(descriptors_state, &link->ubo_map, i, j));
- emitted++;
- }
- }
-
- for (; emitted < anum; emitted++) {
- tu_cs_emit(cs, 0xffffffff);
- tu_cs_emit(cs, 0xffffffff);
- }
-}
-
static struct tu_cs_entry
tu6_emit_consts(struct tu_cmd_buffer *cmd,
const struct tu_pipeline *pipeline,
tu_cs_begin_sub_stream(&cmd->sub_cs, 512, &cs); /* TODO: maximum size? */
tu6_emit_user_consts(&cs, pipeline, descriptors_state, type, cmd->push_constants);
- tu6_emit_ubos(&cs, pipeline, descriptors_state, type);
return tu_cs_end_sub_stream(&cmd->sub_cs, &cs);
}
}
static VkResult
-tu6_emit_textures(struct tu_cmd_buffer *cmd,
- const struct tu_pipeline *pipeline,
- struct tu_descriptor_state *descriptors_state,
- gl_shader_stage type,
- struct tu_cs_entry *entry,
- bool is_sysmem)
+tu6_emit_descriptor_sets(struct tu_cmd_buffer *cmd,
+ const struct tu_pipeline *pipeline,
+ VkPipelineBindPoint bind_point,
+ struct tu_cs_entry *entry,
+ bool gmem)
{
struct tu_cs *draw_state = &cmd->sub_cs;
- const struct tu_program_descriptor_linkage *link =
- &pipeline->program.link[type];
+ struct tu_pipeline_layout *layout = pipeline->layout;
+ struct tu_descriptor_state *descriptors_state =
+ tu_get_descriptors_state(cmd, bind_point);
+ const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
+ const uint32_t *input_attachment_idx =
+ pipeline->program.input_attachment_idx;
+ uint32_t num_dynamic_descs = layout->dynamic_offset_count +
+ layout->input_attachment_count;
+ struct ts_cs_memory dynamic_desc_set;
VkResult result;
- if (link->texture_map.num_desc == 0 && link->sampler_map.num_desc == 0) {
- *entry = (struct tu_cs_entry) {};
- return VK_SUCCESS;
- }
-
- /* allocate and fill texture state */
- struct ts_cs_memory tex_const;
- result = tu_cs_alloc(draw_state, link->texture_map.num_desc,
- A6XX_TEX_CONST_DWORDS, &tex_const);
- if (result != VK_SUCCESS)
- return result;
-
- int tex_index = 0;
- for (unsigned i = 0; i < link->texture_map.num; i++) {
- for (int j = 0; j < link->texture_map.array_size[i]; j++) {
- write_tex_const(cmd,
- &tex_const.map[A6XX_TEX_CONST_DWORDS * tex_index++],
- descriptors_state, &link->texture_map, i, j,
- is_sysmem);
- }
- }
-
- /* allocate and fill sampler state */
- struct ts_cs_memory tex_samp = { 0 };
- if (link->sampler_map.num_desc) {
- result = tu_cs_alloc(draw_state, link->sampler_map.num_desc,
- A6XX_TEX_SAMP_DWORDS, &tex_samp);
+ if (num_dynamic_descs > 0) {
+ /* allocate and fill out dynamic descriptor set */
+ result = tu_cs_alloc(draw_state, num_dynamic_descs,
+ A6XX_TEX_CONST_DWORDS, &dynamic_desc_set);
if (result != VK_SUCCESS)
return result;
- int sampler_index = 0;
- for (unsigned i = 0; i < link->sampler_map.num; i++) {
- for (int j = 0; j < link->sampler_map.array_size[i]; j++) {
- const uint32_t *sampler = sampler_ptr(descriptors_state,
- &link->sampler_map,
- i, j);
- memcpy(&tex_samp.map[A6XX_TEX_SAMP_DWORDS * sampler_index++],
- sampler, A6XX_TEX_SAMP_DWORDS * 4);
+ memcpy(dynamic_desc_set.map, descriptors_state->input_attachments,
+ layout->input_attachment_count * A6XX_TEX_CONST_DWORDS * 4);
+
+ if (gmem) {
+ /* Patch input attachments to refer to GMEM instead */
+ for (unsigned i = 0; i < layout->input_attachment_count; i++) {
+ uint32_t *dst =
+ &dynamic_desc_set.map[A6XX_TEX_CONST_DWORDS * i];
+
+ /* The compiler has already laid out input_attachment_idx in the
+ * final order of input attachments, so there's no need to go
+ * through the pipeline layout finding input attachments.
+ */
+ unsigned attachment_idx = input_attachment_idx[i];
+
+ /* It's possible for the pipeline layout to include an input
+ * attachment which doesn't actually exist for the current
+ * subpass. Of course, this is only valid so long as the pipeline
+ * doesn't try to actually load that attachment. Just skip
+ * patching in that scenario to avoid out-of-bounds accesses.
+ */
+ if (attachment_idx >= cmd->state.subpass->input_count)
+ continue;
+
+ uint32_t a = cmd->state.subpass->input_attachments[attachment_idx].attachment;
+ const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
+
+ assert(att->gmem_offset >= 0);
+
+ dst[0] &= ~(A6XX_TEX_CONST_0_SWAP__MASK | A6XX_TEX_CONST_0_TILE_MODE__MASK);
+ dst[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2);
+ dst[2] &= ~(A6XX_TEX_CONST_2_TYPE__MASK | A6XX_TEX_CONST_2_PITCH__MASK);
+ dst[2] |=
+ A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) |
+ A6XX_TEX_CONST_2_PITCH(tiling->tile0.extent.width * att->cpp);
+ dst[3] = 0;
+ dst[4] = cmd->device->physical_device->gmem_base + att->gmem_offset;
+ dst[5] = A6XX_TEX_CONST_5_DEPTH(1);
+ for (unsigned i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
+ dst[i] = 0;
+
+ if (cmd->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY)
+ tu_finishme("patch input attachment pitch for secondary cmd buffer");
}
}
- }
- unsigned tex_samp_reg, tex_const_reg, tex_count_reg;
- enum a6xx_state_block sb;
+ memcpy(dynamic_desc_set.map + layout->input_attachment_count * A6XX_TEX_CONST_DWORDS,
+ descriptors_state->dynamic_descriptors,
+ layout->dynamic_offset_count * A6XX_TEX_CONST_DWORDS * 4);
+ }
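+ /* Resulting layout of the driver-internal set (sketch):
+ * [ input attachment 0 .. N-1 ][ dynamic buffer 0 .. M-1 ]
+ * with every entry A6XX_TEX_CONST_DWORDS (16 dwords) wide.
+ */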
- switch (type) {
- case MESA_SHADER_VERTEX:
- sb = SB6_VS_TEX;
- tex_samp_reg = REG_A6XX_SP_VS_TEX_SAMP_LO;
- tex_const_reg = REG_A6XX_SP_VS_TEX_CONST_LO;
- tex_count_reg = REG_A6XX_SP_VS_TEX_COUNT;
- break;
- case MESA_SHADER_FRAGMENT:
- sb = SB6_FS_TEX;
- tex_samp_reg = REG_A6XX_SP_FS_TEX_SAMP_LO;
- tex_const_reg = REG_A6XX_SP_FS_TEX_CONST_LO;
- tex_count_reg = REG_A6XX_SP_FS_TEX_COUNT;
+ uint32_t sp_bindless_base_reg, hlsq_bindless_base_reg;
+ uint32_t hlsq_update_value;
+ switch (bind_point) {
+ case VK_PIPELINE_BIND_POINT_GRAPHICS:
+ sp_bindless_base_reg = REG_A6XX_SP_BINDLESS_BASE(0);
+ hlsq_bindless_base_reg = REG_A6XX_HLSQ_BINDLESS_BASE(0);
+ hlsq_update_value = 0x7c000;
break;
- case MESA_SHADER_COMPUTE:
- sb = SB6_CS_TEX;
- tex_samp_reg = REG_A6XX_SP_CS_TEX_SAMP_LO;
- tex_const_reg = REG_A6XX_SP_CS_TEX_CONST_LO;
- tex_count_reg = REG_A6XX_SP_CS_TEX_COUNT;
+ case VK_PIPELINE_BIND_POINT_COMPUTE:
+ sp_bindless_base_reg = REG_A6XX_SP_CS_BINDLESS_BASE(0);
+ hlsq_bindless_base_reg = REG_A6XX_HLSQ_CS_BINDLESS_BASE(0);
+ hlsq_update_value = 0x3e00;
break;
default:
- unreachable("bad state block");
- }
-
- struct tu_cs cs;
- result = tu_cs_begin_sub_stream(draw_state, 16, &cs);
- if (result != VK_SUCCESS)
- return result;
-
- if (link->sampler_map.num_desc) {
- /* output sampler state: */
- tu_cs_emit_pkt7(&cs, tu6_stage2opcode(type), 3);
- tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(0) |
- CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
- CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
- CP_LOAD_STATE6_0_STATE_BLOCK(sb) |
- CP_LOAD_STATE6_0_NUM_UNIT(link->sampler_map.num_desc));
- tu_cs_emit_qw(&cs, tex_samp.iova); /* SRC_ADDR_LO/HI */
-
- tu_cs_emit_pkt4(&cs, tex_samp_reg, 2);
- tu_cs_emit_qw(&cs, tex_samp.iova); /* SRC_ADDR_LO/HI */
- }
-
- /* emit texture state: */
- tu_cs_emit_pkt7(&cs, tu6_stage2opcode(type), 3);
- tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(0) |
- CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
- CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
- CP_LOAD_STATE6_0_STATE_BLOCK(sb) |
- CP_LOAD_STATE6_0_NUM_UNIT(link->texture_map.num_desc));
- tu_cs_emit_qw(&cs, tex_const.iova); /* SRC_ADDR_LO/HI */
-
- tu_cs_emit_pkt4(&cs, tex_const_reg, 2);
- tu_cs_emit_qw(&cs, tex_const.iova); /* SRC_ADDR_LO/HI */
-
- tu_cs_emit_pkt4(&cs, tex_count_reg, 1);
- tu_cs_emit(&cs, link->texture_map.num_desc);
-
- *entry = tu_cs_end_sub_stream(draw_state, &cs);
- return VK_SUCCESS;
-}
-
-static VkResult
-tu6_emit_ibo(struct tu_cmd_buffer *cmd,
- const struct tu_pipeline *pipeline,
- struct tu_descriptor_state *descriptors_state,
- gl_shader_stage type,
- struct tu_cs_entry *entry)
-{
- struct tu_cs *draw_state = &cmd->sub_cs;
- const struct tu_program_descriptor_linkage *link =
- &pipeline->program.link[type];
- VkResult result;
-
- unsigned num_desc = link->ssbo_map.num_desc + link->image_map.num_desc;
-
- if (num_desc == 0) {
- *entry = (struct tu_cs_entry) {};
- return VK_SUCCESS;
- }
-
- struct ts_cs_memory ibo_const;
- result = tu_cs_alloc(draw_state, num_desc,
- A6XX_TEX_CONST_DWORDS, &ibo_const);
- if (result != VK_SUCCESS)
- return result;
-
- int ssbo_index = 0;
- for (unsigned i = 0; i < link->ssbo_map.num; i++) {
- for (int j = 0; j < link->ssbo_map.array_size[i]; j++) {
- uint32_t *dst = &ibo_const.map[A6XX_TEX_CONST_DWORDS * ssbo_index];
-
- uint64_t va = buffer_ptr(descriptors_state, &link->ssbo_map, i, j);
- /* We don't expose robustBufferAccess, so leave the size unlimited. */
- uint32_t sz = MAX_STORAGE_BUFFER_RANGE / 4;
-
- dst[0] = A6XX_IBO_0_FMT(FMT6_32_UINT);
- dst[1] = A6XX_IBO_1_WIDTH(sz & MASK(15)) |
- A6XX_IBO_1_HEIGHT(sz >> 15);
- dst[2] = A6XX_IBO_2_UNK4 |
- A6XX_IBO_2_UNK31 |
- A6XX_IBO_2_TYPE(A6XX_TEX_1D);
- dst[3] = 0;
- dst[4] = va;
- dst[5] = va >> 32;
- for (int i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
- dst[i] = 0;
-
- ssbo_index++;
- }
+ unreachable("bad bind point");
}
- for (unsigned i = 0; i < link->image_map.num; i++) {
- for (int j = 0; j < link->image_map.array_size[i]; j++) {
- uint32_t *dst = &ibo_const.map[A6XX_TEX_CONST_DWORDS * ssbo_index];
-
- write_image_ibo(cmd, dst,
- descriptors_state, &link->image_map, i, j);
-
- ssbo_index++;
- }
+ /* Be careful here to *not* refer to the pipeline, so that if only the
+ * pipeline changes we don't have to emit this again (except if there are
+ * dynamic descriptors in the pipeline layout). This means always emitting
+ * all the valid descriptors, which means that we always have to put the
+ * dynamic descriptor set in the driver-only slot at the end.
+ */
+ uint32_t num_user_sets = util_last_bit(descriptors_state->valid);
+ uint32_t num_sets = num_user_sets;
+ if (num_dynamic_descs > 0) {
+ num_user_sets = MAX_SETS;
+ num_sets = num_user_sets + 1;
}
- assert(ssbo_index == num_desc);
+ unsigned regs[2] = { sp_bindless_base_reg, hlsq_bindless_base_reg };
struct tu_cs cs;
- result = tu_cs_begin_sub_stream(draw_state, 7, &cs);
+ result = tu_cs_begin_sub_stream(draw_state, ARRAY_SIZE(regs) * (1 + num_sets * 2) + 2, &cs);
if (result != VK_SUCCESS)
return result;
- uint32_t opcode, ibo_addr_reg;
- enum a6xx_state_block sb;
- enum a6xx_state_type st;
+ if (num_sets > 0) {
+ for (unsigned i = 0; i < ARRAY_SIZE(regs); i++) {
+ tu_cs_emit_pkt4(&cs, regs[i], num_sets * 2);
+ for (unsigned j = 0; j < num_user_sets; j++) {
+ if (descriptors_state->valid & (1 << j)) {
+ /* magic | 3 copied from the blob */
+ tu_cs_emit_qw(&cs, descriptors_state->sets[j]->va | 3);
+ } else {
+ tu_cs_emit_qw(&cs, 0 | 3);
+ }
+ }
+ if (num_dynamic_descs > 0) {
+ tu_cs_emit_qw(&cs, dynamic_desc_set.iova | 3);
+ }
+ }
- switch (type) {
- case MESA_SHADER_FRAGMENT:
- opcode = CP_LOAD_STATE6;
- st = ST6_SHADER;
- sb = SB6_IBO;
- ibo_addr_reg = REG_A6XX_SP_IBO_LO;
- break;
- case MESA_SHADER_COMPUTE:
- opcode = CP_LOAD_STATE6_FRAG;
- st = ST6_IBO;
- sb = SB6_CS_SHADER;
- ibo_addr_reg = REG_A6XX_SP_CS_IBO_LO;
- break;
- default:
- unreachable("unsupported stage for ibos");
+ tu_cs_emit_regs(&cs, A6XX_HLSQ_UPDATE_CNTL(hlsq_update_value));
}
- /* emit texture state: */
- tu_cs_emit_pkt7(&cs, opcode, 3);
- tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(0) |
- CP_LOAD_STATE6_0_STATE_TYPE(st) |
- CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
- CP_LOAD_STATE6_0_STATE_BLOCK(sb) |
- CP_LOAD_STATE6_0_NUM_UNIT(num_desc));
- tu_cs_emit_qw(&cs, ibo_const.iova); /* SRC_ADDR_LO/HI */
-
- tu_cs_emit_pkt4(&cs, ibo_addr_reg, 2);
- tu_cs_emit_qw(&cs, ibo_const.iova); /* SRC_ADDR_LO/HI */
-
*entry = tu_cs_end_sub_stream(draw_state, &cs);
return VK_SUCCESS;
}
if (cmd->state.dirty & TU_CMD_DIRTY_STREAMOUT_BUFFERS)
tu6_emit_streamout(cmd, cs);
- if (cmd->state.dirty &
- (TU_CMD_DIRTY_PIPELINE | TU_CMD_DIRTY_DESCRIPTOR_SETS)) {
- struct tu_cs_entry vs_tex, fs_tex_sysmem, fs_tex_gmem, fs_ibo;
-
- result = tu6_emit_textures(cmd, pipeline, descriptors_state,
- MESA_SHADER_VERTEX, &vs_tex, false);
- if (result != VK_SUCCESS)
- return result;
-
- /* TODO: we could emit just one texture descriptor draw state when there
- * are no input attachments, which is the most common case. We could
- * also split out the sampler state, which doesn't change even for input
- * attachments.
- */
- result = tu6_emit_textures(cmd, pipeline, descriptors_state,
- MESA_SHADER_FRAGMENT, &fs_tex_sysmem, true);
- if (result != VK_SUCCESS)
- return result;
-
- result = tu6_emit_textures(cmd, pipeline, descriptors_state,
- MESA_SHADER_FRAGMENT, &fs_tex_gmem, false);
- if (result != VK_SUCCESS)
- return result;
-
- result = tu6_emit_ibo(cmd, pipeline, descriptors_state,
- MESA_SHADER_FRAGMENT, &fs_ibo);
+ /* If there are any dynamic descriptors, then we may need to re-emit
+ * them after every pipeline change in case the number of input attachments
+ * changes. We also always need to re-emit after a pipeline change if there
+ * are any input attachments, because the input attachment index comes from
+ * the pipeline. Finally, it can also happen that the subpass changes
+ * without the pipeline changing, in which case the GMEM descriptors need
+ * to be patched differently.
+ *
+ * TODO: We could probably be clever and avoid re-emitting state on
+ * pipeline changes if the number of input attachments is always 0. We
+ * could also only re-emit dynamic state.
+ */
+ if (cmd->state.dirty & TU_CMD_DIRTY_DESCRIPTOR_SETS ||
+ ((pipeline->layout->dynamic_offset_count +
+ pipeline->layout->input_attachment_count > 0) &&
+ cmd->state.dirty & TU_CMD_DIRTY_PIPELINE) ||
+ (pipeline->layout->input_attachment_count > 0 &&
+ cmd->state.dirty & TU_CMD_DIRTY_INPUT_ATTACHMENTS)) {
+ struct tu_cs_entry desc_sets, desc_sets_gmem;
+ bool need_gmem_desc_set = pipeline->layout->input_attachment_count > 0;
+
+ result = tu6_emit_descriptor_sets(cmd, pipeline,
+ VK_PIPELINE_BIND_POINT_GRAPHICS,
+ &desc_sets, false);
if (result != VK_SUCCESS)
return result;
draw_state_groups[draw_state_group_count++] =
(struct tu_draw_state_group) {
- .id = TU_DRAW_STATE_VS_TEX,
- .enable_mask = ENABLE_ALL,
- .ib = vs_tex,
- };
- draw_state_groups[draw_state_group_count++] =
- (struct tu_draw_state_group) {
- .id = TU_DRAW_STATE_FS_TEX_GMEM,
- .enable_mask = CP_SET_DRAW_STATE__0_GMEM,
- .ib = fs_tex_gmem,
- };
- draw_state_groups[draw_state_group_count++] =
- (struct tu_draw_state_group) {
- .id = TU_DRAW_STATE_FS_TEX_SYSMEM,
- .enable_mask = CP_SET_DRAW_STATE__0_SYSMEM,
- .ib = fs_tex_sysmem,
- };
- draw_state_groups[draw_state_group_count++] =
- (struct tu_draw_state_group) {
- .id = TU_DRAW_STATE_FS_IBO,
- .enable_mask = ENABLE_DRAW,
- .ib = fs_ibo,
+ .id = TU_DRAW_STATE_DESC_SETS,
+ .enable_mask = need_gmem_desc_set ? ENABLE_NON_GMEM : ENABLE_ALL,
+ .ib = desc_sets,
};
+
+ if (need_gmem_desc_set) {
+ result = tu6_emit_descriptor_sets(cmd, pipeline,
+ VK_PIPELINE_BIND_POINT_GRAPHICS,
+ &desc_sets_gmem, true);
+ if (result != VK_SUCCESS)
+ return result;
+
+ draw_state_groups[draw_state_group_count++] =
+ (struct tu_draw_state_group) {
+ .id = TU_DRAW_STATE_DESC_SETS_GMEM,
+ .enable_mask = CP_SET_DRAW_STATE__0_GMEM,
+ .ib = desc_sets_gmem,
+ };
+ }
}
struct tu_cs_entry vs_params;
unsigned i;
for_each_bit(i, descriptors_state->valid) {
struct tu_descriptor_set *set = descriptors_state->sets[i];
- for (unsigned j = 0; j < set->layout->buffer_count; ++j)
- if (set->descriptors[j]) {
- tu_bo_list_add(&cmd->bo_list, set->descriptors[j],
+ for (unsigned j = 0; j < set->layout->buffer_count; ++j) {
+ if (set->buffers[j]) {
+ tu_bo_list_add(&cmd->bo_list, set->buffers[j],
MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
}
+ }
+ if (set->size > 0) {
+ tu_bo_list_add(&cmd->bo_list, &set->pool->bo,
+ MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
+ }
}
}
if (cmd->state.dirty & TU_CMD_DIRTY_STREAMOUT_BUFFERS) {
}
}
+ /* There are too many graphics dirty bits to list here, so just list the
+ * bits to preserve instead. The only things not emitted here are
+ * compute-related state.
+ */
+ cmd->state.dirty &= TU_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS;
+
/* Fragment shader state overwrites compute shader state, so flag the
* compute pipeline for re-emit.
*/
- cmd->state.dirty = TU_CMD_DIRTY_COMPUTE_PIPELINE;
+ cmd->state.dirty |= TU_CMD_DIRTY_COMPUTE_PIPELINE;
return VK_SUCCESS;
}
tu_emit_compute_driver_params(cs, pipeline, info);
- result = tu6_emit_textures(cmd, pipeline, descriptors_state,
- MESA_SHADER_COMPUTE, &ib, false);
- if (result != VK_SUCCESS) {
- cmd->record_result = result;
- return;
- }
-
- if (ib.size)
- tu_cs_emit_ib(cs, &ib);
-
- result = tu6_emit_ibo(cmd, pipeline, descriptors_state, MESA_SHADER_COMPUTE, &ib);
- if (result != VK_SUCCESS) {
- cmd->record_result = result;
- return;
- }
-
- if (ib.size)
- tu_cs_emit_ib(cs, &ib);
+ if (cmd->state.dirty & TU_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS) {
+ result = tu6_emit_descriptor_sets(cmd, pipeline,
+ VK_PIPELINE_BIND_POINT_COMPUTE, &ib,
+ false);
+ if (result != VK_SUCCESS) {
+ cmd->record_result = result;
+ return;
+ }
- /* track BOs */
- if (cmd->state.dirty & TU_CMD_DIRTY_DESCRIPTOR_SETS) {
+ /* track BOs */
unsigned i;
for_each_bit(i, descriptors_state->valid) {
struct tu_descriptor_set *set = descriptors_state->sets[i];
- for (unsigned j = 0; j < set->layout->buffer_count; ++j)
- if (set->descriptors[j]) {
- tu_bo_list_add(&cmd->bo_list, set->descriptors[j],
+ for (unsigned j = 0; j < set->layout->buffer_count; ++j) {
+ if (set->buffers[j]) {
+ tu_bo_list_add(&cmd->bo_list, set->buffers[j],
MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
}
+ }
+
+ if (set->size > 0) {
+ tu_bo_list_add(&cmd->bo_list, &set->pool->bo,
+ MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
+ }
}
}
+ if (ib.size)
+ tu_cs_emit_ib(cs, &ib);
+
+ cmd->state.dirty &=
+ ~(TU_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS | TU_CMD_DIRTY_COMPUTE_PIPELINE);
+
/* Compute shader state overwrites fragment shader state, so we flag the
* graphics pipeline for re-emit.
*/
- cmd->state.dirty = TU_CMD_DIRTY_PIPELINE;
+ cmd->state.dirty |= TU_CMD_DIRTY_PIPELINE;
tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(RM6_COMPUTE));
/**
* @file
*
- * The texture and sampler descriptors are laid out in a single global space
- * across all shader stages, for both simplicity of implementation and because
- * that seems to be how things have to be structured for border color
- * handling.
- *
- * Each shader stage will declare its texture/sampler count based on the last
- * descriptor set it uses. At draw emit time (though it really should be
- * CmdBind time), we upload the descriptor sets used by each shader stage to
- * their stage.
+ * We use the bindless descriptor model, which maps fairly closely to how
+ * Vulkan descriptor sets work. The two exceptions are input attachments and
+ * dynamic descriptors, which have to be patched when recording command
+ * buffers. We reserve an extra descriptor set for these. This descriptor set
+ * contains all the input attachments in the pipeline, in order, and then all
+ * the dynamic descriptors. The dynamic descriptors are stored in the CPU-side
+ * data structure for each tu_descriptor_set, and then combined into one big
+ * descriptor set at CmdBindDescriptorSets time/draw time.
*/
#include "tu_private.h"
}
static uint32_t
-descriptor_size(enum VkDescriptorType type)
+descriptor_size(VkDescriptorType type)
{
switch (type) {
case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
- return 0;
- case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
- case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
- /* 64bit pointer */
- return 8;
- case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
- case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
- case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
- return A6XX_TEX_CONST_DWORDS * 4;
- case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
- /* We may need the IBO or the TEX representation, or both. */
- return A6XX_TEX_CONST_DWORDS * 4 * 2;
+ /* These are remapped to the special driver-managed descriptor set,
+ * hence they don't take up any space in the original descriptor set:
+ */
+ return 0;
case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
- /* texture const + texture sampler */
- return (A6XX_TEX_CONST_DWORDS + A6XX_TEX_SAMP_DWORDS) * 4;
- case VK_DESCRIPTOR_TYPE_SAMPLER:
- return A6XX_TEX_SAMP_DWORDS * 4;
+ /* We make offsets and sizes all 16 dwords, to match how the hardware
+ * interprets indices passed to sample/load/store instructions in
+ * multiples of 16 dwords. This means that "normal" descriptors are all
+ * of size 16, with padding for smaller descriptors like uniform buffer
+ * descriptors which are less than 16 dwords. However combined image/sampler
+ * descriptors are actually two descriptors (texture const followed by
+ * sampler), so they take up two 16-dword slots.
+ */
+ return A6XX_TEX_CONST_DWORDS * 4 * 2;
default:
- unreachable("unknown descriptor type\n");
- return 0;
+ return A6XX_TEX_CONST_DWORDS * 4;
}
}
set_layout->flags = pCreateInfo->flags;
- /* We just allocate all the samplers at the end of the struct */
+ /* We just allocate all the immutable samplers at the end of the struct */
struct tu_sampler *samplers = (void*) &set_layout->binding[max_binding + 1];
VkDescriptorSetLayoutBinding *bindings = create_sorted_bindings(
set_layout->binding_count = max_binding + 1;
set_layout->shader_stages = 0;
- set_layout->dynamic_shader_stages = 0;
set_layout->has_immutable_samplers = false;
set_layout->size = 0;
+ set_layout->dynamic_ubo = 0;
memset(set_layout->binding, 0,
size - sizeof(struct tu_descriptor_set_layout));
- uint32_t buffer_count = 0;
uint32_t dynamic_offset_count = 0;
+ uint32_t input_attachment_count = 0;
+ uint32_t buffer_count = 0;
for (uint32_t j = 0; j < pCreateInfo->bindingCount; j++) {
const VkDescriptorSetLayoutBinding *binding = bindings + j;
uint32_t b = binding->binding;
- uint32_t alignment = 4;
- unsigned binding_buffer_count = 1;
-
- switch (binding->descriptorType) {
- case VK_DESCRIPTOR_TYPE_SAMPLER:
- binding_buffer_count = 0;
- break;
- case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
- case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
- assert(!(pCreateInfo->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR));
- set_layout->binding[b].dynamic_offset_count = 1;
- break;
- default:
- break;
- }
- set_layout->size = align(set_layout->size, alignment);
set_layout->binding[b].type = binding->descriptorType;
set_layout->binding[b].array_size = binding->descriptorCount;
set_layout->binding[b].offset = set_layout->size;
set_layout->binding[b].buffer_offset = buffer_count;
set_layout->binding[b].dynamic_offset_offset = dynamic_offset_count;
+ set_layout->binding[b].input_attachment_offset = input_attachment_count;
set_layout->binding[b].size = descriptor_size(binding->descriptorType);
if (variable_flags && binding->binding < variable_flags->bindingCount &&
set_layout->size +=
binding->descriptorCount * set_layout->binding[b].size;
- buffer_count += binding->descriptorCount * binding_buffer_count;
- dynamic_offset_count += binding->descriptorCount *
- set_layout->binding[b].dynamic_offset_count;
+ if (binding->descriptorType != VK_DESCRIPTOR_TYPE_SAMPLER &&
+ binding->descriptorType != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT)
+ buffer_count += binding->descriptorCount;
+ if (binding->descriptorType == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC ||
+ binding->descriptorType == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC) {
+ if (binding->descriptorType == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC) {
+ STATIC_ASSERT(MAX_DYNAMIC_BUFFERS <= 8 * sizeof(set_layout->dynamic_ubo));
+ set_layout->dynamic_ubo |=
+ ((1u << binding->descriptorCount) - 1) << dynamic_offset_count;
+ }
+
+ dynamic_offset_count += binding->descriptorCount;
+ }
+ if (binding->descriptorType == VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT)
+ input_attachment_count += binding->descriptorCount;
set_layout->shader_stages |= binding->stageFlags;
}
free(bindings);
- set_layout->buffer_count = buffer_count;
set_layout->dynamic_offset_count = dynamic_offset_count;
+ set_layout->input_attachment_count = input_attachment_count;
+ set_layout->buffer_count = buffer_count;
*pSetLayout = tu_descriptor_set_layout_to_handle(set_layout);
return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
layout->num_sets = pCreateInfo->setLayoutCount;
+ layout->input_attachment_count = 0;
+ layout->dynamic_offset_count = 0;
- unsigned dynamic_offset_count = 0;
+ unsigned dynamic_offset_count = 0, input_attachment_count = 0;
_mesa_sha1_init(&ctx);
for (uint32_t set = 0; set < pCreateInfo->setLayoutCount; set++) {
TU_FROM_HANDLE(tu_descriptor_set_layout, set_layout,
pCreateInfo->pSetLayouts[set]);
layout->set[set].layout = set_layout;
-
layout->set[set].dynamic_offset_start = dynamic_offset_count;
+ layout->set[set].input_attachment_start = input_attachment_count;
+ dynamic_offset_count += set_layout->dynamic_offset_count;
+ input_attachment_count += set_layout->input_attachment_count;
+
for (uint32_t b = 0; b < set_layout->binding_count; b++) {
- dynamic_offset_count += set_layout->binding[b].array_size *
- set_layout->binding[b].dynamic_offset_count;
if (set_layout->binding[b].immutable_samplers_offset)
_mesa_sha1_update(
&ctx,
}
layout->dynamic_offset_count = dynamic_offset_count;
+ layout->input_attachment_count = input_attachment_count;
layout->push_constant_size = 0;
for (unsigned i = 0; i < pCreateInfo->pushConstantRangeCount; ++i) {
buffer_count = layout->binding[layout->binding_count - 1].buffer_offset +
*variable_count * stride;
}
- unsigned range_offset = sizeof(struct tu_descriptor_set) +
+ unsigned dynamic_offset = sizeof(struct tu_descriptor_set) +
sizeof(struct tu_bo *) * buffer_count;
- unsigned mem_size = range_offset +
- sizeof(struct tu_descriptor_range) * layout->dynamic_offset_count;
+ unsigned mem_size = dynamic_offset +
+ A6XX_TEX_CONST_DWORDS * 4 * (layout->dynamic_offset_count +
+ layout->input_attachment_count);
if (pool->host_memory_base) {
if (pool->host_memory_end - pool->host_memory_ptr < mem_size)
memset(set, 0, mem_size);
- if (layout->dynamic_offset_count) {
- set->dynamic_descriptors = (struct tu_descriptor_range*)((uint8_t*)set + range_offset);
+ if (layout->dynamic_offset_count + layout->input_attachment_count > 0) {
+ set->dynamic_descriptors = (uint32_t *)((uint8_t*)set + dynamic_offset);
}
set->layout = layout;
+ set->pool = pool;
uint32_t layout_size = layout->size;
if (variable_count) {
assert(layout->has_variable_descriptors);
uint32_t stride = layout->binding[layout->binding_count - 1].size;
- if (layout->binding[layout->binding_count - 1].type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT)
- stride = 1;
-
layout_size = layout->binding[layout->binding_count - 1].offset +
*variable_count * stride;
}
TU_FROM_HANDLE(tu_device, device, _device);
struct tu_descriptor_pool *pool;
uint64_t size = sizeof(struct tu_descriptor_pool);
- uint64_t bo_size = 0, bo_count = 0, range_count = 0;
+ uint64_t bo_size = 0, bo_count = 0, dynamic_count = 0;
for (unsigned i = 0; i < pCreateInfo->poolSizeCount; ++i) {
if (pCreateInfo->pPoolSizes[i].type != VK_DESCRIPTOR_TYPE_SAMPLER)
switch(pCreateInfo->pPoolSizes[i].type) {
case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
- range_count += pCreateInfo->pPoolSizes[i].descriptorCount;
+ case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
+ dynamic_count += pCreateInfo->pPoolSizes[i].descriptorCount;
default:
break;
}
if (!(pCreateInfo->flags & VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT)) {
uint64_t host_size = pCreateInfo->maxSets * sizeof(struct tu_descriptor_set);
host_size += sizeof(struct tu_bo*) * bo_count;
- host_size += sizeof(struct tu_descriptor_range) * range_count;
+ host_size += A6XX_TEX_CONST_DWORDS * 4 * dynamic_count;
size += host_size;
} else {
size += sizeof(struct tu_descriptor_pool_entry) * pCreateInfo->maxSets;
*buffer_list = view->buffer->bo;
}
+static uint32_t get_range(struct tu_buffer *buf, VkDeviceSize offset,
+ VkDeviceSize range)
+{
+ if (range == VK_WHOLE_SIZE) {
+ return buf->size - offset;
+ } else {
+ return range;
+ }
+}
+
static void write_buffer_descriptor(struct tu_device *device,
struct tu_cmd_buffer *cmd_buffer,
unsigned *dst,
TU_FROM_HANDLE(tu_buffer, buffer, buffer_info->buffer);
uint64_t va = tu_buffer_iova(buffer) + buffer_info->offset;
- dst[0] = va;
- dst[1] = va >> 32;
+ uint32_t range = get_range(buffer, buffer_info->offset, buffer_info->range);
+ range = ALIGN_POT(range, 4) / 4;
+ dst[0] =
+ A6XX_IBO_0_TILE_MODE(TILE6_LINEAR) | A6XX_IBO_0_FMT(FMT6_32_UINT);
+ dst[1] = range;
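+ /* Writing the element count directly fills both the WIDTH (low 15 bits)
+ * and HEIGHT fields, equivalent to the explicit A6XX_IBO_1_WIDTH/HEIGHT
+ * split the old tu6_emit_ibo code did by hand.
+ */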
+ dst[2] =
+ A6XX_IBO_2_UNK4 | A6XX_IBO_2_TYPE(A6XX_TEX_1D) | A6XX_IBO_2_UNK31;
+ dst[3] = 0;
+ dst[4] = A6XX_IBO_4_BASE_LO(va);
+ dst[5] = A6XX_IBO_5_BASE_HI(va >> 32);
+ for (int i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
+ dst[i] = 0;
if (cmd_buffer)
tu_bo_list_add(&cmd_buffer->bo_list, buffer->bo, MSM_SUBMIT_BO_READ);
*buffer_list = buffer->bo;
}
-static void write_dynamic_buffer_descriptor(struct tu_device *device,
- struct tu_descriptor_range *range,
- struct tu_bo **buffer_list,
- const VkDescriptorBufferInfo *buffer_info)
+static void write_ubo_descriptor(struct tu_device *device,
+ struct tu_cmd_buffer *cmd_buffer,
+ unsigned *dst,
+ struct tu_bo **buffer_list,
+ const VkDescriptorBufferInfo *buffer_info)
{
TU_FROM_HANDLE(tu_buffer, buffer, buffer_info->buffer);
- uint64_t va = tu_buffer_iova(buffer) + buffer_info->offset;
- unsigned size = buffer_info->range;
- if (buffer_info->range == VK_WHOLE_SIZE)
- size = buffer->size - buffer_info->offset;
-
- range->va = va;
- range->size = size;
+ uint32_t range = get_range(buffer, buffer_info->offset, buffer_info->range);
+ /* The HW range is in vec4 units */
+ range = ALIGN_POT(range, 16) / 16;
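+ /* e.g. (illustrative) a 100-byte range rounds up to 112 bytes, giving
+ * SIZE = 7 vec4 units. */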
+ uint64_t va = tu_buffer_iova(buffer) + buffer_info->offset;
+ dst[0] = A6XX_UBO_0_BASE_LO(va);
+ dst[1] = A6XX_UBO_1_BASE_HI(va >> 32) | A6XX_UBO_1_SIZE(range);
- *buffer_list = buffer->bo;
+ if (cmd_buffer)
+ tu_bo_list_add(&cmd_buffer->bo_list, buffer->bo, MSM_SUBMIT_BO_READ);
+ else
+ *buffer_list = buffer->bo;
}
static void
{
TU_FROM_HANDLE(tu_image_view, iview, image_info->imageView);
- memcpy(dst, iview->descriptor, sizeof(iview->descriptor));
if (descriptor_type == VK_DESCRIPTOR_TYPE_STORAGE_IMAGE) {
- memcpy(&dst[A6XX_TEX_CONST_DWORDS], iview->storage_descriptor,
- sizeof(iview->storage_descriptor));
+ memcpy(dst, iview->storage_descriptor, sizeof(iview->storage_descriptor));
+ } else {
+ memcpy(dst, iview->descriptor, sizeof(iview->descriptor));
}
if (cmd_buffer)
descriptor_type, image_info);
/* copy over sampler state */
if (has_sampler) {
- memcpy(dst + sampler_offset / sizeof(*dst), sampler, sizeof(*sampler));
+ memcpy(dst + A6XX_TEX_CONST_DWORDS, sampler, sizeof(*sampler));
}
}
const struct tu_descriptor_set_binding_layout *binding_layout =
set->layout->binding + writeset->dstBinding;
uint32_t *ptr = set->mapped_ptr;
- struct tu_bo **buffer_list = set->descriptors;
+ struct tu_bo **buffer_list = set->buffers;
ptr += binding_layout->offset / 4;
- ptr += binding_layout->size * writeset->dstArrayElement / 4;
+ ptr += (binding_layout->size / 4) * writeset->dstArrayElement;
buffer_list += binding_layout->buffer_offset;
buffer_list += writeset->dstArrayElement;
for (j = 0; j < writeset->descriptorCount; ++j) {
switch(writeset->descriptorType) {
- case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
- case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: {
- unsigned idx = writeset->dstArrayElement + j;
- idx += binding_layout->dynamic_offset_offset;
+ case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: {
assert(!(set->layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR));
- write_dynamic_buffer_descriptor(device, set->dynamic_descriptors + idx,
- buffer_list, writeset->pBufferInfo + j);
+ unsigned idx = writeset->dstArrayElement + j;
+ idx += set->layout->input_attachment_count + binding_layout->dynamic_offset_offset;
+ write_ubo_descriptor(device, cmd_buffer,
+ set->dynamic_descriptors + A6XX_TEX_CONST_DWORDS * idx,
+ buffer_list, writeset->pBufferInfo + j);
break;
}
-
case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
+ write_ubo_descriptor(device, cmd_buffer, ptr, buffer_list,
+ writeset->pBufferInfo + j);
+ break;
+ case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: {
+ assert(!(set->layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR));
+ unsigned idx = writeset->dstArrayElement + j;
+ idx += set->layout->input_attachment_count + binding_layout->dynamic_offset_offset;
+ write_buffer_descriptor(device, cmd_buffer,
+ set->dynamic_descriptors + A6XX_TEX_CONST_DWORDS * idx,
+ buffer_list, writeset->pBufferInfo + j);
+ break;
+ }
case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
write_buffer_descriptor(device, cmd_buffer, ptr, buffer_list,
writeset->pBufferInfo + j);
break;
case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
- case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
write_image_descriptor(device, cmd_buffer, ptr, buffer_list,
writeset->descriptorType,
writeset->pImageInfo + j);
break;
+ case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: {
+ unsigned idx = writeset->dstArrayElement + j;
+ idx += binding_layout->input_attachment_offset;
+ write_image_descriptor(device, cmd_buffer,
+ set->dynamic_descriptors + A6XX_TEX_CONST_DWORDS * idx,
+ buffer_list, writeset->descriptorType,
+ writeset->pImageInfo + j);
+ break;
+ }
case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
write_combined_image_sampler_descriptor(device, cmd_buffer,
A6XX_TEX_CONST_DWORDS * 4,
dst_set->layout->binding + copyset->dstBinding;
uint32_t *src_ptr = src_set->mapped_ptr;
uint32_t *dst_ptr = dst_set->mapped_ptr;
- struct tu_bo **src_buffer_list = src_set->descriptors;
- struct tu_bo **dst_buffer_list = dst_set->descriptors;
+ struct tu_bo **src_buffer_list = src_set->buffers;
+ struct tu_bo **dst_buffer_list = dst_set->buffers;
src_ptr += src_binding_layout->offset / 4;
dst_ptr += dst_binding_layout->offset / 4;
case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: {
unsigned src_idx = copyset->srcArrayElement + j;
unsigned dst_idx = copyset->dstArrayElement + j;
- struct tu_descriptor_range *src_range, *dst_range;
+ src_idx += src_set->layout->input_attachment_count;
+ dst_idx += dst_set->layout->input_attachment_count;
src_idx += src_binding_layout->dynamic_offset_offset;
dst_idx += dst_binding_layout->dynamic_offset_offset;
- src_range = src_set->dynamic_descriptors + src_idx;
- dst_range = dst_set->dynamic_descriptors + dst_idx;
- *dst_range = *src_range;
+ uint32_t *src_dynamic, *dst_dynamic;
+ src_dynamic = src_set->dynamic_descriptors + src_idx * A6XX_TEX_CONST_DWORDS;
+ dst_dynamic = dst_set->dynamic_descriptors + dst_idx * A6XX_TEX_CONST_DWORDS;
+ memcpy(dst_dynamic, src_dynamic, A6XX_TEX_CONST_DWORDS * 4);
+ break;
+ }
+ case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: {
+ unsigned src_idx = copyset->srcArrayElement + j;
+ unsigned dst_idx = copyset->dstArrayElement + j;
+ src_idx += src_binding_layout->input_attachment_offset;
+ dst_idx += dst_binding_layout->input_attachment_offset;
+
+ uint32_t *src_dynamic, *dst_dynamic;
+ src_dynamic = src_set->dynamic_descriptors + src_idx * A6XX_TEX_CONST_DWORDS;
+ dst_dynamic = dst_set->dynamic_descriptors + dst_idx * A6XX_TEX_CONST_DWORDS;
+ memcpy(dst_dynamic, src_dynamic, A6XX_TEX_CONST_DWORDS * 4);
break;
}
default:
memcpy(dst_ptr, src_ptr, src_binding_layout->size);
}
+
src_ptr += src_binding_layout->size / 4;
dst_ptr += dst_binding_layout->size / 4;
#include <vulkan/vulkan.h>
-#define MAX_SETS 32
+/* The hardware supports 5 descriptor sets, but we reserve 1 for dynamic
+ * descriptors and input attachments.
+ */
+#define MAX_SETS 4
struct tu_descriptor_set_binding_layout
{
/* Number of array elements in this binding */
uint32_t array_size;
+ /* The size in bytes of each Vulkan descriptor. */
+ uint32_t size;
+
uint32_t offset;
+
+ /* For descriptors that point to a buffer, index into the array of BO's to
+ * be added to the cmdbuffer's used BO list.
+ */
uint32_t buffer_offset;
- uint16_t dynamic_offset_offset;
- uint16_t dynamic_offset_count;
- /* redundant with the type, each for a single array element */
- uint32_t size;
+ /* Index into the pDynamicOffsets array for dynamic descriptors, as well as
+ * the array of dynamic descriptors (offset by
+ * tu_pipeline_layout::set::dynamic_offset_start).
+ */
+ uint32_t dynamic_offset_offset;
+
+ /* Index into the array of dynamic input attachment descriptors */
+ uint32_t input_attachment_offset;
/* Offset in the tu_descriptor_set_layout of the immutable samplers, or 0
* if there are no immutable samplers. */
/* Shader stages affected by this descriptor set */
uint16_t shader_stages;
- uint16_t dynamic_shader_stages;
-
- /* Number of buffers in this descriptor set */
- uint32_t buffer_count;
/* Number of dynamic offsets used by this descriptor set */
uint16_t dynamic_offset_count;
+ /* Number of input attachments used by the descriptor set */
+ uint16_t input_attachment_count;
+
+ /* A bitfield of which dynamic buffers are UBOs, to make the
+ * descriptor-binding-time patching easier.
+ */
+ uint32_t dynamic_ubo;
+
+ uint32_t buffer_count;
+
bool has_immutable_samplers;
bool has_variable_descriptors;
struct tu_descriptor_set_layout *layout;
uint32_t size;
uint32_t dynamic_offset_start;
+ uint32_t input_attachment_start;
} set[MAX_SETS];
uint32_t num_sets;
uint32_t push_constant_size;
uint32_t dynamic_offset_count;
+ uint32_t input_attachment_count;
unsigned char sha1[20];
};
VkSampleCountFlags sample_counts =
VK_SAMPLE_COUNT_1_BIT | VK_SAMPLE_COUNT_2_BIT | VK_SAMPLE_COUNT_4_BIT;
- /* make sure that the entire descriptor set is addressable with a signed
- * 32-bit int. So the sum of all limits scaled by descriptor size has to
- * be at most 2 GiB. the combined image & samples object count as one of
- * both. This limit is for the pipeline layout, not for the set layout, but
- * there is no set limit, so we just set a pipeline limit. I don't think
- * any app is going to hit this soon. */
- size_t max_descriptor_set_size =
- ((1ull << 31) - 16 * MAX_DYNAMIC_BUFFERS) /
- (32 /* uniform buffer, 32 due to potential space wasted on alignment */ +
- 32 /* storage buffer, 32 due to potential space wasted on alignment */ +
- 32 /* sampler, largest when combined with image */ +
- 64 /* sampled image */ + 64 /* storage image */);
+ /* I have no idea what the maximum size is, but the hardware supports very
+ * large numbers of descriptors (at least 2^16). This limit is based on
+ * CP_LOAD_STATE6, which has a 28-bit field for the DWORD offset, so that
+ * we don't have to think about what to do if that overflows, but really
+ * nothing is likely to get close to this.
+ */
+ const size_t max_descriptor_set_size = (1 << 28) / A6XX_TEX_CONST_DWORDS;
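+ /* i.e. 2^28 dwords / A6XX_TEX_CONST_DWORDS (16) = 2^24 descriptors. */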
VkPhysicalDeviceLimits limits = {
.maxImageDimension1D = (1 << 14),
.maxImageDimensionCube = (1 << 14),
.maxImageArrayLayers = (1 << 11),
.maxTexelBufferElements = 128 * 1024 * 1024,
- .maxUniformBufferRange = UINT32_MAX,
+ .maxUniformBufferRange = MAX_UNIFORM_BUFFER_RANGE,
.maxStorageBufferRange = MAX_STORAGE_BUFFER_RANGE,
.maxPushConstantsSize = MAX_PUSH_CONSTANTS_SIZE,
.maxMemoryAllocationCount = UINT32_MAX,
.maxPerStageDescriptorStorageBuffers = max_descriptor_set_size,
.maxPerStageDescriptorSampledImages = max_descriptor_set_size,
.maxPerStageDescriptorStorageImages = max_descriptor_set_size,
- .maxPerStageDescriptorInputAttachments = max_descriptor_set_size,
+ .maxPerStageDescriptorInputAttachments = MAX_RTS,
.maxPerStageResources = max_descriptor_set_size,
.maxDescriptorSetSamplers = max_descriptor_set_size,
.maxDescriptorSetUniformBuffers = max_descriptor_set_size,
.maxDescriptorSetStorageBuffersDynamic = MAX_DYNAMIC_STORAGE_BUFFERS,
.maxDescriptorSetSampledImages = max_descriptor_set_size,
.maxDescriptorSetStorageImages = max_descriptor_set_size,
- .maxDescriptorSetInputAttachments = max_descriptor_set_size,
+ .maxDescriptorSetInputAttachments = MAX_RTS,
.maxVertexInputAttributes = 32,
.maxVertexInputBindings = 32,
.maxVertexInputAttributeOffset = 4095,
.viewportSubPixelBits = 8,
.minMemoryMapAlignment = 4096, /* A page */
.minTexelBufferOffsetAlignment = 64,
- .minUniformBufferOffsetAlignment = 4,
- .minStorageBufferOffsetAlignment = 4,
+ .minUniformBufferOffsetAlignment = 64,
+ .minStorageBufferOffsetAlignment = 64,
.minTexelOffset = -32,
.maxTexelOffset = 31,
.minTexelGatherOffset = -32,
TU_FROM_HANDLE(tu_buffer, buffer, _buffer);
pMemoryRequirements->memoryTypeBits = 1;
- pMemoryRequirements->alignment = 16;
+ pMemoryRequirements->alignment = 64;
pMemoryRequirements->size =
align64(buffer->size, pMemoryRequirements->alignment);
}
}
}
-static unsigned
-tu_shader_nibo(const struct tu_shader *shader)
-{
- /* Don't use ir3_shader_nibo(), because that would include declared but
- * unused storage images and SSBOs.
- */
- return shader->ssbo_map.num_desc + shader->image_map.num_desc;
+static uint32_t
+emit_xs_config(const struct ir3_shader_variant *sh)
+{
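+ /* Note: this helper is shared by every stage. It assumes the ENABLED and
+ * BINDLESS_* bits sit at the same positions in SP_HS/DS/GS/FS/CS_CONFIG as
+ * in SP_VS_CONFIG, so the VS register definitions are reused for all of
+ * them.
+ */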
+ if (sh->instrlen) {
+ return A6XX_SP_VS_CONFIG_ENABLED |
+ COND(sh->bindless_tex, A6XX_SP_VS_CONFIG_BINDLESS_TEX) |
+ COND(sh->bindless_samp, A6XX_SP_VS_CONFIG_BINDLESS_SAMP) |
+ COND(sh->bindless_ibo, A6XX_SP_VS_CONFIG_BINDLESS_IBO) |
+ COND(sh->bindless_ubo, A6XX_SP_VS_CONFIG_BINDLESS_UBO);
+ } else {
+ return 0;
+ }
}
static void
if (vs->need_fine_derivatives)
sp_vs_ctrl |= A6XX_SP_VS_CTRL_REG0_DIFF_FINE;
- uint32_t sp_vs_config = A6XX_SP_VS_CONFIG_NTEX(shader->texture_map.num_desc) |
- A6XX_SP_VS_CONFIG_NSAMP(shader->sampler_map.num_desc);
- if (vs->instrlen)
- sp_vs_config |= A6XX_SP_VS_CONFIG_ENABLED;
-
tu_cs_emit_pkt4(cs, REG_A6XX_SP_VS_CTRL_REG0, 1);
tu_cs_emit(cs, sp_vs_ctrl);
tu_cs_emit_pkt4(cs, REG_A6XX_SP_VS_CONFIG, 2);
- tu_cs_emit(cs, sp_vs_config);
+ tu_cs_emit(cs, emit_xs_config(vs));
tu_cs_emit(cs, vs->instrlen);
tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_VS_CNTL, 1);
tu6_emit_hs_config(struct tu_cs *cs, struct tu_shader *shader,
const struct ir3_shader_variant *hs)
{
- uint32_t sp_hs_config = 0;
- if (hs->instrlen)
- sp_hs_config |= A6XX_SP_HS_CONFIG_ENABLED;
-
tu_cs_emit_pkt4(cs, REG_A6XX_SP_HS_UNKNOWN_A831, 1);
tu_cs_emit(cs, 0);
tu_cs_emit_pkt4(cs, REG_A6XX_SP_HS_CONFIG, 2);
- tu_cs_emit(cs, sp_hs_config);
+ tu_cs_emit(cs, emit_xs_config(hs));
tu_cs_emit(cs, hs->instrlen);
tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_HS_CNTL, 1);
tu6_emit_ds_config(struct tu_cs *cs, struct tu_shader *shader,
const struct ir3_shader_variant *ds)
{
- uint32_t sp_ds_config = 0;
- if (ds->instrlen)
- sp_ds_config |= A6XX_SP_DS_CONFIG_ENABLED;
-
tu_cs_emit_pkt4(cs, REG_A6XX_SP_DS_CONFIG, 2);
- tu_cs_emit(cs, sp_ds_config);
+ tu_cs_emit(cs, emit_xs_config(ds));
tu_cs_emit(cs, ds->instrlen);
tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_DS_CNTL, 1);
tu_cs_emit(cs, 0);
tu_cs_emit_pkt4(cs, REG_A6XX_SP_GS_CONFIG, 2);
- tu_cs_emit(cs, COND(has_gs,
- A6XX_SP_GS_CONFIG_ENABLED |
- A6XX_SP_GS_CONFIG_NIBO(ir3_shader_nibo(gs)) |
- A6XX_SP_GS_CONFIG_NTEX(gs->num_samp) |
- A6XX_SP_GS_CONFIG_NSAMP(gs->num_samp)));
+ tu_cs_emit(cs, emit_xs_config(gs));
tu_cs_emit(cs, gs->instrlen);
tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_GS_CNTL, 1);
if (fs->need_fine_derivatives)
sp_fs_ctrl |= A6XX_SP_FS_CTRL_REG0_DIFF_FINE;
- uint32_t sp_fs_config = 0;
- unsigned shader_nibo = 0;
- if (shader) {
- shader_nibo = tu_shader_nibo(shader);
- sp_fs_config = A6XX_SP_FS_CONFIG_NTEX(shader->texture_map.num_desc) |
- A6XX_SP_FS_CONFIG_NSAMP(shader->sampler_map.num_desc) |
- A6XX_SP_FS_CONFIG_NIBO(shader_nibo);
- }
-
- if (fs->instrlen)
- sp_fs_config |= A6XX_SP_FS_CONFIG_ENABLED;
-
tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_CTRL_REG0, 1);
tu_cs_emit(cs, sp_fs_ctrl);
tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_CONFIG, 2);
- tu_cs_emit(cs, sp_fs_config);
+ tu_cs_emit(cs, emit_xs_config(fs));
tu_cs_emit(cs, fs->instrlen);
tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_FS_CNTL, 1);
tu_cs_emit(cs, A6XX_HLSQ_FS_CNTL_CONSTLEN(align(fs->constlen, 4)) |
A6XX_HLSQ_FS_CNTL_ENABLED);
-
- tu_cs_emit_pkt4(cs, REG_A6XX_SP_IBO_COUNT, 1);
- tu_cs_emit(cs, shader_nibo);
}
static void
A6XX_HLSQ_CS_CNTL_ENABLED);
tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_CONFIG, 2);
- tu_cs_emit(cs, A6XX_SP_CS_CONFIG_ENABLED |
- A6XX_SP_CS_CONFIG_NIBO(tu_shader_nibo(shader)) |
- A6XX_SP_CS_CONFIG_NTEX(shader->texture_map.num_desc) |
- A6XX_SP_CS_CONFIG_NSAMP(shader->sampler_map.num_desc));
+ tu_cs_emit(cs, emit_xs_config(v));
tu_cs_emit(cs, v->instrlen);
tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_CTRL_REG0, 1);
A6XX_HLSQ_CS_CNTL_0_UNK1(regid(63, 0)) |
A6XX_HLSQ_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
tu_cs_emit(cs, 0x2fc); /* HLSQ_CS_UNKNOWN_B998 */
-
- tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_IBO_COUNT, 1);
- tu_cs_emit(cs, tu_shader_nibo(shader));
}
static void
A6XX_SP_FS_PREFETCH_CMD_CMD(prefetch->cmd));
}
+ if (fs->num_sampler_prefetch > 0) {
+ tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_BINDLESS_PREFETCH_CMD(0), fs->num_sampler_prefetch);
+ for (int i = 0; i < fs->num_sampler_prefetch; i++) {
+ const struct ir3_sampler_prefetch *prefetch = &fs->sampler_prefetch[i];
+ tu_cs_emit(cs,
+ A6XX_SP_FS_BINDLESS_PREFETCH_CMD_SAMP_ID(prefetch->samp_bindless_id) |
+ A6XX_SP_FS_BINDLESS_PREFETCH_CMD_TEX_ID(prefetch->tex_bindless_id));
+ }
+ }
+
tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CONTROL_1_REG, 5);
tu_cs_emit(cs, 0x7);
tu_cs_emit(cs, A6XX_HLSQ_CONTROL_2_REG_FACEREGID(face_regid) |
link->const_state = v->shader->const_state;
link->constlen = v->constlen;
link->push_consts = shader->push_consts;
- link->texture_map = shader->texture_map;
- link->sampler_map = shader->sampler_map;
- link->ubo_map = shader->ubo_map;
- link->ssbo_map = shader->ssbo_map;
- link->image_map = shader->image_map;
}
static void
builder->shaders[i],
&builder->shaders[i]->variants[0]);
}
+
+ if (builder->shaders[MESA_SHADER_FRAGMENT]) {
+ memcpy(pipeline->program.input_attachment_idx,
+ builder->shaders[MESA_SHADER_FRAGMENT]->attachment_idx,
+ sizeof(pipeline->program.input_attachment_idx));
+ }
}
static void
if (result != VK_SUCCESS)
return result;
+ (*pipeline)->layout = builder->layout;
+
/* compile and upload shaders */
result = tu_pipeline_builder_compile_shaders(builder);
if (result == VK_SUCCESS)
#define MAX_VIEWS 8
/* The Qualcomm driver exposes 0x20000058 */
#define MAX_STORAGE_BUFFER_RANGE 0x20000000
+/* We use ldc for uniform buffer loads, just like the Qualcomm driver, so
+ * expose the same maximum range.
+ * TODO: The SIZE bitfield is 15 bits and counts 4-dword units, so the actual
+ * maximum range may be larger.
+ */
+#define MAX_UNIFORM_BUFFER_RANGE 0x10000
#define NUM_DEPTH_CLEAR_PIPELINES 3
struct tu_descriptor_set
{
const struct tu_descriptor_set_layout *layout;
+ struct tu_descriptor_pool *pool;
uint32_t size;
uint64_t va;
uint32_t *mapped_ptr;
- struct tu_descriptor_range *dynamic_descriptors;
- struct tu_bo *descriptors[0];
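+   /* Raw descriptor words for the dynamic buffers, A6XX_TEX_CONST_DWORDS
+    * dwords per descriptor.
+    */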
+ uint32_t *dynamic_descriptors;
+
+ struct tu_bo *buffers[0];
};
struct tu_push_descriptor_set
uint32_t valid;
struct tu_push_descriptor_set push_set;
bool push_dirty;
- uint64_t dynamic_buffers[MAX_DYNAMIC_BUFFERS];
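+   /* One A6XX_TEX_CONST_DWORDS-dword descriptor per dynamic buffer and per
+    * input attachment.
+    */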
+ uint32_t dynamic_descriptors[MAX_DYNAMIC_BUFFERS * A6XX_TEX_CONST_DWORDS];
+ uint32_t input_attachments[MAX_RTS * A6XX_TEX_CONST_DWORDS];
};
struct tu_tile
TU_CMD_DIRTY_COMPUTE_PIPELINE = 1 << 1,
TU_CMD_DIRTY_VERTEX_BUFFERS = 1 << 2,
TU_CMD_DIRTY_DESCRIPTOR_SETS = 1 << 3,
- TU_CMD_DIRTY_PUSH_CONSTANTS = 1 << 4,
- TU_CMD_DIRTY_STREAMOUT_BUFFERS = 1 << 5,
+ TU_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS = 1 << 4,
+ TU_CMD_DIRTY_PUSH_CONSTANTS = 1 << 5,
+ TU_CMD_DIRTY_STREAMOUT_BUFFERS = 1 << 6,
+ TU_CMD_DIRTY_INPUT_ATTACHMENTS = 1 << 7,
TU_CMD_DIRTY_DYNAMIC_LINE_WIDTH = 1 << 16,
TU_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK = 1 << 17,
bool include_binning_pass;
};
-struct tu_descriptor_map
-{
- /* TODO: avoid fixed size array/justify the size */
- unsigned num; /* number of array entries */
- unsigned num_desc; /* Number of descriptors (sum of array_size[]) */
- int set[128];
- int binding[128];
- int value[128];
- int array_size[128];
-};
-
struct tu_push_constant_range
{
uint32_t lo;
struct ir3_shader ir3_shader;
struct tu_push_constant_range push_consts;
- struct tu_descriptor_map texture_map;
- struct tu_descriptor_map sampler_map;
- struct tu_descriptor_map ubo_map;
- struct tu_descriptor_map ssbo_map;
- struct tu_descriptor_map image_map;
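+   /* InputAttachmentIndex for each input attachment descriptor, filled in
+    * by gather_input_attachments().
+    */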
+ unsigned attachment_idx[MAX_RTS];
/* This may be true for vertex shaders. When true, variants[1] is the
* binning variant and binning_binary is non-NULL.
uint32_t constlen;
struct tu_push_constant_range push_consts;
- struct tu_descriptor_map texture_map;
- struct tu_descriptor_map sampler_map;
- struct tu_descriptor_map ubo_map;
- struct tu_descriptor_map ssbo_map;
- struct tu_descriptor_map image_map;
};
struct tu_pipeline
struct tu_cs_entry binning_state_ib;
struct tu_program_descriptor_linkage link[MESA_SHADER_STAGES];
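+      /* Copied from the fragment shader's attachment_idx when the pipeline
+       * is built.
+       */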
+ unsigned input_attachment_idx[MAX_RTS];
} program;
struct
return nir;
}
-static unsigned
-map_add(struct tu_descriptor_map *map, int set, int binding, int value,
- int array_size)
-{
- unsigned index = 0;
- for (unsigned i = 0; i < map->num; i++) {
- if (set == map->set[i] && binding == map->binding[i]) {
- assert(value == map->value[i]);
- assert(array_size == map->array_size[i]);
- return index;
- }
- index += map->array_size[i];
- }
-
- assert(index == map->num_desc);
-
- map->set[map->num] = set;
- map->binding[map->num] = binding;
- map->value[map->num] = value;
- map->array_size[map->num] = array_size;
- map->num++;
- map->num_desc += array_size;
-
- return index;
-}
-
-static void
-lower_tex_src_to_offset(nir_builder *b, nir_tex_instr *instr, unsigned src_idx,
- struct tu_shader *shader,
- const struct tu_pipeline_layout *layout)
-{
- nir_ssa_def *index = NULL;
- unsigned base_index = 0;
- unsigned array_elements = 1;
- nir_tex_src *src = &instr->src[src_idx];
- bool is_sampler = src->src_type == nir_tex_src_sampler_deref;
-
- /* We compute first the offsets */
- nir_deref_instr *deref = nir_instr_as_deref(src->src.ssa->parent_instr);
- while (deref->deref_type != nir_deref_type_var) {
- assert(deref->parent.is_ssa);
- nir_deref_instr *parent =
- nir_instr_as_deref(deref->parent.ssa->parent_instr);
-
- assert(deref->deref_type == nir_deref_type_array);
-
- if (nir_src_is_const(deref->arr.index) && index == NULL) {
- /* We're still building a direct index */
- base_index += nir_src_as_uint(deref->arr.index) * array_elements;
- } else {
- if (index == NULL) {
- /* We used to be direct but not anymore */
- index = nir_imm_int(b, base_index);
- base_index = 0;
- }
-
- index = nir_iadd(b, index,
- nir_imul(b, nir_imm_int(b, array_elements),
- nir_ssa_for_src(b, deref->arr.index, 1)));
- }
-
- array_elements *= glsl_get_length(parent->type);
-
- deref = parent;
- }
-
- if (index)
- index = nir_umin(b, index, nir_imm_int(b, array_elements - 1));
-
- /* We have the offsets, we apply them, rewriting the source or removing
- * instr if needed
- */
- if (index) {
- nir_instr_rewrite_src(&instr->instr, &src->src,
- nir_src_for_ssa(index));
-
- src->src_type = is_sampler ?
- nir_tex_src_sampler_offset :
- nir_tex_src_texture_offset;
- } else {
- nir_tex_instr_remove_src(instr, src_idx);
- }
-
- uint32_t set = deref->var->data.descriptor_set;
- uint32_t binding = deref->var->data.binding;
- struct tu_descriptor_set_layout *set_layout = layout->set[set].layout;
- struct tu_descriptor_set_binding_layout *binding_layout =
- &set_layout->binding[binding];
-
- int desc_index = map_add(is_sampler ?
- &shader->sampler_map : &shader->texture_map,
- deref->var->data.descriptor_set,
- deref->var->data.binding,
- deref->var->data.index,
- binding_layout->array_size) + base_index;
- if (is_sampler)
- instr->sampler_index = desc_index;
- else
- instr->texture_index = desc_index;
-}
-
-static bool
-lower_sampler(nir_builder *b, nir_tex_instr *instr, struct tu_shader *shader,
- const struct tu_pipeline_layout *layout)
-{
- int texture_idx =
- nir_tex_instr_src_index(instr, nir_tex_src_texture_deref);
-
- if (texture_idx >= 0)
- lower_tex_src_to_offset(b, instr, texture_idx, shader, layout);
-
- int sampler_idx =
- nir_tex_instr_src_index(instr, nir_tex_src_sampler_deref);
-
- if (sampler_idx >= 0)
- lower_tex_src_to_offset(b, instr, sampler_idx, shader, layout);
-
- if (texture_idx < 0 && sampler_idx < 0)
- return false;
-
- return true;
-}
-
static void
lower_load_push_constant(nir_builder *b, nir_intrinsic_instr *instr,
struct tu_shader *shader)
struct tu_shader *shader,
const struct tu_pipeline_layout *layout)
{
- nir_const_value *const_val = nir_src_as_const_value(instr->src[0]);
+ nir_ssa_def *vulkan_idx = instr->src[0].ssa;
unsigned set = nir_intrinsic_desc_set(instr);
unsigned binding = nir_intrinsic_binding(instr);
struct tu_descriptor_set_layout *set_layout = layout->set[set].layout;
struct tu_descriptor_set_binding_layout *binding_layout =
&set_layout->binding[binding];
- unsigned index = 0;
+ uint32_t base;
- switch (nir_intrinsic_desc_type(instr)) {
- case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
+ switch (binding_layout->type) {
case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
- if (!const_val)
- tu_finishme("non-constant vulkan_resource_index array index");
- /* skip index 0 which is used for push constants */
- index = map_add(&shader->ubo_map, set, binding, 0,
- binding_layout->array_size) + 1;
- index += const_val->u32;
- break;
- case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
- if (!const_val)
- tu_finishme("non-constant vulkan_resource_index array index");
- index = map_add(&shader->ssbo_map, set, binding, 0,
- binding_layout->array_size);
- index += const_val->u32;
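+      /* Dynamic buffers live in the reserved set (MAX_SETS), after the input
+       * attachments.
+       */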
+ base = layout->set[set].dynamic_offset_start +
+ binding_layout->dynamic_offset_offset +
+ layout->input_attachment_count;
+ set = MAX_SETS;
break;
default:
- tu_finishme("unsupported desc_type for vulkan_resource_index");
+ base = binding_layout->offset / (4 * A6XX_TEX_CONST_DWORDS);
break;
}
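+   /* Emit a bindless_resource_ir3 intrinsic so the backend knows which
+    * descriptor set the index refers to.
+    */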
+ nir_intrinsic_instr *bindless =
+ nir_intrinsic_instr_create(b->shader,
+ nir_intrinsic_bindless_resource_ir3);
+ bindless->num_components = 1;
+ nir_ssa_dest_init(&bindless->instr, &bindless->dest,
+ 1, 32, NULL);
+ nir_intrinsic_set_desc_set(bindless, set);
+ bindless->src[0] = nir_src_for_ssa(nir_iadd(b, nir_imm_int(b, base), vulkan_idx));
+ nir_builder_instr_insert(b, &bindless->instr);
+
nir_ssa_def_rewrite_uses(&instr->dest.ssa,
- nir_src_for_ssa(nir_imm_int(b, index)));
+ nir_src_for_ssa(&bindless->dest.ssa));
nir_instr_remove(&instr->instr);
}
-static void
-lower_image_deref(nir_builder *b,
- nir_intrinsic_instr *instr, struct tu_shader *shader,
- const struct tu_pipeline_layout *layout)
+static nir_ssa_def *
+build_bindless(nir_builder *b, nir_deref_instr *deref, bool is_sampler,
+ struct tu_shader *shader,
+ const struct tu_pipeline_layout *layout)
{
- nir_deref_instr *deref = nir_src_as_deref(instr->src[0]);
nir_variable *var = nir_deref_instr_get_variable(deref);
- uint32_t set = var->data.descriptor_set;
- uint32_t binding = var->data.binding;
- struct tu_descriptor_set_layout *set_layout = layout->set[set].layout;
- struct tu_descriptor_set_binding_layout *binding_layout =
- &set_layout->binding[binding];
+ unsigned set = var->data.descriptor_set;
+ unsigned binding = var->data.binding;
+ const struct tu_descriptor_set_binding_layout *bind_layout =
+ &layout->set[set].layout->binding[binding];
+
+ nir_ssa_def *desc_offset;
+ unsigned descriptor_stride;
+ if (bind_layout->type == VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT) {
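+      /* Input attachments also go in the reserved set; each one is a single
+       * texture descriptor, hence a stride of 1.
+       */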
+ unsigned offset =
+ layout->set[set].input_attachment_start +
+ bind_layout->input_attachment_offset;
+ desc_offset = nir_imm_int(b, offset);
+ set = MAX_SETS;
+ descriptor_stride = 1;
+ } else {
+ unsigned offset = 0;
+      /* Samplers come second in combined image/sampler descriptors; see
+       * write_combined_image_sampler_descriptor().
+       */
+ if (is_sampler && bind_layout->type ==
+ VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) {
+ offset = 1;
+ }
+ desc_offset =
+ nir_imm_int(b, (bind_layout->offset / (4 * A6XX_TEX_CONST_DWORDS)) +
+ offset);
+ descriptor_stride = bind_layout->size / (4 * A6XX_TEX_CONST_DWORDS);
+ }
- nir_ssa_def *index = nir_imm_int(b,
- map_add(&shader->image_map,
- set, binding, var->data.index,
- binding_layout->array_size));
if (deref->deref_type != nir_deref_type_var) {
assert(deref->deref_type == nir_deref_type_array);
- index = nir_iadd(b, index, nir_ssa_for_src(b, deref->arr.index, 1));
+
+ nir_ssa_def *arr_index = nir_ssa_for_src(b, deref->arr.index, 1);
+ desc_offset = nir_iadd(b, desc_offset,
+ nir_imul_imm(b, arr_index, descriptor_stride));
}
- nir_rewrite_image_intrinsic(instr, index, false);
+
+ nir_intrinsic_instr *bindless =
+ nir_intrinsic_instr_create(b->shader,
+ nir_intrinsic_bindless_resource_ir3);
+ bindless->num_components = 1;
+ nir_ssa_dest_init(&bindless->instr, &bindless->dest,
+ 1, 32, NULL);
+ nir_intrinsic_set_desc_set(bindless, set);
+ bindless->src[0] = nir_src_for_ssa(desc_offset);
+ nir_builder_instr_insert(b, &bindless->instr);
+
+ return &bindless->dest.ssa;
+}
+
+static void
+lower_image_deref(nir_builder *b,
+ nir_intrinsic_instr *instr, struct tu_shader *shader,
+ const struct tu_pipeline_layout *layout)
+{
+ nir_deref_instr *deref = nir_src_as_deref(instr->src[0]);
+ nir_ssa_def *bindless = build_bindless(b, deref, false, shader, layout);
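+   /* The 'true' tells nir_rewrite_image_intrinsic() that the new source is a
+    * bindless handle rather than an index.
+    */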
+ nir_rewrite_image_intrinsic(instr, bindless, true);
}
static bool
case nir_intrinsic_image_deref_atomic_comp_swap:
case nir_intrinsic_image_deref_size:
case nir_intrinsic_image_deref_samples:
- case nir_intrinsic_image_deref_load_param_intel:
- case nir_intrinsic_image_deref_load_raw_intel:
- case nir_intrinsic_image_deref_store_raw_intel:
lower_image_deref(b, instr, shader, layout);
return true;
}
}
+static bool
+lower_tex(nir_builder *b, nir_tex_instr *tex,
+ struct tu_shader *shader, const struct tu_pipeline_layout *layout)
+{
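+   /* Rewrite texture/sampler derefs to bindless handles. */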
+ int sampler_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_sampler_deref);
+ if (sampler_src_idx >= 0) {
+ nir_deref_instr *deref = nir_src_as_deref(tex->src[sampler_src_idx].src);
+ nir_ssa_def *bindless = build_bindless(b, deref, true, shader, layout);
+ nir_instr_rewrite_src(&tex->instr, &tex->src[sampler_src_idx].src,
+ nir_src_for_ssa(bindless));
+ tex->src[sampler_src_idx].src_type = nir_tex_src_sampler_handle;
+ }
+
+ int tex_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_deref);
+ if (tex_src_idx >= 0) {
+ nir_deref_instr *deref = nir_src_as_deref(tex->src[tex_src_idx].src);
+ nir_ssa_def *bindless = build_bindless(b, deref, false, shader, layout);
+ nir_instr_rewrite_src(&tex->instr, &tex->src[tex_src_idx].src,
+ nir_src_for_ssa(bindless));
+ tex->src[tex_src_idx].src_type = nir_tex_src_texture_handle;
+ }
+
+ return true;
+}
+
+static bool
+lower_impl(nir_function_impl *impl, struct tu_shader *shader,
+ const struct tu_pipeline_layout *layout)
+{
+ nir_builder b;
+ nir_builder_init(&b, impl);
+ bool progress = false;
+
+ nir_foreach_block(block, impl) {
+ nir_foreach_instr_safe(instr, block) {
+ b.cursor = nir_before_instr(instr);
+ switch (instr->type) {
+ case nir_instr_type_tex:
+ progress |= lower_tex(&b, nir_instr_as_tex(instr), shader, layout);
+ break;
+ case nir_instr_type_intrinsic:
+ progress |= lower_intrinsic(&b, nir_instr_as_intrinsic(instr), shader, layout);
+ break;
+ default:
+ break;
+ }
+ }
+ }
+
+ return progress;
+}
+
/* Figure out the range of push constants that we're actually going to push to
* the shader, and tell the backend to reserve this range when pushing UBO
* constants.
align(tu_shader->push_consts.count, 4);
}
-static bool
-lower_impl(nir_function_impl *impl, struct tu_shader *shader,
- const struct tu_pipeline_layout *layout)
+/* Gather the InputAttachmentIndex for each input attachment from the NIR
+ * shader and organize the info so that draw-time patching is easy.
+ */
+static void
+gather_input_attachments(nir_shader *shader, struct tu_shader *tu_shader,
+ const struct tu_pipeline_layout *layout)
{
- nir_builder b;
- nir_builder_init(&b, impl);
- bool progress = false;
+ nir_foreach_variable(var, &shader->uniforms) {
+ const struct glsl_type *glsl_type = glsl_without_array(var->type);
- nir_foreach_block(block, impl) {
- nir_foreach_instr_safe(instr, block) {
- b.cursor = nir_before_instr(instr);
- switch (instr->type) {
- case nir_instr_type_tex:
- progress |= lower_sampler(&b, nir_instr_as_tex(instr), shader, layout);
- break;
- case nir_instr_type_intrinsic:
- progress |= lower_intrinsic(&b, nir_instr_as_intrinsic(instr), shader, layout);
- break;
- default:
- break;
- }
+ if (!glsl_type_is_image(glsl_type))
+ continue;
+
+ enum glsl_sampler_dim dim = glsl_get_sampler_dim(glsl_type);
+
+ const uint32_t set = var->data.descriptor_set;
+ const uint32_t binding = var->data.binding;
+ const struct tu_descriptor_set_binding_layout *bind_layout =
+ &layout->set[set].layout->binding[binding];
+ const uint32_t array_size = bind_layout->array_size;
+
+ if (dim == GLSL_SAMPLER_DIM_SUBPASS ||
+ dim == GLSL_SAMPLER_DIM_SUBPASS_MS) {
+ unsigned offset =
+ layout->set[set].input_attachment_start +
+ bind_layout->input_attachment_offset;
+ for (unsigned i = 0; i < array_size; i++)
+ tu_shader->attachment_idx[offset + i] = var->data.index + i;
}
}
-
- return progress;
}
static bool
bool progress = false;
gather_push_constants(shader, tu_shader);
+ gather_input_attachments(shader, tu_shader, layout);
nir_foreach_function(function, shader) {
if (function->impl)
progress |= lower_impl(function->impl, tu_shader, layout);
}
- /* spirv_to_nir produces num_ssbos equal to the number of SSBO-containing
- * variables, while ir3 wants the number of descriptors (like the gallium
- * path).
- */
- shader->info.num_ssbos = tu_shader->ssbo_map.num_desc;
-
return progress;
}