From 159a1300ceb5752dda116b07a560e351c0324886 Mon Sep 17 00:00:00 2001
From: Jonathan Marek
Date: Sun, 14 Jun 2020 23:10:01 -0400
Subject: [PATCH] turnip: input attachment descriptor set rework

Implement GMEM input attachments by using non-bindless texture state,
which is emitted at the start of every subpass.

This achieves two things:
* A more Vulkan-like CmdBindDescriptorSets
* Fixed input attachments in secondary command buffers with GMEM

Signed-off-by: Jonathan Marek
Part-of:
---
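
Notes (kept below the "---" so they stay out of the commit message): the
new scheme emits plain, non-bindless FS texture state for the current
subpass's input attachments. tu_emit_input_attachments() writes
subpass->input_count * 2 texture descriptors and points the FS texture
state at them (CP_LOAD_STATE6_FRAG with SB6_FS_TEX), while tu_shader
lowers subpass loads to plain texture indices. A minimal sketch of the
slot math, with a hypothetical helper name (the driver open-codes this):

  static unsigned
  input_attachment_tex_slot(unsigned input_idx, bool stencil_as_uint)
  {
     /* slot 2*i holds the attachment's own format; slot 2*i+1 holds the
      * S8Z24_UINT view that uint (stencil) loads are lowered to use */
     return input_idx * 2 + (stencil_as_uint ? 1 : 0);
  }
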
 src/freedreno/vulkan/tu_cmd_buffer.c     | 404 +++++++++--------------
 src/freedreno/vulkan/tu_descriptor_set.c |  64 +---
 src/freedreno/vulkan/tu_descriptor_set.h |   8 -
 src/freedreno/vulkan/tu_pipeline.c       |  17 +-
 src/freedreno/vulkan/tu_private.h        |  10 +-
 src/freedreno/vulkan/tu_shader.c         |  91 ++---
 6 files changed, 211 insertions(+), 383 deletions(-)

diff --git a/src/freedreno/vulkan/tu_cmd_buffer.c b/src/freedreno/vulkan/tu_cmd_buffer.c
index affd1b3e9db..2c8a3411dc3 100644
--- a/src/freedreno/vulkan/tu_cmd_buffer.c
+++ b/src/freedreno/vulkan/tu_cmd_buffer.c
@@ -707,13 +707,6 @@ tu_cs_emit_draw_state(struct tu_cs *cs, uint32_t id, struct tu_draw_state state)
    case TU_DRAW_STATE_VI_BINNING:
       enable_mask = CP_SET_DRAW_STATE__0_BINNING;
       break;
-   case TU_DRAW_STATE_DESC_SETS_GMEM:
-      enable_mask = CP_SET_DRAW_STATE__0_GMEM;
-      break;
-   case TU_DRAW_STATE_DESC_SETS_SYSMEM:
-      enable_mask = CP_SET_DRAW_STATE__0_BINNING |
-                    CP_SET_DRAW_STATE__0_SYSMEM;
-      break;
    default:
       enable_mask = CP_SET_DRAW_STATE__0_GMEM |
                     CP_SET_DRAW_STATE__0_SYSMEM |
@@ -1263,8 +1256,91 @@ tu6_emit_binning_pass(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
 }
 
 static void
-tu_emit_load_clear(struct tu_cmd_buffer *cmd,
-                   const VkRenderPassBeginInfo *info)
+tu_emit_input_attachments(struct tu_cmd_buffer *cmd,
+                          const struct tu_subpass *subpass,
+                          bool gmem)
+{
+   /* note: we can probably emit input attachments just once for the whole
+    * renderpass; this would avoid emitting both the sysmem and gmem versions
+    *
+    * emit two texture descriptors for each input, as a workaround for
+    * d24s8, which can be sampled as both float (depth) and integer (stencil):
+    * tu_shader lowers uint input attachment loads to use the 2nd descriptor
+    * in the pair
+    * TODO: a smarter workaround
+    */
+
+   if (!subpass->input_count)
+      return;
+
+   struct ts_cs_memory texture;
+   VkResult result = tu_cs_alloc(&cmd->sub_cs, subpass->input_count * 2,
+                                 A6XX_TEX_CONST_DWORDS, &texture);
+   assert(result == VK_SUCCESS);
+
+   for (unsigned i = 0; i < subpass->input_count * 2; i++) {
+      uint32_t a = subpass->input_attachments[i / 2].attachment;
+      if (a == VK_ATTACHMENT_UNUSED)
+         continue;
+
+      struct tu_image_view *iview =
+         cmd->state.framebuffer->attachments[a].attachment;
+      const struct tu_render_pass_attachment *att =
+         &cmd->state.pass->attachments[a];
+      uint32_t *dst = &texture.map[A6XX_TEX_CONST_DWORDS * i];
+
+      memcpy(dst, iview->descriptor, A6XX_TEX_CONST_DWORDS * 4);
+
+      if (i % 2 == 1 && att->format == VK_FORMAT_D24_UNORM_S8_UINT) {
+         /* note this works because spec says fb and input attachments
+          * must use identity swizzle
+          */
+         dst[0] &= ~(A6XX_TEX_CONST_0_FMT__MASK |
+            A6XX_TEX_CONST_0_SWIZ_X__MASK | A6XX_TEX_CONST_0_SWIZ_Y__MASK |
+            A6XX_TEX_CONST_0_SWIZ_Z__MASK | A6XX_TEX_CONST_0_SWIZ_W__MASK);
+         dst[0] |= A6XX_TEX_CONST_0_FMT(FMT6_S8Z24_UINT) |
+            A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_Y) |
+            A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_ZERO) |
+            A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_ZERO) |
+            A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_ONE);
+      }
+
+      if (!gmem)
+         continue;
+
+      /* patched for gmem */
+      dst[0] &= ~(A6XX_TEX_CONST_0_SWAP__MASK | A6XX_TEX_CONST_0_TILE_MODE__MASK);
+      dst[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2);
+      dst[2] &= ~(A6XX_TEX_CONST_2_TYPE__MASK | A6XX_TEX_CONST_2_PITCH__MASK);
+      dst[2] |=
+         A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) |
+         A6XX_TEX_CONST_2_PITCH(cmd->state.tiling_config.tile0.extent.width * att->cpp);
+      dst[3] = 0;
+      dst[4] = cmd->device->physical_device->gmem_base + att->gmem_offset;
+      dst[5] = A6XX_TEX_CONST_5_DEPTH(1);
+      for (unsigned i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
+         dst[i] = 0;
+   }
+
+   struct tu_cs *cs = &cmd->draw_cs;
+
+   tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
+   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
+                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
+                  CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
+                  CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
+                  CP_LOAD_STATE6_0_NUM_UNIT(subpass->input_count * 2));
+   tu_cs_emit_qw(cs, texture.iova);
+
+   tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_TEX_CONST_LO, 2);
+   tu_cs_emit_qw(cs, texture.iova);
+
+   tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_COUNT(subpass->input_count * 2));
+}
+
+static void
+tu_emit_renderpass_begin(struct tu_cmd_buffer *cmd,
+                         const VkRenderPassBeginInfo *info)
 {
    struct tu_cs *cs = &cmd->draw_cs;
 
@@ -1280,6 +1356,8 @@ tu_emit_load_clear(struct tu_cmd_buffer *cmd,
    for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
       tu_clear_gmem_attachment(cmd, cs, i, info);
 
+   tu_emit_input_attachments(cmd, cmd->state.subpass, true);
+
    tu_cond_exec_end(cs);
 
    tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
 
@@ -1287,6 +1365,8 @@
    for (uint32_t i = 0; i < cmd->state.pass->attachment_count; ++i)
       tu_clear_sysmem_attachment(cmd, cs, i, info);
 
+   tu_emit_input_attachments(cmd, cmd->state.subpass, false);
+
    tu_cond_exec_end(cs);
 }
 
@@ -1343,7 +1423,6 @@ tu6_sysmem_render_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
    tu_cs_sanity_check(cs);
 }
 
-
 static void
 tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
 {
@@ -1575,9 +1654,6 @@ tu_cmd_buffer_destroy(struct tu_cmd_buffer *cmd_buffer)
 
    list_del(&cmd_buffer->pool_link);
 
-   for (unsigned i = 0; i < MAX_BIND_POINTS; i++)
-      free(cmd_buffer->descriptors[i].push_set.set.mapped_ptr);
-
    tu_cs_finish(&cmd_buffer->cs);
    tu_cs_finish(&cmd_buffer->draw_cs);
    tu_cs_finish(&cmd_buffer->draw_epilogue_cs);
@@ -1598,10 +1674,8 @@ tu_reset_cmd_buffer(struct tu_cmd_buffer *cmd_buffer)
    tu_cs_reset(&cmd_buffer->draw_epilogue_cs);
    tu_cs_reset(&cmd_buffer->sub_cs);
 
-   for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
-      cmd_buffer->descriptors[i].valid = 0;
-      cmd_buffer->descriptors[i].push_dirty = false;
-   }
+   for (unsigned i = 0; i < MAX_BIND_POINTS; i++)
+      memset(&cmd_buffer->descriptors[i].sets, 0, sizeof(cmd_buffer->descriptors[i].sets));
 
    cmd_buffer->status = TU_CMD_BUFFER_STATUS_INITIAL;
@@ -1829,31 +1903,10 @@ tu_CmdBindDescriptorSets(VkCommandBuffer commandBuffer,
       TU_FROM_HANDLE(tu_descriptor_set, set, pDescriptorSets[i]);
 
       descriptors_state->sets[idx] = set;
-      descriptors_state->valid |= (1u << idx);
-
-      /* Note: the actual input attachment indices come from the shader
-       * itself, so we can't generate the patched versions of these until
-       * draw time when both the pipeline and descriptors are bound and
-       * we're inside the render pass.
-       */
-      unsigned dst_idx = layout->set[idx].input_attachment_start;
-      memcpy(&descriptors_state->input_attachments[dst_idx * A6XX_TEX_CONST_DWORDS],
-             set->dynamic_descriptors,
-             set->layout->input_attachment_count * A6XX_TEX_CONST_DWORDS * 4);
 
       for(unsigned j = 0; j < set->layout->dynamic_offset_count; ++j, ++dyn_idx) {
-         /* Dynamic buffers come after input attachments in the descriptor set
-          * itself, but due to how the Vulkan descriptor set binding works, we
-          * have to put input attachments and dynamic buffers in separate
-          * buffers in the descriptor_state and then combine them at draw
-          * time. Binding a descriptor set only invalidates the descriptor
-          * sets after it, but if we try to tightly pack the descriptors after
-          * the input attachments then we could corrupt dynamic buffers in the
-          * descriptor set before it, or we'd have to move all the dynamic
-          * buffers over. We just put them into separate buffers to make
-          * binding as well as the later patching of input attachments easy.
-          */
-         unsigned src_idx = j + set->layout->input_attachment_count;
+         /* update the contents of the dynamic descriptor set */
+         unsigned src_idx = j;
          unsigned dst_idx = j + layout->set[idx].dynamic_offset_start;
          assert(dyn_idx < dynamicOffsetCount);
@@ -1894,11 +1947,65 @@ tu_CmdBindDescriptorSets(VkCommandBuffer commandBuffer,
                            MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_DUMP);
          }
       }
+   assert(dyn_idx == dynamicOffsetCount);
+
+   uint32_t sp_bindless_base_reg, hlsq_bindless_base_reg, hlsq_update_value;
+   uint64_t addr[MAX_SETS + 1] = {};
+   struct tu_cs cs;
+
+   for (uint32_t i = 0; i < MAX_SETS; i++) {
+      struct tu_descriptor_set *set = descriptors_state->sets[i];
+      if (set)
+         addr[i] = set->va | 3;
+   }
+
+   if (layout->dynamic_offset_count) {
+      /* allocate and fill out dynamic descriptor set */
+      struct ts_cs_memory dynamic_desc_set;
+      VkResult result = tu_cs_alloc(&cmd->sub_cs, layout->dynamic_offset_count,
+                                    A6XX_TEX_CONST_DWORDS, &dynamic_desc_set);
+      assert(result == VK_SUCCESS);
+
+      memcpy(dynamic_desc_set.map, descriptors_state->dynamic_descriptors,
+             layout->dynamic_offset_count * A6XX_TEX_CONST_DWORDS * 4);
+      addr[MAX_SETS] = dynamic_desc_set.iova | 3;
+   }
+
+   if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS) {
+      sp_bindless_base_reg = REG_A6XX_SP_BINDLESS_BASE(0);
+      hlsq_bindless_base_reg = REG_A6XX_HLSQ_BINDLESS_BASE(0);
+      hlsq_update_value = 0x7c000;
 
-   if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_COMPUTE)
-      cmd->state.dirty |= TU_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS;
-   else
       cmd->state.dirty |= TU_CMD_DIRTY_DESCRIPTOR_SETS | TU_CMD_DIRTY_SHADER_CONSTS;
+   } else {
+      assert(pipelineBindPoint == VK_PIPELINE_BIND_POINT_COMPUTE);
+
+      sp_bindless_base_reg = REG_A6XX_SP_CS_BINDLESS_BASE(0);
+      hlsq_bindless_base_reg = REG_A6XX_HLSQ_CS_BINDLESS_BASE(0);
+      hlsq_update_value = 0x3e00;
+
+      cmd->state.dirty |= TU_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS;
+   }
+
+   tu_cs_begin_sub_stream(&cmd->sub_cs, 24, &cs);
+
+   tu_cs_emit_pkt4(&cs, sp_bindless_base_reg, 10);
+   tu_cs_emit_array(&cs, (const uint32_t*) addr, 10);
+   tu_cs_emit_pkt4(&cs, hlsq_bindless_base_reg, 10);
+   tu_cs_emit_array(&cs, (const uint32_t*) addr, 10);
+   tu_cs_emit_regs(&cs, A6XX_HLSQ_UPDATE_CNTL(.dword = hlsq_update_value));
+
+   struct tu_cs_entry ib = tu_cs_end_sub_stream(&cmd->sub_cs, &cs);
+   if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS) {
+      tu_cs_emit_pkt7(&cmd->draw_cs, CP_SET_DRAW_STATE, 3);
+      tu_cs_emit_sds_ib(&cmd->draw_cs, TU_DRAW_STATE_DESC_SETS, ib);
+      cmd->state.desc_sets_ib = ib;
+   } else {
+      /* note: for compute we could emit directly, instead of a CP_INDIRECT;
+       * however, the blob uses draw states for compute
+       */
+      tu_cs_emit_ib(&cmd->cs, &ib);
+   }
 }
 
 void tu_CmdBindTransformFeedbackBuffersEXT(VkCommandBuffer commandBuffer,
@@ -2111,7 +2218,7 @@ tu_CmdBindPipeline(VkCommandBuffer commandBuffer,
       cmd->state.dirty |= TU_CMD_DIRTY_VERTEX_BUFFERS;
 
    /* If the pipeline needs a dynamic descriptor, re-emit descriptor sets */
-   if (pipeline->layout->dynamic_offset_count + pipeline->layout->input_attachment_count)
+   if (pipeline->layout->dynamic_offset_count)
       cmd->state.dirty |= TU_CMD_DIRTY_DESCRIPTOR_SETS;
 
    /* dynamic linewidth state depends pipeline state's gras_su_cntl
@@ -2666,7 +2773,7 @@ tu_CmdBeginRenderPass(VkCommandBuffer commandBuffer,
       cmd->state.cache.pending_flush_bits;
    cmd->state.renderpass_cache.flush_bits = 0;
 
-   tu_emit_load_clear(cmd, pRenderPassBegin);
+   tu_emit_renderpass_begin(cmd, pRenderPassBegin);
 
    tu6_emit_zs(cmd, cmd->state.subpass, &cmd->draw_cs);
    tu6_emit_mrt(cmd, cmd->state.subpass, &cmd->draw_cs);
@@ -2729,12 +2836,16 @@ tu_CmdNextSubpass(VkCommandBuffer commandBuffer, VkSubpassContents contents)
       }
    }
 
+   tu_emit_input_attachments(cmd, cmd->state.subpass, true);
+
    tu_cond_exec_end(cs);
 
    tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
 
    tu6_emit_sysmem_resolves(cmd, cs, subpass);
 
+   tu_emit_input_attachments(cmd, cmd->state.subpass, false);
+
    tu_cond_exec_end(cs);
 
    /* Handle dependencies for the next subpass */
@@ -2857,14 +2968,6 @@ tu6_emit_user_consts(struct tu_cs *cs, const struct tu_pipeline *pipeline,
          descriptors_state->dynamic_descriptors :
          descriptors_state->sets[state->range[i].bindless_base]->mapped_ptr;
       unsigned block = state->range[i].block;
-      /* If the block in the shader here is in the dynamic descriptor set, it
-       * is an index into the dynamic descriptor set which is combined from
-       * dynamic descriptors and input attachments on-the-fly, and we don't
-       * have access to it here. Instead we work backwards to get the index
-       * into dynamic_descriptors.
-       */
-      if (state->range[i].bindless_base == MAX_SETS)
-         block -= pipeline->layout->input_attachment_count;
       uint32_t *desc = base + block * A6XX_TEX_CONST_DWORDS;
       uint64_t va = desc[0] | ((uint64_t)(desc[1] & A6XX_UBO_1_BASE_HI__MASK) << 32);
       assert(va);
@@ -2957,143 +3060,6 @@ tu6_emit_vertex_buffers(struct tu_cmd_buffer *cmd,
    return tu_cs_end_sub_stream(&cmd->sub_cs, &cs);
 }
 
-static VkResult
-tu6_emit_descriptor_sets(struct tu_cmd_buffer *cmd,
-                         const struct tu_pipeline *pipeline,
-                         VkPipelineBindPoint bind_point,
-                         struct tu_cs_entry *entry,
-                         bool gmem)
-{
-   struct tu_cs *draw_state = &cmd->sub_cs;
-   struct tu_pipeline_layout *layout = pipeline->layout;
-   struct tu_descriptor_state *descriptors_state =
-      tu_get_descriptors_state(cmd, bind_point);
-   const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
-   const uint32_t *input_attachment_idx =
-      pipeline->program.input_attachment_idx;
-   uint32_t num_dynamic_descs = layout->dynamic_offset_count +
-      layout->input_attachment_count;
-   struct ts_cs_memory dynamic_desc_set;
-   VkResult result;
-
-   if (num_dynamic_descs > 0) {
-      /* allocate and fill out dynamic descriptor set */
-      result = tu_cs_alloc(draw_state, num_dynamic_descs,
-                           A6XX_TEX_CONST_DWORDS, &dynamic_desc_set);
-      if (result != VK_SUCCESS)
-         return result;
-
-      memcpy(dynamic_desc_set.map, descriptors_state->input_attachments,
-             layout->input_attachment_count * A6XX_TEX_CONST_DWORDS * 4);
-
-      if (gmem) {
-         /* Patch input attachments to refer to GMEM instead */
-         for (unsigned i = 0; i < layout->input_attachment_count; i++) {
-            uint32_t *dst =
-               &dynamic_desc_set.map[A6XX_TEX_CONST_DWORDS * i];
-
-            /* The compiler has already laid out input_attachment_idx in the
-             * final order of input attachments, so there's no need to go
-             * through the pipeline layout finding input attachments.
-             */
-            unsigned attachment_idx = input_attachment_idx[i];
-
-            /* It's possible for the pipeline layout to include an input
-             * attachment which doesn't actually exist for the current
-             * subpass. Of course, this is only valid so long as the pipeline
-             * doesn't try to actually load that attachment. Just skip
-             * patching in that scenario to avoid out-of-bounds accesses.
-             */
-            if (attachment_idx >= cmd->state.subpass->input_count)
-               continue;
-
-            uint32_t a = cmd->state.subpass->input_attachments[attachment_idx].attachment;
-            const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
-
-            assert(att->gmem_offset >= 0);
-
-            dst[0] &= ~(A6XX_TEX_CONST_0_SWAP__MASK | A6XX_TEX_CONST_0_TILE_MODE__MASK);
-            dst[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2);
-            dst[2] &= ~(A6XX_TEX_CONST_2_TYPE__MASK | A6XX_TEX_CONST_2_PITCH__MASK);
-            dst[2] |=
-               A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) |
-               A6XX_TEX_CONST_2_PITCH(tiling->tile0.extent.width * att->cpp);
-            dst[3] = 0;
-            dst[4] = cmd->device->physical_device->gmem_base + att->gmem_offset;
-            dst[5] = A6XX_TEX_CONST_5_DEPTH(1);
-            for (unsigned i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
-               dst[i] = 0;
-
-            if (cmd->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY)
-               tu_finishme("patch input attachment pitch for secondary cmd buffer");
-         }
-      }
-
-      memcpy(dynamic_desc_set.map + layout->input_attachment_count * A6XX_TEX_CONST_DWORDS,
-             descriptors_state->dynamic_descriptors,
-             layout->dynamic_offset_count * A6XX_TEX_CONST_DWORDS * 4);
-   }
-
-   uint32_t sp_bindless_base_reg, hlsq_bindless_base_reg;
-   uint32_t hlsq_update_value;
-   switch (bind_point) {
-   case VK_PIPELINE_BIND_POINT_GRAPHICS:
-      sp_bindless_base_reg = REG_A6XX_SP_BINDLESS_BASE(0);
-      hlsq_bindless_base_reg = REG_A6XX_HLSQ_BINDLESS_BASE(0);
-      hlsq_update_value = 0x7c000;
-      break;
-   case VK_PIPELINE_BIND_POINT_COMPUTE:
-      sp_bindless_base_reg = REG_A6XX_SP_CS_BINDLESS_BASE(0);
-      hlsq_bindless_base_reg = REG_A6XX_HLSQ_CS_BINDLESS_BASE(0);
-      hlsq_update_value = 0x3e00;
-      break;
-   default:
-      unreachable("bad bind point");
-   }
-
-   /* Be careful here to *not* refer to the pipeline, so that if only the
-    * pipeline changes we don't have to emit this again (except if there are
-    * dynamic descriptors in the pipeline layout). This means always emitting
-    * all the valid descriptors, which means that we always have to put the
-    * dynamic descriptor in the driver-only slot at the end
-    */
-   uint32_t num_user_sets = util_last_bit(descriptors_state->valid);
-   uint32_t num_sets = num_user_sets;
-   if (num_dynamic_descs > 0) {
-      num_user_sets = MAX_SETS;
-      num_sets = num_user_sets + 1;
-   }
-
-   unsigned regs[2] = { sp_bindless_base_reg, hlsq_bindless_base_reg };
-
-   struct tu_cs cs;
-   result = tu_cs_begin_sub_stream(draw_state, ARRAY_SIZE(regs) * (1 + num_sets * 2) + 2, &cs);
-   if (result != VK_SUCCESS)
-      return result;
-
-   if (num_sets > 0) {
-      for (unsigned i = 0; i < ARRAY_SIZE(regs); i++) {
-         tu_cs_emit_pkt4(&cs, regs[i], num_sets * 2);
-         for (unsigned j = 0; j < num_user_sets; j++) {
-            if (descriptors_state->valid & (1 << j)) {
-               /* magic | 3 copied from the blob */
-               tu_cs_emit_qw(&cs, descriptors_state->sets[j]->va | 3);
-            } else {
-               tu_cs_emit_qw(&cs, 0 | 3);
-            }
-         }
-         if (num_dynamic_descs > 0) {
-            tu_cs_emit_qw(&cs, dynamic_desc_set.iova | 3);
-         }
-      }
-
-      tu_cs_emit_regs(&cs, A6XX_HLSQ_UPDATE_CNTL(hlsq_update_value));
-   }
-
-   *entry = tu_cs_end_sub_stream(draw_state, &cs);
-   return VK_SUCCESS;
-}
-
 static void
 tu6_emit_streamout(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
 {
@@ -3184,41 +3150,7 @@ tu6_bind_draw_states(struct tu_cmd_buffer *cmd,
    if (cmd->state.dirty & TU_CMD_DIRTY_STREAMOUT_BUFFERS)
       tu6_emit_streamout(cmd, cs);
 
-   /* If there are any any dynamic descriptors, then we may need to re-emit
-    * them after every pipeline change in case the number of input attachments
-    * changes. We also always need to re-emit after a pipeline change if there
-    * are any input attachments, because the input attachment index comes from
-    * the pipeline. Finally, it can also happen that the subpass changes
-    * without the pipeline changing, in which case the GMEM descriptors need
-    * to be patched differently.
-    *
-    * TODO: We could probably be clever and avoid re-emitting state on
-    * pipeline changes if the number of input attachments is always 0. We
-    * could also only re-emit dynamic state.
-    */
    if (cmd->state.dirty & TU_CMD_DIRTY_DESCRIPTOR_SETS) {
-      bool need_gmem_desc_set = pipeline->layout->input_attachment_count > 0;
-
-      result = tu6_emit_descriptor_sets(cmd, pipeline,
-                                        VK_PIPELINE_BIND_POINT_GRAPHICS,
-                                        &cmd->state.desc_sets_ib, false);
-      if (result != VK_SUCCESS)
-         return result;
-
-      if (need_gmem_desc_set) {
-         cmd->state.desc_sets_sysmem_ib = cmd->state.desc_sets_ib;
-         cmd->state.desc_sets_ib.size = 0;
-
-         result = tu6_emit_descriptor_sets(cmd, pipeline,
-                                           VK_PIPELINE_BIND_POINT_GRAPHICS,
-                                           &cmd->state.desc_sets_gmem_ib, true);
-         if (result != VK_SUCCESS)
-            return result;
-      } else {
-         cmd->state.desc_sets_gmem_ib.size = 0;
-         cmd->state.desc_sets_sysmem_ib.size = 0;
-      }
-
       /* We need to reload the descriptors every time the descriptor sets
        * change. However, the commands we send only depend on the pipeline
        * because the whole point is to cache descriptors which are used by the
@@ -3274,8 +3206,6 @@ tu6_bind_draw_states(struct tu_cmd_buffer *cmd,
       tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_GS_CONST, cmd->state.shader_const_ib[MESA_SHADER_GEOMETRY]);
       tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_FS_CONST, cmd->state.shader_const_ib[MESA_SHADER_FRAGMENT]);
       tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_DESC_SETS, cmd->state.desc_sets_ib);
-      tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_DESC_SETS_GMEM, cmd->state.desc_sets_gmem_ib);
-      tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_DESC_SETS_SYSMEM, cmd->state.desc_sets_sysmem_ib);
       tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_DESC_SETS_LOAD, cmd->state.desc_sets_load_ib);
       tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_VB, cmd->state.vertex_buffers_ib);
       tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_VS_PARAMS, vs_params);
@@ -3293,7 +3223,7 @@ tu6_bind_draw_states(struct tu_cmd_buffer *cmd,
        */
      uint32_t draw_state_count =
         ((cmd->state.dirty & TU_CMD_DIRTY_SHADER_CONSTS) ? 3 : 0) +
-        ((cmd->state.dirty & TU_CMD_DIRTY_DESCRIPTOR_SETS) ? 4 : 0) +
+        ((cmd->state.dirty & TU_CMD_DIRTY_DESCRIPTOR_SETS) ? 1 : 0) +
         ((cmd->state.dirty & TU_CMD_DIRTY_VERTEX_BUFFERS) ? 1 : 0) +
         1; /* vs_params */
@@ -3304,12 +3234,8 @@ tu6_bind_draw_states(struct tu_cmd_buffer *cmd,
          tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_GS_CONST, cmd->state.shader_const_ib[MESA_SHADER_GEOMETRY]);
          tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_FS_CONST, cmd->state.shader_const_ib[MESA_SHADER_FRAGMENT]);
       }
-      if (cmd->state.dirty & TU_CMD_DIRTY_DESCRIPTOR_SETS) {
-         tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_DESC_SETS, cmd->state.desc_sets_ib);
-         tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_DESC_SETS_GMEM, cmd->state.desc_sets_gmem_ib);
-         tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_DESC_SETS_SYSMEM, cmd->state.desc_sets_sysmem_ib);
+      if (cmd->state.dirty & TU_CMD_DIRTY_DESCRIPTOR_SETS)
          tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_DESC_SETS_LOAD, cmd->state.desc_sets_load_ib);
-      }
       if (cmd->state.dirty & TU_CMD_DIRTY_VERTEX_BUFFERS)
          tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_VB, cmd->state.vertex_buffers_ib);
       tu_cs_emit_sds_ib(cs, TU_DRAW_STATE_VS_PARAMS, vs_params);
@@ -3641,7 +3567,6 @@ tu_dispatch(struct tu_cmd_buffer *cmd,
    struct tu_pipeline *pipeline = cmd->state.compute_pipeline;
    struct tu_descriptor_state *descriptors_state =
       &cmd->descriptors[VK_PIPELINE_BIND_POINT_COMPUTE];
-   VkResult result;
 
    /* TODO: We could probably flush less if we add a compute_flush_bits
     * bitfield.
@@ -3659,19 +3584,6 @@ tu_dispatch(struct tu_cmd_buffer *cmd,
 
    tu_emit_compute_driver_params(cs, pipeline, info);
 
-   if (cmd->state.dirty & TU_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS) {
-      result = tu6_emit_descriptor_sets(cmd, pipeline,
-                                        VK_PIPELINE_BIND_POINT_COMPUTE, &ib,
-                                        false);
-      if (result != VK_SUCCESS) {
-         cmd->record_result = result;
-         return;
-      }
-   }
-
-   if (ib.size)
-      tu_cs_emit_ib(cs, &ib);
-
    if ((cmd->state.dirty & TU_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS) &&
        pipeline->load_state.state_ib.size > 0) {
       tu_cs_emit_ib(cs, &pipeline->load_state.state_ib);
diff --git a/src/freedreno/vulkan/tu_descriptor_set.c b/src/freedreno/vulkan/tu_descriptor_set.c
index c0d349c12c7..314088cefd9 100644
--- a/src/freedreno/vulkan/tu_descriptor_set.c
+++ b/src/freedreno/vulkan/tu_descriptor_set.c
@@ -84,6 +84,7 @@ descriptor_size(VkDescriptorType type)
    case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
       /* These are remapped to the special driver-managed descriptor set,
        * hence they don't take up any space in the original descriptor set:
+       * input attachments don't use descriptor sets at all
        */
       return 0;
    case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
@@ -175,7 +176,6 @@ tu_CreateDescriptorSetLayout(
           size - sizeof(struct tu_descriptor_set_layout));
 
    uint32_t dynamic_offset_count = 0;
-   uint32_t input_attachment_count = 0;
    uint32_t buffer_count = 0;
 
    for (uint32_t j = 0; j < pCreateInfo->bindingCount; j++) {
@@ -187,7 +187,6 @@ tu_CreateDescriptorSetLayout(
       set_layout->binding[b].offset = set_layout->size;
       set_layout->binding[b].buffer_offset = buffer_count;
       set_layout->binding[b].dynamic_offset_offset = dynamic_offset_count;
-      set_layout->binding[b].input_attachment_offset = input_attachment_count;
       set_layout->binding[b].size = descriptor_size(binding->descriptorType);
       set_layout->binding[b].shader_stages = binding->stageFlags;
@@ -250,15 +249,13 @@ tu_CreateDescriptorSetLayout(
          dynamic_offset_count += binding->descriptorCount;
       }
 
-      if (binding->descriptorType == VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT)
-         input_attachment_count += binding->descriptorCount;
+
       set_layout->shader_stages |= binding->stageFlags;
    }
 
    free(bindings);
 
    set_layout->dynamic_offset_count = dynamic_offset_count;
-   set_layout->input_attachment_count = input_attachment_count;
    set_layout->buffer_count = buffer_count;
 
    *pSetLayout = tu_descriptor_set_layout_to_handle(set_layout);
@@ -364,10 +361,9 @@ tu_CreatePipelineLayout(VkDevice _device,
       return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
    layout->num_sets = pCreateInfo->setLayoutCount;
-   layout->input_attachment_count = 0;
    layout->dynamic_offset_count = 0;
 
-   unsigned dynamic_offset_count = 0, input_attachment_count = 0;
+   unsigned dynamic_offset_count = 0;
 
    _mesa_sha1_init(&ctx);
    for (uint32_t set = 0; set < pCreateInfo->setLayoutCount; set++) {
@@ -375,9 +371,7 @@ tu_CreatePipelineLayout(VkDevice _device,
                     pCreateInfo->pSetLayouts[set]);
       layout->set[set].layout = set_layout;
       layout->set[set].dynamic_offset_start = dynamic_offset_count;
-      layout->set[set].input_attachment_start = input_attachment_count;
       dynamic_offset_count += set_layout->dynamic_offset_count;
-      input_attachment_count += set_layout->input_attachment_count;
 
       for (uint32_t b = 0; b < set_layout->binding_count; b++) {
          if (set_layout->binding[b].immutable_samplers_offset)
@@ -392,7 +386,6 @@ tu_CreatePipelineLayout(VkDevice _device,
    }
 
    layout->dynamic_offset_count = dynamic_offset_count;
-   layout->input_attachment_count = input_attachment_count;
    layout->push_constant_size = 0;
 
    for (unsigned i = 0; i < pCreateInfo->pushConstantRangeCount; ++i) {
@@ -445,8 +438,7 @@ tu_descriptor_set_create(struct tu_device *device,
    unsigned dynamic_offset = sizeof(struct tu_descriptor_set) +
       sizeof(struct tu_bo *) * buffer_count;
    unsigned mem_size = dynamic_offset +
-      A6XX_TEX_CONST_DWORDS * 4 * (layout->dynamic_offset_count +
-                                   layout->input_attachment_count);;
+      A6XX_TEX_CONST_DWORDS * 4 * layout->dynamic_offset_count;
 
    if (pool->host_memory_base) {
       if (pool->host_memory_end - pool->host_memory_ptr < mem_size)
@@ -464,7 +456,7 @@ tu_descriptor_set_create(struct tu_device *device,
 
    memset(set, 0, mem_size);
 
-   if (layout->dynamic_offset_count + layout->input_attachment_count > 0) {
+   if (layout->dynamic_offset_count) {
       set->dynamic_descriptors = (uint32_t *)((uint8_t*)set + dynamic_offset);
    }
@@ -590,7 +582,6 @@ tu_CreateDescriptorPool(VkDevice _device,
       switch(pCreateInfo->pPoolSizes[i].type) {
       case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
       case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
-      case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
         dynamic_count += pCreateInfo->pPoolSizes[i].descriptorCount;
       default:
         break;
@@ -903,7 +894,7 @@ tu_update_descriptor_sets(struct tu_device *device,
         case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC: {
            assert(!(set->layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR));
            unsigned idx = writeset->dstArrayElement + j;
-           idx += set->layout->input_attachment_count + binding_layout->dynamic_offset_offset;
+           idx += binding_layout->dynamic_offset_offset;
            write_ubo_descriptor(device, cmd_buffer,
                                 set->dynamic_descriptors + A6XX_TEX_CONST_DWORDS * idx,
                                 buffer_list, writeset->pBufferInfo + j);
@@ -916,7 +907,7 @@ tu_update_descriptor_sets(struct tu_device *device,
         case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: {
            assert(!(set->layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR));
            unsigned idx = writeset->dstArrayElement + j;
-           idx += set->layout->input_attachment_count + binding_layout->dynamic_offset_offset;
+           idx += binding_layout->dynamic_offset_offset;
            write_buffer_descriptor(device, cmd_buffer,
                                    set->dynamic_descriptors + A6XX_TEX_CONST_DWORDS * idx,
                                    buffer_list, writeset->pBufferInfo + j);
@@ -937,15 +928,6 @@ tu_update_descriptor_sets(struct tu_device *device,
                                   writeset->descriptorType,
                                   writeset->pImageInfo + j);
            break;
-        case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: {
-           unsigned idx = writeset->dstArrayElement + j;
-           idx += binding_layout->input_attachment_offset;
-           write_image_descriptor(device, cmd_buffer,
-                                  set->dynamic_descriptors + A6XX_TEX_CONST_DWORDS * idx,
-                                  buffer_list, writeset->descriptorType,
-                                  writeset->pImageInfo + j);
-           break;
-        }
         case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
            write_combined_image_sampler_descriptor(device, cmd_buffer,
                                                    A6XX_TEX_CONST_DWORDS * 4,
@@ -957,6 +939,9 @@ tu_update_descriptor_sets(struct tu_device *device,
         case VK_DESCRIPTOR_TYPE_SAMPLER:
            write_sampler_descriptor(device, ptr, writeset->pImageInfo + j);
            break;
+        case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
+           /* nothing in descriptor set - framebuffer state is used instead */
+           break;
         default:
            unreachable("unimplemented descriptor type");
            break;
@@ -999,8 +984,6 @@ tu_update_descriptor_sets(struct tu_device *device,
         case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: {
            unsigned src_idx = copyset->srcArrayElement + j;
            unsigned dst_idx = copyset->dstArrayElement + j;
-           src_idx += src_set->layout->input_attachment_count;
-           dst_idx += dst_set->layout->input_attachment_count;
            src_idx += src_binding_layout->dynamic_offset_offset;
            dst_idx += dst_binding_layout->dynamic_offset_offset;
 
            uint32_t *src_dynamic, *dst_dynamic;
            src_dynamic = src_set->dynamic_descriptors + src_idx * A6XX_TEX_CONST_DWORDS;
            dst_dynamic = dst_set->dynamic_descriptors + dst_idx * A6XX_TEX_CONST_DWORDS;
            memcpy(dst_dynamic, src_dynamic, A6XX_TEX_CONST_DWORDS * 4);
            break;
         }
-        case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: {
-           unsigned src_idx = copyset->srcArrayElement + j;
-           unsigned dst_idx = copyset->dstArrayElement + j;
-           src_idx += src_binding_layout->input_attachment_offset;
-           dst_idx += dst_binding_layout->input_attachment_offset;
-
-           uint32_t *src_dynamic, *dst_dynamic;
-           src_dynamic = src_set->dynamic_descriptors + src_idx * A6XX_TEX_CONST_DWORDS;
-           dst_dynamic = dst_set->dynamic_descriptors + dst_idx * A6XX_TEX_CONST_DWORDS;
-           memcpy(dst_dynamic, src_dynamic, A6XX_TEX_CONST_DWORDS * 4);
-           break;
-        }
         default:
            memcpy(dst_ptr, src_ptr, src_binding_layout->size);
        }
@@ -1099,13 +1070,7 @@ tu_CreateDescriptorUpdateTemplate(
      switch (entry->descriptorType) {
      case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
      case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
-        dst_offset = (set_layout->input_attachment_count +
-                      binding_layout->dynamic_offset_offset +
-                      entry->dstArrayElement) * A6XX_TEX_CONST_DWORDS;
-        dst_stride = A6XX_TEX_CONST_DWORDS;
-        break;
-     case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
-        dst_offset = (binding_layout->input_attachment_offset +
+        dst_offset = (binding_layout->dynamic_offset_offset +
                       entry->dstArrayElement) * A6XX_TEX_CONST_DWORDS;
         dst_stride = A6XX_TEX_CONST_DWORDS;
         break;
@@ -1197,16 +1162,11 @@ tu_update_descriptor_set_with_template(
            break;
         case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
         case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
+        case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: {
            write_image_descriptor(device, cmd_buffer, ptr, buffer_list,
                                   templ->entry[i].descriptor_type, src);
            break;
-        case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: {
-           write_image_descriptor(device, cmd_buffer,
-                                  set->dynamic_descriptors + dst_offset,
-                                  buffer_list, templ->entry[i].descriptor_type,
-                                  src);
-           break;
        }
        case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
           write_combined_image_sampler_descriptor(device, cmd_buffer,
diff --git a/src/freedreno/vulkan/tu_descriptor_set.h b/src/freedreno/vulkan/tu_descriptor_set.h
index 22b5eb03970..8673729f0d8 100644
--- a/src/freedreno/vulkan/tu_descriptor_set.h
+++ b/src/freedreno/vulkan/tu_descriptor_set.h
@@ -54,9 +54,6 @@ struct tu_descriptor_set_binding_layout
     */
    uint32_t dynamic_offset_offset;
 
-   /* Index into the array of dynamic input attachment descriptors */
-   uint32_t input_attachment_offset;
-
    /* Offset in the tu_descriptor_set_layout of the immutable samplers, or 0
    * if there are no immutable samplers. */
    uint32_t immutable_samplers_offset;
@@ -86,9 +83,6 @@ struct tu_descriptor_set_layout
   /* Number of dynamic offsets used by this descriptor set */
   uint16_t dynamic_offset_count;
 
-  /* Number of input attachments used by the descriptor set */
-  uint16_t input_attachment_count;
-
   /* A bitfield of which dynamic buffers are ubo's, to make the
    * descriptor-binding-time patching easier.
    */
@@ -110,13 +104,11 @@ struct tu_pipeline_layout
      struct tu_descriptor_set_layout *layout;
      uint32_t size;
      uint32_t dynamic_offset_start;
-     uint32_t input_attachment_start;
   } set[MAX_SETS];
 
   uint32_t num_sets;
   uint32_t push_constant_size;
   uint32_t dynamic_offset_count;
-  uint32_t input_attachment_count;
 
   unsigned char sha1[20];
 };
diff --git a/src/freedreno/vulkan/tu_pipeline.c b/src/freedreno/vulkan/tu_pipeline.c
index e22b301099b..e556596c854 100644
--- a/src/freedreno/vulkan/tu_pipeline.c
+++ b/src/freedreno/vulkan/tu_pipeline.c
@@ -183,8 +183,7 @@ tu6_emit_load_state(struct tu_pipeline *pipeline, bool compute)
         switch (binding->type) {
         case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
            base = MAX_SETS;
-           offset = (layout->input_attachment_count +
-                     layout->set[i].dynamic_offset_start +
+           offset = (layout->set[i].dynamic_offset_start +
                      binding->dynamic_offset_offset) * A6XX_TEX_CONST_DWORDS;
            /* fallthrough */
         case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
@@ -201,9 +200,8 @@ tu6_emit_load_state(struct tu_pipeline *pipeline, bool compute)
            }
            break;
         case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
-           base = MAX_SETS;
-           offset = (layout->set[i].input_attachment_start +
-                     binding->input_attachment_offset) * A6XX_TEX_CONST_DWORDS;
+           /* nothing - input attachment doesn't use bindless */
+           break;
         case VK_DESCRIPTOR_TYPE_SAMPLER:
         case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
         case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: {
@@ -217,8 +215,7 @@ tu6_emit_load_state(struct tu_pipeline *pipeline, bool compute)
         }
         case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
            base = MAX_SETS;
-           offset = (layout->input_attachment_count +
-                     layout->set[i].dynamic_offset_start +
+           offset = (layout->set[i].dynamic_offset_start +
                      binding->dynamic_offset_offset) * A6XX_TEX_CONST_DWORDS;
            /* fallthrough */
         case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: {
@@ -2055,12 +2052,6 @@ tu_pipeline_builder_parse_shader_stages(struct tu_pipeline_builder *builder,
      desc_sets |= builder->shaders[i]->active_desc_sets;
   }
   pipeline->active_desc_sets = desc_sets;
-
-  if (builder->shaders[MESA_SHADER_FRAGMENT]) {
-     memcpy(pipeline->program.input_attachment_idx,
-            builder->shaders[MESA_SHADER_FRAGMENT]->attachment_idx,
-            sizeof(pipeline->program.input_attachment_idx));
-  }
 }
 
 static void
diff --git a/src/freedreno/vulkan/tu_private.h b/src/freedreno/vulkan/tu_private.h
index ddb25677b48..213e4bd4843 100644
--- a/src/freedreno/vulkan/tu_private.h
+++ b/src/freedreno/vulkan/tu_private.h
@@ -436,8 +436,6 @@ enum tu_draw_state_group_id
   TU_DRAW_STATE_GS_CONST,
   TU_DRAW_STATE_FS_CONST,
   TU_DRAW_STATE_DESC_SETS,
-  TU_DRAW_STATE_DESC_SETS_GMEM,
-  TU_DRAW_STATE_DESC_SETS_SYSMEM,
   TU_DRAW_STATE_DESC_SETS_LOAD,
   TU_DRAW_STATE_VS_PARAMS,
@@ -630,11 +628,7 @@ tu_get_perftest_option_name(int id);
 struct tu_descriptor_state
 {
    struct tu_descriptor_set *sets[MAX_SETS];
-   uint32_t valid;
-   struct tu_push_descriptor_set push_set;
-   bool push_dirty;
    uint32_t dynamic_descriptors[MAX_DYNAMIC_BUFFERS * A6XX_TEX_CONST_DWORDS];
-   uint32_t input_attachments[MAX_RTS * A6XX_TEX_CONST_DWORDS];
 };
 
 struct tu_tile
@@ -821,7 +815,7 @@ struct tu_cmd_state
    struct tu_draw_state dynamic_state[TU_DYNAMIC_STATE_COUNT];
    struct tu_cs_entry vertex_buffers_ib;
    struct tu_cs_entry shader_const_ib[MESA_SHADER_STAGES];
-   struct tu_cs_entry desc_sets_ib, desc_sets_gmem_ib, desc_sets_sysmem_ib, desc_sets_load_ib;
+   struct tu_cs_entry desc_sets_ib, desc_sets_load_ib;
 
    /* Stream output buffers */
    struct
@@ -1055,7 +1049,6 @@ struct tu_shader
    struct ir3_shader *ir3_shader;
 
    struct tu_push_constant_range push_consts;
-   unsigned attachment_idx[MAX_RTS];
    uint8_t active_desc_sets;
 };
 
@@ -1109,7 +1102,6 @@ struct tu_pipeline
       struct tu_cs_entry binning_state_ib;
 
       struct tu_program_descriptor_linkage link[MESA_SHADER_STAGES];
-      unsigned input_attachment_idx[MAX_RTS];
    } program;
 
    struct
diff --git a/src/freedreno/vulkan/tu_shader.c b/src/freedreno/vulkan/tu_shader.c
index 54ea0b86d21..1a0772726f5 100644
--- a/src/freedreno/vulkan/tu_shader.c
+++ b/src/freedreno/vulkan/tu_shader.c
@@ -141,8 +141,7 @@ lower_vulkan_resource_index(nir_builder *b, nir_intrinsic_instr *instr,
    case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
    case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
       base = layout->set[set].dynamic_offset_start +
-         binding_layout->dynamic_offset_offset +
-         layout->input_attachment_count;
+         binding_layout->dynamic_offset_offset;
       set = MAX_SETS;
       break;
    default:
+ */ + if (is_sampler && bind_layout->type == + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) { + offset = 1; } + desc_offset = + nir_imm_int(b, (bind_layout->offset / (4 * A6XX_TEX_CONST_DWORDS)) + + offset); + descriptor_stride = bind_layout->size / (4 * A6XX_TEX_CONST_DWORDS); if (deref->deref_type != nir_deref_type_var) { assert(deref->deref_type == nir_deref_type_array); @@ -356,6 +366,10 @@ lower_tex(nir_builder *b, nir_tex_instr *tex, nir_instr_rewrite_src(&tex->instr, &tex->src[tex_src_idx].src, nir_src_for_ssa(bindless)); tex->src[tex_src_idx].src_type = nir_tex_src_texture_handle; + + /* for the input attachment case: */ + if (bindless->parent_instr->type != nir_instr_type_intrinsic) + tex->src[tex_src_idx].src_type = nir_tex_src_texture_offset; } return true; @@ -435,38 +449,6 @@ gather_push_constants(nir_shader *shader, struct tu_shader *tu_shader) align(max, 16) / 16 - tu_shader->push_consts.lo; } -/* Gather the InputAttachmentIndex for each input attachment from the NIR - * shader and organize the info in a way so that draw-time patching is easy. - */ -static void -gather_input_attachments(nir_shader *shader, struct tu_shader *tu_shader, - const struct tu_pipeline_layout *layout) -{ - nir_foreach_variable(var, &shader->uniforms) { - const struct glsl_type *glsl_type = glsl_without_array(var->type); - - if (!glsl_type_is_image(glsl_type)) - continue; - - enum glsl_sampler_dim dim = glsl_get_sampler_dim(glsl_type); - - const uint32_t set = var->data.descriptor_set; - const uint32_t binding = var->data.binding; - const struct tu_descriptor_set_binding_layout *bind_layout = - &layout->set[set].layout->binding[binding]; - const uint32_t array_size = bind_layout->array_size; - - if (dim == GLSL_SAMPLER_DIM_SUBPASS || - dim == GLSL_SAMPLER_DIM_SUBPASS_MS) { - unsigned offset = - layout->set[set].input_attachment_start + - bind_layout->input_attachment_offset; - for (unsigned i = 0; i < array_size; i++) - tu_shader->attachment_idx[offset + i] = var->data.index + i; - } - } -} - static bool tu_lower_io(nir_shader *shader, struct tu_shader *tu_shader, const struct tu_pipeline_layout *layout) @@ -474,7 +456,6 @@ tu_lower_io(nir_shader *shader, struct tu_shader *tu_shader, bool progress = false; gather_push_constants(shader, tu_shader); - gather_input_attachments(shader, tu_shader, layout); nir_foreach_function(function, shader) { if (function->impl) -- 2.30.2