From 9ff1959ca5d24afe48bec20edf7e3d059d254134 Mon Sep 17 00:00:00 2001 From: Hyunjun Ko Date: Tue, 25 Feb 2020 10:08:25 +0900 Subject: [PATCH] turnip: Implement stream-out emit and vkApis for transform feedback 1. Implement vkCmdBindTransformFeedbackBuffersEXT, vkCmdBeginTransformFeedbackEXT and vkCmdEndTransformFeedbackEXT. - Counter buffers are not handled yet. 2. Implement the streamout emit function, mostly taken from fd6_emit.c v2. Replace emit_pkt4 funcs with emit_regs. v3. Don't copy the state of stream-output from tu_pipeline. v4. Set VPC_SO_CNTL/VPC_SO_BUF_CNTL to zero in tu6_init_hw. Signed-off-by: Hyunjun Ko Reviewed-by: Jonathan Marek Part-of: --- src/freedreno/vulkan/tu_cmd_buffer.c | 179 +++++++++++++++++++++----- src/freedreno/vulkan/tu_extensions.py | 1 + src/freedreno/vulkan/tu_private.h | 1 + 3 files changed, 148 insertions(+), 33 deletions(-) diff --git a/src/freedreno/vulkan/tu_cmd_buffer.c b/src/freedreno/vulkan/tu_cmd_buffer.c index 74997b31933..4367f8e4dfb 100644 --- a/src/freedreno/vulkan/tu_cmd_buffer.c +++ b/src/freedreno/vulkan/tu_cmd_buffer.c @@ -798,7 +798,7 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd, tu6_emit_window_offset(cmd, cs, x1, y1); tu_cs_emit_regs(cs, - A6XX_VPC_SO_OVERRIDE(.so_disable = true)); + A6XX_VPC_SO_OVERRIDE(.so_disable = false)); if (use_hw_binning(cmd)) { tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0); @@ -1144,38 +1144,12 @@ tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs) tu_cs_emit(cs, CP_SET_DRAW_STATE__1_ADDR_LO(0)); tu_cs_emit(cs, CP_SET_DRAW_STATE__2_ADDR_HI(0)); - tu_cs_emit_regs(cs, - A6XX_VPC_SO_BUFFER_BASE(0), - A6XX_VPC_SO_BUFFER_SIZE(0)); - - tu_cs_emit_regs(cs, - A6XX_VPC_SO_FLUSH_BASE(0)); - - tu_cs_emit_regs(cs, - A6XX_VPC_SO_BUF_CNTL(0)); - - tu_cs_emit_regs(cs, - A6XX_VPC_SO_BUFFER_OFFSET(0, 0)); - - tu_cs_emit_regs(cs, - A6XX_VPC_SO_BUFFER_BASE(1, 0), - A6XX_VPC_SO_BUFFER_SIZE(1, 0)); - - tu_cs_emit_regs(cs, - A6XX_VPC_SO_BUFFER_OFFSET(1, 0), - A6XX_VPC_SO_FLUSH_BASE(1, 0), - 
A6XX_VPC_SO_BUFFER_BASE(2, 0), - A6XX_VPC_SO_BUFFER_SIZE(2, 0)); - - tu_cs_emit_regs(cs, - A6XX_VPC_SO_BUFFER_OFFSET(2, 0), - A6XX_VPC_SO_FLUSH_BASE(2, 0), - A6XX_VPC_SO_BUFFER_BASE(3, 0), - A6XX_VPC_SO_BUFFER_SIZE(3, 0)); - - tu_cs_emit_regs(cs, - A6XX_VPC_SO_BUFFER_OFFSET(3, 0), - A6XX_VPC_SO_FLUSH_BASE(3, 0)); + /* Disable streamout by default. */ + tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, 4); + tu_cs_emit(cs, REG_A6XX_VPC_SO_CNTL); + tu_cs_emit(cs, 0); + tu_cs_emit(cs, REG_A6XX_VPC_SO_BUF_CNTL); + tu_cs_emit(cs, 0); tu_cs_emit_regs(cs, A6XX_SP_HS_CTRL_REG0(0)); @@ -1577,6 +1551,9 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs) const struct tu_tiling_config *tiling = &cmd->state.tiling_config; if (use_hw_binning(cmd)) { + /* enable stream-out during binning pass: */ + tu_cs_emit_regs(cs, A6XX_VPC_SO_OVERRIDE(.so_disable=false)); + tu6_emit_bin_size(cs, tiling->tile0.extent.width, tiling->tile0.extent.height, @@ -1586,6 +1563,9 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs) tu6_emit_binning_pass(cmd, cs); + /* and disable stream-out for draw pass: */ + tu_cs_emit_regs(cs, A6XX_VPC_SO_OVERRIDE(.so_disable=true)); + tu6_emit_bin_size(cs, tiling->tile0.extent.width, tiling->tile0.extent.height, @@ -1601,6 +1581,9 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs) tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1); tu_cs_emit(cs, 0x1); } else { + /* no binning pass, so enable stream-out for draw pass: */ + tu_cs_emit_regs(cs, A6XX_VPC_SO_OVERRIDE(.so_disable=false)); + tu6_emit_bin_size(cs, tiling->tile0.extent.width, tiling->tile0.extent.height, @@ -2173,6 +2156,56 @@ tu_CmdBindDescriptorSets(VkCommandBuffer commandBuffer, cmd_buffer->state.dirty |= TU_CMD_DIRTY_DESCRIPTOR_SETS; } +void tu_CmdBindTransformFeedbackBuffersEXT(VkCommandBuffer commandBuffer, + uint32_t firstBinding, + uint32_t bindingCount, + const VkBuffer *pBuffers, + const VkDeviceSize *pOffsets, + const VkDeviceSize *pSizes) +{ 
+ TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); + assert(firstBinding + bindingCount <= IR3_MAX_SO_BUFFERS); + + for (uint32_t i = 0; i < bindingCount; i++) { + uint32_t idx = firstBinding + i; + TU_FROM_HANDLE(tu_buffer, buf, pBuffers[i]); + + if (pOffsets[i] != 0) + cmd->state.streamout_reset |= 1 << idx; + + cmd->state.streamout_buf.buffers[idx] = buf; + cmd->state.streamout_buf.offsets[idx] = pOffsets[i]; + cmd->state.streamout_buf.sizes[idx] = pSizes[i]; + + cmd->state.streamout_enabled |= 1 << idx; + } + + cmd->state.dirty |= TU_CMD_DIRTY_STREAMOUT_BUFFERS; +} + +void tu_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer, + uint32_t firstCounterBuffer, + uint32_t counterBufferCount, + const VkBuffer *pCounterBuffers, + const VkDeviceSize *pCounterBufferOffsets) +{ + assert(firstCounterBuffer + counterBufferCount <= IR3_MAX_SO_BUFFERS); + /* TODO do something with counter buffer? */ +} + +void tu_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer, + uint32_t firstCounterBuffer, + uint32_t counterBufferCount, + const VkBuffer *pCounterBuffers, + const VkDeviceSize *pCounterBufferOffsets) +{ + assert(firstCounterBuffer + counterBufferCount <= IR3_MAX_SO_BUFFERS); + /* TODO do something with counter buffer? 
*/ + + TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer); + cmd->state.streamout_enabled = 0; +} + void tu_CmdPushConstants(VkCommandBuffer commandBuffer, VkPipelineLayout layout, @@ -3374,6 +3407,67 @@ tu6_emit_border_color(struct tu_cmd_buffer *cmd, return VK_SUCCESS; } +static void +tu6_emit_streamout(struct tu_cmd_buffer *cmd, struct tu_cs *cs) +{ + struct tu_streamout_state *tf = &cmd->state.pipeline->streamout; + + for (unsigned i = 0; i < IR3_MAX_SO_BUFFERS; i++) { + struct tu_buffer *buf = cmd->state.streamout_buf.buffers[i]; + if (!buf) + continue; + + uint32_t offset; + offset = cmd->state.streamout_buf.offsets[i]; + + tu_cs_emit_regs(cs, A6XX_VPC_SO_BUFFER_BASE(i, .bo = buf->bo, + .bo_offset = buf->bo_offset)); + tu_cs_emit_regs(cs, A6XX_VPC_SO_BUFFER_SIZE(i, buf->size)); + + if (cmd->state.streamout_reset & (1 << i)) { + offset *= tf->stride[i]; + + tu_cs_emit_regs(cs, A6XX_VPC_SO_BUFFER_OFFSET(i, offset)); + cmd->state.streamout_reset &= ~(1 << i); + } else { + tu_cs_emit_pkt7(cs, CP_MEM_TO_REG, 3); + tu_cs_emit(cs, CP_MEM_TO_REG_0_REG(REG_A6XX_VPC_SO_BUFFER_OFFSET(i)) | + CP_MEM_TO_REG_0_SHIFT_BY_2 | CP_MEM_TO_REG_0_UNK31 | + CP_MEM_TO_REG_0_CNT(0)); + tu_cs_emit_qw(cs, cmd->scratch_bo.iova + VSC_FLUSH * (i + 1)); + } + + tu_cs_emit_regs(cs, A6XX_VPC_SO_FLUSH_BASE(i, .bo = &cmd->scratch_bo, + .bo_offset = VSC_FLUSH * (i + 1))); + } + + if (cmd->state.streamout_enabled) { + tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, 12 + (2 * tf->prog_count)); + tu_cs_emit(cs, REG_A6XX_VPC_SO_BUF_CNTL); + tu_cs_emit(cs, tf->vpc_so_buf_cntl); + tu_cs_emit(cs, REG_A6XX_VPC_SO_NCOMP(0)); + tu_cs_emit(cs, tf->ncomp[0]); + tu_cs_emit(cs, REG_A6XX_VPC_SO_NCOMP(1)); + tu_cs_emit(cs, tf->ncomp[1]); + tu_cs_emit(cs, REG_A6XX_VPC_SO_NCOMP(2)); + tu_cs_emit(cs, tf->ncomp[2]); + tu_cs_emit(cs, REG_A6XX_VPC_SO_NCOMP(3)); + tu_cs_emit(cs, tf->ncomp[3]); + tu_cs_emit(cs, REG_A6XX_VPC_SO_CNTL); + tu_cs_emit(cs, A6XX_VPC_SO_CNTL_ENABLE); + for (unsigned i = 0; i < tf->prog_count; i++) { 
+ tu_cs_emit(cs, REG_A6XX_VPC_SO_PROG); + tu_cs_emit(cs, tf->prog[i]); + } + } else { + tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, 4); + tu_cs_emit(cs, REG_A6XX_VPC_SO_CNTL); + tu_cs_emit(cs, 0); + tu_cs_emit(cs, REG_A6XX_VPC_SO_BUF_CNTL); + tu_cs_emit(cs, 0); + } +} + static VkResult tu6_bind_draw_states(struct tu_cmd_buffer *cmd, struct tu_cs *cs, @@ -3505,6 +3599,9 @@ tu6_bind_draw_states(struct tu_cmd_buffer *cmd, }; } + if (cmd->state.dirty & TU_CMD_DIRTY_STREAMOUT_BUFFERS) + tu6_emit_streamout(cmd, cs); + if (cmd->state.dirty & (TU_CMD_DIRTY_PIPELINE | TU_CMD_DIRTY_DESCRIPTOR_SETS)) { bool needs_border = false; @@ -3623,6 +3720,15 @@ tu6_bind_draw_states(struct tu_cmd_buffer *cmd, } } } + if (cmd->state.dirty & TU_CMD_DIRTY_STREAMOUT_BUFFERS) { + for (unsigned i = 0; i < IR3_MAX_SO_BUFFERS; i++) { + const struct tu_buffer *buf = cmd->state.streamout_buf.buffers[i]; + if (buf) { + tu_bo_list_add(&cmd->bo_list, buf->bo, + MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE); + } + } + } /* Fragment shader state overwrites compute shader state, so flag the * compute pipeline for re-emit. 
@@ -3742,6 +3848,13 @@ tu_draw(struct tu_cmd_buffer *cmd, const struct tu_draw_info *draw) else tu6_emit_draw_direct(cmd, cs, draw); + if (cmd->state.streamout_enabled) { + for (unsigned i = 0; i < IR3_MAX_SO_BUFFERS; i++) { + if (cmd->state.streamout_enabled & (1 << i)) + tu6_emit_event_write(cmd, cs, FLUSH_SO_0 + i, false); + } + } + cmd->wait_for_idle = true; tu_cs_sanity_check(cs); diff --git a/src/freedreno/vulkan/tu_extensions.py b/src/freedreno/vulkan/tu_extensions.py index 3e1a7cf6625..efeb8dc0c89 100644 --- a/src/freedreno/vulkan/tu_extensions.py +++ b/src/freedreno/vulkan/tu_extensions.py @@ -77,6 +77,7 @@ EXTENSIONS = [ Extension('VK_KHR_external_memory_fd', 1, True), Extension('VK_EXT_external_memory_dma_buf', 1, True), Extension('VK_EXT_image_drm_format_modifier', 1, False), + Extension('VK_EXT_transform_feedback', 1, False), ] class VkVersion: diff --git a/src/freedreno/vulkan/tu_private.h b/src/freedreno/vulkan/tu_private.h index d04fe99aad1..661529fa537 100644 --- a/src/freedreno/vulkan/tu_private.h +++ b/src/freedreno/vulkan/tu_private.h @@ -984,6 +984,7 @@ struct tu_cmd_buffer uint32_t scratch_seqno; #define VSC_OVERFLOW 0x8 #define VSC_SCRATCH 0x10 +#define VSC_FLUSH 0x20 struct tu_bo vsc_data; struct tu_bo vsc_data2; -- 2.30.2