turnip: Implement stream-out emit and Vulkan APIs for transform feedback
author     Hyunjun Ko <zzoon@igalia.com>
Tue, 25 Feb 2020 01:08:25 +0000 (10:08 +0900)
committer  Marge Bot <eric+marge@anholt.net>
Thu, 12 Mar 2020 03:10:16 +0000 (03:10 +0000)
1. Implement vkCmdBindTransformFeedbackBuffersEXT,
vkCmdBeginTransformFeedbackEXT and vkCmdEndTransformFeedbackEXT
(see the usage sketch below).
 - Counter buffers are not handled yet.
2. Implement the streamout emit function, mostly adapted from fd6_emit.c.
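
For reference, a minimal application-side sketch of how the new entry
points are driven. It is illustrative only and not part of this patch:
the record_xfb helper, the single-binding setup and the
vkGetDeviceProcAddr lookups are assumptions; only the three
vkCmd*TransformFeedback*EXT calls come from this change, and the command
buffer is assumed to already be inside a render pass.

   #include <vulkan/vulkan.h>

   static void
   record_xfb(VkDevice dev, VkCommandBuffer cmd,
              VkBuffer xfb_buf, VkDeviceSize size)
   {
      /* VK_EXT_transform_feedback commands are fetched through
       * vkGetDeviceProcAddr. */
      PFN_vkCmdBindTransformFeedbackBuffersEXT bind_xfb =
         (PFN_vkCmdBindTransformFeedbackBuffersEXT)
            vkGetDeviceProcAddr(dev, "vkCmdBindTransformFeedbackBuffersEXT");
      PFN_vkCmdBeginTransformFeedbackEXT begin_xfb =
         (PFN_vkCmdBeginTransformFeedbackEXT)
            vkGetDeviceProcAddr(dev, "vkCmdBeginTransformFeedbackEXT");
      PFN_vkCmdEndTransformFeedbackEXT end_xfb =
         (PFN_vkCmdEndTransformFeedbackEXT)
            vkGetDeviceProcAddr(dev, "vkCmdEndTransformFeedbackEXT");

      /* Bind one stream-out buffer at binding 0; xfb_buf must have been
       * created with VK_BUFFER_USAGE_TRANSFORM_FEEDBACK_BUFFER_BIT_EXT. */
      const VkDeviceSize offset = 0;
      bind_xfb(cmd, 0, 1, &xfb_buf, &offset, &size);

      /* No counter buffers are passed since they are not handled yet. */
      begin_xfb(cmd, 0, 0, NULL, NULL);

      vkCmdDraw(cmd, 3, 1, 0, 0);

      end_xfb(cmd, 0, 0, NULL, NULL);
   }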

v2. Replace emit_pkt4 functions with emit_regs.
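
A rough sketch of the two emit styles, assuming an existing command
stream (cs); the pkt4 form shown here is an assumption based on how
tu_cs_emit_pkt4 is used elsewhere in turnip, while the emit_regs form
matches what this patch emits:

   /* v1 style: raw PKT4 write of a single register */
   tu_cs_emit_pkt4(cs, REG_A6XX_VPC_SO_BUF_CNTL, 1);
   tu_cs_emit(cs, 0);

   /* v2 style: generated register-pack helper */
   tu_cs_emit_regs(cs, A6XX_VPC_SO_BUF_CNTL(0));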

v3. Don't copy the state of stream-output from tu_pipeline.

v4. Set VPC_SO_CNTL/VPC_SO_BUF_CNTL to zero in tu6_init_hw.

Signed-off-by: Hyunjun Ko <zzoon@igalia.com>
Reviewed-by: Jonathan Marek <jonathan@marek.ca>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/3942>

src/freedreno/vulkan/tu_cmd_buffer.c
src/freedreno/vulkan/tu_extensions.py
src/freedreno/vulkan/tu_private.h

src/freedreno/vulkan/tu_cmd_buffer.c
index 74997b319335d38ce0f4629a42910729311c9f7e..4367f8e4dfb7af1a907731eb90eb3742f758a689 100644
@@ -798,7 +798,7 @@ tu6_emit_tile_select(struct tu_cmd_buffer *cmd,
    tu6_emit_window_offset(cmd, cs, x1, y1);
 
    tu_cs_emit_regs(cs,
-                   A6XX_VPC_SO_OVERRIDE(.so_disable = true));
+                   A6XX_VPC_SO_OVERRIDE(.so_disable = false));
 
    if (use_hw_binning(cmd)) {
       tu_cs_emit_pkt7(cs, CP_WAIT_FOR_ME, 0);
@@ -1144,38 +1144,12 @@ tu6_init_hw(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
    tu_cs_emit(cs, CP_SET_DRAW_STATE__1_ADDR_LO(0));
    tu_cs_emit(cs, CP_SET_DRAW_STATE__2_ADDR_HI(0));
 
-   tu_cs_emit_regs(cs,
-                   A6XX_VPC_SO_BUFFER_BASE(0),
-                   A6XX_VPC_SO_BUFFER_SIZE(0));
-
-   tu_cs_emit_regs(cs,
-                   A6XX_VPC_SO_FLUSH_BASE(0));
-
-   tu_cs_emit_regs(cs,
-                   A6XX_VPC_SO_BUF_CNTL(0));
-
-   tu_cs_emit_regs(cs,
-                   A6XX_VPC_SO_BUFFER_OFFSET(0, 0));
-
-   tu_cs_emit_regs(cs,
-                   A6XX_VPC_SO_BUFFER_BASE(1, 0),
-                   A6XX_VPC_SO_BUFFER_SIZE(1, 0));
-
-   tu_cs_emit_regs(cs,
-                   A6XX_VPC_SO_BUFFER_OFFSET(1, 0),
-                   A6XX_VPC_SO_FLUSH_BASE(1, 0),
-                   A6XX_VPC_SO_BUFFER_BASE(2, 0),
-                   A6XX_VPC_SO_BUFFER_SIZE(2, 0));
-
-   tu_cs_emit_regs(cs,
-                   A6XX_VPC_SO_BUFFER_OFFSET(2, 0),
-                   A6XX_VPC_SO_FLUSH_BASE(2, 0),
-                   A6XX_VPC_SO_BUFFER_BASE(3, 0),
-                   A6XX_VPC_SO_BUFFER_SIZE(3, 0));
-
-   tu_cs_emit_regs(cs,
-                   A6XX_VPC_SO_BUFFER_OFFSET(3, 0),
-                   A6XX_VPC_SO_FLUSH_BASE(3, 0));
+   /* Disable streamout by default: */
+   tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, 4);
+   tu_cs_emit(cs, REG_A6XX_VPC_SO_CNTL);
+   tu_cs_emit(cs, 0);
+   tu_cs_emit(cs, REG_A6XX_VPC_SO_BUF_CNTL);
+   tu_cs_emit(cs, 0);
 
    tu_cs_emit_regs(cs,
                    A6XX_SP_HS_CTRL_REG0(0));
@@ -1577,6 +1551,9 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
 
    const struct tu_tiling_config *tiling = &cmd->state.tiling_config;
    if (use_hw_binning(cmd)) {
+      /* enable stream-out during binning pass: */
+      tu_cs_emit_regs(cs, A6XX_VPC_SO_OVERRIDE(.so_disable = false));
+
       tu6_emit_bin_size(cs,
                         tiling->tile0.extent.width,
                         tiling->tile0.extent.height,
@@ -1586,6 +1563,9 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
 
       tu6_emit_binning_pass(cmd, cs);
 
+      /* and disable stream-out for draw pass: */
+      tu_cs_emit_regs(cs, A6XX_VPC_SO_OVERRIDE(.so_disable = true));
+
       tu6_emit_bin_size(cs,
                         tiling->tile0.extent.width,
                         tiling->tile0.extent.height,
@@ -1601,6 +1581,9 @@ tu6_tile_render_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
       tu_cs_emit_pkt7(cs, CP_SKIP_IB2_ENABLE_GLOBAL, 1);
       tu_cs_emit(cs, 0x1);
    } else {
+      /* no binning pass, so enable stream-out for draw pass: */
+      tu_cs_emit_regs(cs, A6XX_VPC_SO_OVERRIDE(.so_disable = false));
+
       tu6_emit_bin_size(cs,
                         tiling->tile0.extent.width,
                         tiling->tile0.extent.height,
@@ -2173,6 +2156,56 @@ tu_CmdBindDescriptorSets(VkCommandBuffer commandBuffer,
    cmd_buffer->state.dirty |= TU_CMD_DIRTY_DESCRIPTOR_SETS;
 }
 
+void tu_CmdBindTransformFeedbackBuffersEXT(VkCommandBuffer commandBuffer,
+                                           uint32_t firstBinding,
+                                           uint32_t bindingCount,
+                                           const VkBuffer *pBuffers,
+                                           const VkDeviceSize *pOffsets,
+                                           const VkDeviceSize *pSizes)
+{
+   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
+   assert(firstBinding + bindingCount <= IR3_MAX_SO_BUFFERS);
+
+   for (uint32_t i = 0; i < bindingCount; i++) {
+      uint32_t idx = firstBinding + i;
+      TU_FROM_HANDLE(tu_buffer, buf, pBuffers[i]);
+
+      if (pOffsets[i] != 0)
+         cmd->state.streamout_reset |= 1 << idx;
+
+      cmd->state.streamout_buf.buffers[idx] = buf;
+      cmd->state.streamout_buf.offsets[idx] = pOffsets[i];
+      cmd->state.streamout_buf.sizes[idx] = pSizes[i];
+
+      cmd->state.streamout_enabled |= 1 << idx;
+   }
+
+   cmd->state.dirty |= TU_CMD_DIRTY_STREAMOUT_BUFFERS;
+}
+
+void tu_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer,
+                                       uint32_t firstCounterBuffer,
+                                       uint32_t counterBufferCount,
+                                       const VkBuffer *pCounterBuffers,
+                                       const VkDeviceSize *pCounterBufferOffsets)
+{
+   assert(firstCounterBuffer + counterBufferCount <= IR3_MAX_SO_BUFFERS);
+   /* TODO do something with counter buffer? */
+}
+
+void tu_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer,
+                                       uint32_t firstCounterBuffer,
+                                       uint32_t counterBufferCount,
+                                       const VkBuffer *pCounterBuffers,
+                                       const VkDeviceSize *pCounterBufferOffsets)
+{
+   assert(firstCounterBuffer + counterBufferCount <= IR3_MAX_SO_BUFFERS);
+   /* TODO do something with counter buffer? */
+
+   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
+   cmd->state.streamout_enabled = 0;
+}
+
 void
 tu_CmdPushConstants(VkCommandBuffer commandBuffer,
                     VkPipelineLayout layout,
@@ -3374,6 +3407,67 @@ tu6_emit_border_color(struct tu_cmd_buffer *cmd,
    return VK_SUCCESS;
 }
 
+static void
+tu6_emit_streamout(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
+{
+   struct tu_streamout_state *tf = &cmd->state.pipeline->streamout;
+
+   for (unsigned i = 0; i < IR3_MAX_SO_BUFFERS; i++) {
+      struct tu_buffer *buf = cmd->state.streamout_buf.buffers[i];
+      if (!buf)
+         continue;
+
+      uint32_t offset = cmd->state.streamout_buf.offsets[i];
+
+      tu_cs_emit_regs(cs, A6XX_VPC_SO_BUFFER_BASE(i, .bo = buf->bo,
+                                                     .bo_offset = buf->bo_offset));
+      tu_cs_emit_regs(cs, A6XX_VPC_SO_BUFFER_SIZE(i, buf->size));
+
+      if (cmd->state.streamout_reset & (1 << i)) {
+         offset *= tf->stride[i];
+
+         tu_cs_emit_regs(cs, A6XX_VPC_SO_BUFFER_OFFSET(i, offset));
+         cmd->state.streamout_reset &= ~(1 << i);
+      } else {
+         tu_cs_emit_pkt7(cs, CP_MEM_TO_REG, 3);
+         tu_cs_emit(cs, CP_MEM_TO_REG_0_REG(REG_A6XX_VPC_SO_BUFFER_OFFSET(i)) |
+                        CP_MEM_TO_REG_0_SHIFT_BY_2 | CP_MEM_TO_REG_0_UNK31 |
+                        CP_MEM_TO_REG_0_CNT(0));
+         tu_cs_emit_qw(cs, cmd->scratch_bo.iova + VSC_FLUSH * (i + 1));
+      }
+
+      tu_cs_emit_regs(cs, A6XX_VPC_SO_FLUSH_BASE(i, .bo = &cmd->scratch_bo,
+                                                    .bo_offset = VSC_FLUSH * (i + 1)));
+   }
+
+   if (cmd->state.streamout_enabled) {
+      tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, 12 + (2 * tf->prog_count));
+      tu_cs_emit(cs, REG_A6XX_VPC_SO_BUF_CNTL);
+      tu_cs_emit(cs, tf->vpc_so_buf_cntl);
+      tu_cs_emit(cs, REG_A6XX_VPC_SO_NCOMP(0));
+      tu_cs_emit(cs, tf->ncomp[0]);
+      tu_cs_emit(cs, REG_A6XX_VPC_SO_NCOMP(1));
+      tu_cs_emit(cs, tf->ncomp[1]);
+      tu_cs_emit(cs, REG_A6XX_VPC_SO_NCOMP(2));
+      tu_cs_emit(cs, tf->ncomp[2]);
+      tu_cs_emit(cs, REG_A6XX_VPC_SO_NCOMP(3));
+      tu_cs_emit(cs, tf->ncomp[3]);
+      tu_cs_emit(cs, REG_A6XX_VPC_SO_CNTL);
+      tu_cs_emit(cs, A6XX_VPC_SO_CNTL_ENABLE);
+      for (unsigned i = 0; i < tf->prog_count; i++) {
+         tu_cs_emit(cs, REG_A6XX_VPC_SO_PROG);
+         tu_cs_emit(cs, tf->prog[i]);
+      }
+   } else {
+      tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, 4);
+      tu_cs_emit(cs, REG_A6XX_VPC_SO_CNTL);
+      tu_cs_emit(cs, 0);
+      tu_cs_emit(cs, REG_A6XX_VPC_SO_BUF_CNTL);
+      tu_cs_emit(cs, 0);
+   }
+}
+
 static VkResult
 tu6_bind_draw_states(struct tu_cmd_buffer *cmd,
                      struct tu_cs *cs,
@@ -3505,6 +3599,9 @@ tu6_bind_draw_states(struct tu_cmd_buffer *cmd,
          };
    }
 
+   if (cmd->state.dirty & TU_CMD_DIRTY_STREAMOUT_BUFFERS)
+      tu6_emit_streamout(cmd, cs);
+
    if (cmd->state.dirty &
          (TU_CMD_DIRTY_PIPELINE | TU_CMD_DIRTY_DESCRIPTOR_SETS)) {
       bool needs_border = false;
@@ -3623,6 +3720,15 @@ tu6_bind_draw_states(struct tu_cmd_buffer *cmd,
             }
       }
    }
+   if (cmd->state.dirty & TU_CMD_DIRTY_STREAMOUT_BUFFERS) {
+      for (unsigned i = 0; i < IR3_MAX_SO_BUFFERS; i++) {
+         const struct tu_buffer *buf = cmd->state.streamout_buf.buffers[i];
+         if (buf) {
+            tu_bo_list_add(&cmd->bo_list, buf->bo,
+                              MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
+         }
+      }
+   }
 
    /* Fragment shader state overwrites compute shader state, so flag the
     * compute pipeline for re-emit.
@@ -3742,6 +3848,13 @@ tu_draw(struct tu_cmd_buffer *cmd, const struct tu_draw_info *draw)
    else
       tu6_emit_draw_direct(cmd, cs, draw);
 
+   if (cmd->state.streamout_enabled) {
+      for (unsigned i = 0; i < IR3_MAX_SO_BUFFERS; i++) {
+         if (cmd->state.streamout_enabled & (1 << i))
+            tu6_emit_event_write(cmd, cs, FLUSH_SO_0 + i, false);
+      }
+   }
+
    cmd->wait_for_idle = true;
 
    tu_cs_sanity_check(cs);
src/freedreno/vulkan/tu_extensions.py
index 3e1a7cf6625e1af7d3cc0e649d38b9334732d97b..efeb8dc0c89e65c3c5552c724b4895b09dbaa007 100644
@@ -77,6 +77,7 @@ EXTENSIONS = [
     Extension('VK_KHR_external_memory_fd',                1, True),
     Extension('VK_EXT_external_memory_dma_buf',           1, True),
     Extension('VK_EXT_image_drm_format_modifier',         1, False),
+    Extension('VK_EXT_transform_feedback',                1, False),
 ]
 
 class VkVersion:
src/freedreno/vulkan/tu_private.h
index d04fe99aad1f2805986bcc2437a335d4b529aeb1..661529fa537be7fb2889e1115db5a743f41f4a55 100644
@@ -984,6 +984,7 @@ struct tu_cmd_buffer
    uint32_t scratch_seqno;
 #define VSC_OVERFLOW 0x8
 #define VSC_SCRATCH 0x10
+#define VSC_FLUSH   0x20
 
    struct tu_bo vsc_data;
    struct tu_bo vsc_data2;