radv: make sure to wait for CP DMA when needed
author Samuel Pitoiset <samuel.pitoiset@gmail.com>
Mon, 9 Jul 2018 16:02:58 +0000 (18:02 +0200)
committer Samuel Pitoiset <samuel.pitoiset@gmail.com>
Wed, 11 Jul 2018 10:11:56 +0000 (12:11 +0200)
The kernel doesn't wait for CP DMA to be idle at the end of IBs, and the
driver may have used CP DMA to copy or fill buffers/images without
syncing afterwards. Track whether CP DMA is busy and wait for it at the
end of command buffers, in barriers and when writing events.

This might fix some synchronization issues. I don't know if it will
affect performance, but it is required for correctness.
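
To make the mechanism concrete, here is a standalone toy model of the
busy/idle tracking (toy_state, toy_emit_cp_dma and toy_wait_for_idle are
made-up names for illustration only; the real code is the dma_is_busy
flag and si_cp_dma_wait_for_idle() in the diff below):

  #include <stdbool.h>
  #include <stdio.h>

  /* Toy stand-in for radv_cmd_state.dma_is_busy. */
  struct toy_state {
          bool dma_is_busy;
  };

  /* Stand-in for si_emit_cp_dma(): an unsynced DMA leaves the engine
   * busy, a synced one makes the CP wait for all previous DMAs. */
  static void toy_emit_cp_dma(struct toy_state *s, unsigned size, bool sync)
  {
          printf("CP DMA: %u bytes, sync=%d\n", size, sync);
          s->dma_is_busy = !sync;
  }

  /* Stand-in for si_cp_dma_wait_for_idle(): a dummy zero-byte DMA with
   * the sync flag is enough, and it is skipped when nothing is busy. */
  static void toy_wait_for_idle(struct toy_state *s)
  {
          if (!s->dma_is_busy)
                  return;
          toy_emit_cp_dma(s, 0, true);
  }

  int main(void)
  {
          struct toy_state s = { .dma_is_busy = false };

          toy_emit_cp_dma(&s, 4096, false); /* e.g. a buffer fill */
          toy_wait_for_idle(&s);            /* e.g. at a barrier */
          toy_wait_for_idle(&s);            /* no-op, already idle */
          return 0;
  }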

CC: <mesa-stable@lists.freedesktop.org>
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
src/amd/vulkan/radv_cmd_buffer.c
src/amd/vulkan/radv_private.h
src/amd/vulkan/si_cmd_buffer.c

diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index 9da42fe03e9fd2af75a7166057486ab9fcd82e57..5dbdb3d99665f3f101e0a6fb0887035a4936326f 100644
@@ -2596,6 +2596,11 @@ VkResult radv_EndCommandBuffer(
                si_emit_cache_flush(cmd_buffer);
        }
 
+       /* Make sure CP DMA is idle at the end of IBs because the kernel
+        * doesn't wait for it.
+        */
+       si_cp_dma_wait_for_idle(cmd_buffer);
+
        vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.attachments);
 
        if (!cmd_buffer->device->ws->cs_finalize(cmd_buffer->cs))
@@ -4242,6 +4247,11 @@ radv_barrier(struct radv_cmd_buffer *cmd_buffer,
                                             0);
        }
 
+       /* Make sure CP DMA is idle because the driver might have performed a
+        * DMA operation for copying or filling buffers/images.
+        */
+       si_cp_dma_wait_for_idle(cmd_buffer);
+
        cmd_buffer->state.flush_bits |= dst_flush_bits;
 }
 
@@ -4292,6 +4302,11 @@ static void write_event(struct radv_cmd_buffer *cmd_buffer,
                VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT |
                VK_PIPELINE_STAGE_VERTEX_INPUT_BIT;
 
+       /* Make sure CP DMA is idle because the driver might have performed a
+        * DMA operation for copying or filling buffers/images.
+        */
+       si_cp_dma_wait_for_idle(cmd_buffer);
+
        /* TODO: Emit EOS events for syncing PS/CS stages. */
 
        if (!(stageMask & ~top_of_pipe_flags)) {
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index 4e4b3a6037a8b4ef245f62c04d5c5da0b9ad62b5..2400de49a25f9e0e7991c7e75a254ef629e41e66 100644
@@ -979,6 +979,9 @@ struct radv_cmd_state {
        uint32_t last_num_instances;
        uint32_t last_first_instance;
        uint32_t last_vertex_offset;
+
+       /* Whether CP DMA is busy/idle. */
+       bool dma_is_busy;
 };
 
 struct radv_cmd_pool {
@@ -1091,6 +1094,8 @@ void si_cp_dma_prefetch(struct radv_cmd_buffer *cmd_buffer, uint64_t va,
                         unsigned size);
 void si_cp_dma_clear_buffer(struct radv_cmd_buffer *cmd_buffer, uint64_t va,
                            uint64_t size, unsigned value);
+void si_cp_dma_wait_for_idle(struct radv_cmd_buffer *cmd_buffer);
+
 void radv_set_db_count_control(struct radv_cmd_buffer *cmd_buffer);
 bool
 radv_cmd_buffer_upload_alloc(struct radv_cmd_buffer *cmd_buffer,
diff --git a/src/amd/vulkan/si_cmd_buffer.c b/src/amd/vulkan/si_cmd_buffer.c
index 454fd8c39c806dc28c7fe18eb891daf724032332..6d566a918dfc4540e0055cd956c566b74995c066 100644
@@ -1040,7 +1040,6 @@ static void si_emit_cp_dma(struct radv_cmd_buffer *cmd_buffer,
        struct radeon_cmdbuf *cs = cmd_buffer->cs;
        uint32_t header = 0, command = 0;
 
-       assert(size);
        assert(size <= cp_dma_max_byte_count(cmd_buffer));
 
        radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 9);
@@ -1099,9 +1098,14 @@ static void si_emit_cp_dma(struct radv_cmd_buffer *cmd_buffer,
         * indices. If we wanted to execute CP DMA in PFP, this packet
         * should precede it.
         */
-       if ((flags & CP_DMA_SYNC) && cmd_buffer->queue_family_index == RADV_QUEUE_GENERAL) {
-               radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating));
-               radeon_emit(cs, 0);
+       if (flags & CP_DMA_SYNC) {
+               if (cmd_buffer->queue_family_index == RADV_QUEUE_GENERAL) {
+                       radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating));
+                       radeon_emit(cs, 0);
+               }
+
+               /* CP will see the sync flag and wait for all DMAs to complete. */
+               cmd_buffer->state.dma_is_busy = false;
        }
 
        if (unlikely(cmd_buffer->device->trace_bo))
@@ -1165,6 +1169,8 @@ void si_cp_dma_buffer_copy(struct radv_cmd_buffer *cmd_buffer,
        uint64_t main_src_va, main_dest_va;
        uint64_t skipped_size = 0, realign_size = 0;
 
+       /* Assume that we are not going to sync after the last DMA operation. */
+       cmd_buffer->state.dma_is_busy = true;
 
        if (cmd_buffer->device->physical_device->rad_info.family <= CHIP_CARRIZO ||
            cmd_buffer->device->physical_device->rad_info.family == CHIP_STONEY) {
@@ -1228,6 +1234,9 @@ void si_cp_dma_clear_buffer(struct radv_cmd_buffer *cmd_buffer, uint64_t va,
 
        assert(va % 4 == 0 && size % 4 == 0);
 
+       /* Assume that we are not going to sync after the last DMA operation. */
+       cmd_buffer->state.dma_is_busy = true;
+
        while (size) {
                unsigned byte_count = MIN2(size, cp_dma_max_byte_count(cmd_buffer));
                unsigned dma_flags = CP_DMA_CLEAR;
@@ -1243,6 +1252,25 @@ void si_cp_dma_clear_buffer(struct radv_cmd_buffer *cmd_buffer, uint64_t va,
        }
 }
 
+void si_cp_dma_wait_for_idle(struct radv_cmd_buffer *cmd_buffer)
+{
+       if (cmd_buffer->device->physical_device->rad_info.chip_class < CIK)
+               return;
+
+       if (!cmd_buffer->state.dma_is_busy)
+               return;
+
+       /* Issue a dummy DMA that copies zero bytes.
+        *
+        * The DMA engine will see that there's no work to do and skip this
+        * DMA request, however, the CP will see the sync flag and still wait
+        * for all DMAs to complete.
+        */
+       si_emit_cp_dma(cmd_buffer, 0, 0, 0, CP_DMA_SYNC);
+
+       cmd_buffer->state.dma_is_busy = false;
+}
+
 /* For MSAA sample positions. */
 #define FILL_SREG(s0x, s0y, s1x, s1y, s2x, s2y, s3x, s3y)  \
        (((s0x) & 0xf) | (((unsigned)(s0y) & 0xf) << 4) |                  \