si_emit_cache_flush(cmd_buffer);
}
+ /* Make sure CP DMA is idle at the end of IBs because the kernel
+ * doesn't wait for it.
+ */
+ si_cp_dma_wait_for_idle(cmd_buffer);
+
vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.attachments);
if (!cmd_buffer->device->ws->cs_finalize(cmd_buffer->cs))
0);
}
+ /* Make sure CP DMA is idle because the driver might have performed a
+ * DMA operation for copying or filling buffers/images.
+ */
+ si_cp_dma_wait_for_idle(cmd_buffer);
+
cmd_buffer->state.flush_bits |= dst_flush_bits;
}
VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT |
VK_PIPELINE_STAGE_VERTEX_INPUT_BIT;
+ /* Make sure CP DMA is idle because the driver might have performed a
+ * DMA operation for copying or filling buffers/images.
+ */
+ si_cp_dma_wait_for_idle(cmd_buffer);
+
/* TODO: Emit EOS events for syncing PS/CS stages. */
if (!(stageMask & ~top_of_pipe_flags)) {
uint32_t last_num_instances;
uint32_t last_first_instance;
uint32_t last_vertex_offset;
+
+ /* True while a CP DMA operation may still be in flight, i.e. no
+  * CP_DMA_SYNC has been emitted since the last DMA was queued.
+  */
+ bool dma_is_busy;
};
struct radv_cmd_pool {
unsigned size);
void si_cp_dma_clear_buffer(struct radv_cmd_buffer *cmd_buffer, uint64_t va,
uint64_t size, unsigned value);
+void si_cp_dma_wait_for_idle(struct radv_cmd_buffer *cmd_buffer);
+
void radv_set_db_count_control(struct radv_cmd_buffer *cmd_buffer);
bool
radv_cmd_buffer_upload_alloc(struct radv_cmd_buffer *cmd_buffer,
struct radeon_cmdbuf *cs = cmd_buffer->cs;
uint32_t header = 0, command = 0;
- assert(size);
assert(size <= cp_dma_max_byte_count(cmd_buffer));
radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 9);
* indices. If we wanted to execute CP DMA in PFP, this packet
* should precede it.
*/
- if ((flags & CP_DMA_SYNC) && cmd_buffer->queue_family_index == RADV_QUEUE_GENERAL) {
- radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating));
- radeon_emit(cs, 0);
+ if (flags & CP_DMA_SYNC) {
+ if (cmd_buffer->queue_family_index == RADV_QUEUE_GENERAL) {
+ radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating));
+ radeon_emit(cs, 0);
+ }
+
+ /* CP will see the sync flag and wait for all DMAs to complete. */
+ cmd_buffer->state.dma_is_busy = false;
}
if (unlikely(cmd_buffer->device->trace_bo))
uint64_t main_src_va, main_dest_va;
uint64_t skipped_size = 0, realign_size = 0;
+ /* Assume that we are not going to sync after the last DMA operation. */
+ cmd_buffer->state.dma_is_busy = true;
if (cmd_buffer->device->physical_device->rad_info.family <= CHIP_CARRIZO ||
cmd_buffer->device->physical_device->rad_info.family == CHIP_STONEY) {
assert(va % 4 == 0 && size % 4 == 0);
+ /* Assume that we are not going to sync after the last DMA operation. */
+ cmd_buffer->state.dma_is_busy = true;
+
while (size) {
unsigned byte_count = MIN2(size, cp_dma_max_byte_count(cmd_buffer));
unsigned dma_flags = CP_DMA_CLEAR;
}
}
+/* Make the CP wait until all previously queued CP DMA operations are done.
+ *
+ * Nothing is emitted on chips older than CIK, or when dma_is_busy is
+ * not set (no DMA has been queued since the last sync).
+ */
+void si_cp_dma_wait_for_idle(struct radv_cmd_buffer *cmd_buffer)
+{
+	if (cmd_buffer->device->physical_device->rad_info.chip_class >= CIK &&
+	    cmd_buffer->state.dma_is_busy) {
+		/* Queue a zero-byte DMA with the sync flag set.
+		 *
+		 * The DMA engine has nothing to copy and skips the request,
+		 * but the CP still honours the sync flag and waits for every
+		 * outstanding DMA to complete.
+		 */
+		si_emit_cp_dma(cmd_buffer, 0, 0, 0, CP_DMA_SYNC);
+
+		cmd_buffer->state.dma_is_busy = false;
+	}
+}
+
/* For MSAA sample positions. */
#define FILL_SREG(s0x, s0y, s1x, s1y, s2x, s2y, s3x, s3y) \
(((s0x) & 0xf) | (((unsigned)(s0y) & 0xf) << 4) | \