radv: Split out commandbuffer submission.
authorBas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Thu, 3 Oct 2019 19:08:29 +0000 (21:08 +0200)
committerBas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Wed, 30 Oct 2019 10:57:07 +0000 (11:57 +0100)
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
src/amd/vulkan/radv_device.c

index 1d2addcd1d06001f838ce14e14f1e4d0ddccff35..239409adadae15a11e16c2bfdfec5989b361b859 100644 (file)
@@ -3516,172 +3516,211 @@ radv_alloc_sem_info(struct radv_instance *instance,
        return ret;
 }
 
-/* Signals fence as soon as all the work currently put on queue is done. */
-static VkResult radv_signal_fence(struct radv_queue *queue,
-                              struct radv_fence *fence)
-{
-       int ret;
-       VkResult result;
-       struct radv_winsys_sem_info sem_info;
-
-       result = radv_alloc_sem_info(queue->device->instance, &sem_info, 0, NULL, 0, NULL,
-                                    radv_fence_to_handle(fence));
-       if (result != VK_SUCCESS)
-               return result;
-
-       ret = queue->device->ws->cs_submit(queue->hw_ctx, queue->queue_idx,
-                                          &queue->device->empty_cs[queue->queue_family_index],
-                                          1, NULL, NULL, &sem_info, NULL,
-                                          false, fence->fence);
-       radv_free_sem_info(&sem_info);
-
-       if (ret)
-               return vk_error(queue->device->instance, VK_ERROR_DEVICE_LOST);
-
-       return VK_SUCCESS;
-}
-
-VkResult radv_QueueSubmit(
-       VkQueue                                     _queue,
-       uint32_t                                    submitCount,
-       const VkSubmitInfo*                         pSubmits,
-       VkFence                                     _fence)
+static VkResult
+radv_get_preambles(struct radv_queue *queue,
+                   const VkCommandBuffer *cmd_buffers,
+                   uint32_t cmd_buffer_count,
+                   struct radeon_cmdbuf **initial_full_flush_preamble_cs,
+                   struct radeon_cmdbuf **initial_preamble_cs,
+                   struct radeon_cmdbuf **continue_preamble_cs)
 {
-       RADV_FROM_HANDLE(radv_queue, queue, _queue);
-       RADV_FROM_HANDLE(radv_fence, fence, _fence);
-       struct radeon_winsys_fence *base_fence = fence ? fence->fence : NULL;
-       struct radeon_winsys_ctx *ctx = queue->hw_ctx;
-       int ret;
-       uint32_t max_cs_submission = queue->device->trace_bo ? 1 : RADV_MAX_IBS_PER_SUBMIT;
        uint32_t scratch_size = 0;
        uint32_t compute_scratch_size = 0;
        uint32_t esgs_ring_size = 0, gsvs_ring_size = 0;
-       struct radeon_cmdbuf *initial_preamble_cs = NULL, *initial_flush_preamble_cs = NULL, *continue_preamble_cs = NULL;
-       VkResult result;
-       bool fence_emitted = false;
        bool tess_rings_needed = false;
        bool gds_needed = false;
        bool sample_positions_needed = false;
 
-       /* Do this first so failing to allocate scratch buffers can't result in
-        * partially executed submissions. */
-       for (uint32_t i = 0; i < submitCount; i++) {
-               for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j++) {
-                       RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer,
-                                        pSubmits[i].pCommandBuffers[j]);
+       for (uint32_t j = 0; j < cmd_buffer_count; j++) {
+               RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer,
+                                cmd_buffers[j]);
 
-                       scratch_size = MAX2(scratch_size, cmd_buffer->scratch_size_needed);
-                       compute_scratch_size = MAX2(compute_scratch_size,
-                                                   cmd_buffer->compute_scratch_size_needed);
-                       esgs_ring_size = MAX2(esgs_ring_size, cmd_buffer->esgs_ring_size_needed);
-                       gsvs_ring_size = MAX2(gsvs_ring_size, cmd_buffer->gsvs_ring_size_needed);
-                       tess_rings_needed |= cmd_buffer->tess_rings_needed;
-                       gds_needed |= cmd_buffer->gds_needed;
-                       sample_positions_needed |= cmd_buffer->sample_positions_needed;
-               }
+               scratch_size = MAX2(scratch_size, cmd_buffer->scratch_size_needed);
+               compute_scratch_size = MAX2(compute_scratch_size,
+                                           cmd_buffer->compute_scratch_size_needed);
+               esgs_ring_size = MAX2(esgs_ring_size, cmd_buffer->esgs_ring_size_needed);
+               gsvs_ring_size = MAX2(gsvs_ring_size, cmd_buffer->gsvs_ring_size_needed);
+               tess_rings_needed |= cmd_buffer->tess_rings_needed;
+               gds_needed |= cmd_buffer->gds_needed;
+               sample_positions_needed |= cmd_buffer->sample_positions_needed;
        }
 
-       result = radv_get_preamble_cs(queue, scratch_size, compute_scratch_size,
+       return radv_get_preamble_cs(queue, scratch_size, compute_scratch_size,
                                      esgs_ring_size, gsvs_ring_size, tess_rings_needed,
-                                     gds_needed, sample_positions_needed,
-                                     &initial_flush_preamble_cs,
-                                     &initial_preamble_cs, &continue_preamble_cs);
+                                     gds_needed, sample_positions_needed,
+                                     initial_full_flush_preamble_cs,
+                                     initial_preamble_cs, continue_preamble_cs);
+}
+
+
+struct radv_queue_submission {
+       const VkCommandBuffer *cmd_buffers;
+       uint32_t cmd_buffer_count;
+       bool flush_caches;
+       VkPipelineStageFlags wait_dst_stage_mask;
+       const VkSemaphore *wait_semaphores;
+       uint32_t wait_semaphore_count;
+       const VkSemaphore *signal_semaphores;
+       uint32_t signal_semaphore_count;
+       VkFence fence;
+};
+
+static VkResult
+radv_queue_submit(struct radv_queue *queue,
+                  const struct radv_queue_submission *submission)
+{
+       RADV_FROM_HANDLE(radv_fence, fence, submission->fence);
+       struct radeon_cmdbuf **cs_array;
+       struct radeon_winsys_ctx *ctx = queue->hw_ctx;
+       uint32_t max_cs_submission = queue->device->trace_bo ? 1 : RADV_MAX_IBS_PER_SUBMIT;
+       struct radeon_winsys_fence *base_fence = fence ? fence->fence : NULL;
+       bool do_flush = submission->flush_caches || submission->wait_dst_stage_mask;
+       bool can_patch = true;
+       uint32_t advance;
+       struct radv_winsys_sem_info sem_info;
+       VkResult result;
+       int ret;
+       struct radeon_cmdbuf *initial_preamble_cs = NULL;
+       struct radeon_cmdbuf *initial_flush_preamble_cs = NULL;
+       struct radeon_cmdbuf *continue_preamble_cs = NULL;
+
+       result = radv_get_preambles(queue, submission->cmd_buffers,
+                                   submission->cmd_buffer_count,
+                                   &initial_preamble_cs,
+                                   &initial_flush_preamble_cs,
+                                   &continue_preamble_cs);
        if (result != VK_SUCCESS)
                return result;
 
-       for (uint32_t i = 0; i < submitCount; i++) {
-               struct radeon_cmdbuf **cs_array;
-               bool do_flush = !i || pSubmits[i].pWaitDstStageMask;
-               bool can_patch = true;
-               uint32_t advance;
-               struct radv_winsys_sem_info sem_info;
-
-               result = radv_alloc_sem_info(queue->device->instance,
-                                            &sem_info,
-                                            pSubmits[i].waitSemaphoreCount,
-                                            pSubmits[i].pWaitSemaphores,
-                                            pSubmits[i].signalSemaphoreCount,
-                                            pSubmits[i].pSignalSemaphores,
-                                            _fence);
-               if (result != VK_SUCCESS)
-                       return result;
+       result = radv_alloc_sem_info(queue->device->instance,
+                                    &sem_info,
+                                    submission->wait_semaphore_count,
+                                    submission->wait_semaphores,
+                                    submission->signal_semaphore_count,
+                                    submission->signal_semaphores,
+                                    submission->fence);
+       if (result != VK_SUCCESS)
+               return result;
 
-               if (!pSubmits[i].commandBufferCount) {
-                       if (pSubmits[i].waitSemaphoreCount || pSubmits[i].signalSemaphoreCount) {
-                               ret = queue->device->ws->cs_submit(ctx, queue->queue_idx,
-                                                                  &queue->device->empty_cs[queue->queue_family_index],
-                                                                  1, NULL, NULL,
-                                                                  &sem_info, NULL,
-                                                                  false, base_fence);
-                               if (ret) {
-                                       radv_loge("failed to submit CS %d\n", i);
-                                       abort();
-                               }
-                               fence_emitted = true;
-                       }
-                       radv_free_sem_info(&sem_info);
-                       continue;
+       if (!submission->cmd_buffer_count) {
+               ret = queue->device->ws->cs_submit(ctx, queue->queue_idx,
+                                                  &queue->device->empty_cs[queue->queue_family_index],
+                                                  1, NULL, NULL,
+                                                  &sem_info, NULL,
+                                                  false, base_fence);
+               if (ret) {
+                       radv_loge("failed to submit CS\n");
+                       abort();
                }
+               radv_free_sem_info(&sem_info);
+               return VK_SUCCESS;
+       }
 
-               cs_array = malloc(sizeof(struct radeon_cmdbuf *) *
-                                               (pSubmits[i].commandBufferCount));
+       cs_array = malloc(sizeof(struct radeon_cmdbuf *) *
+                                       (submission->cmd_buffer_count));
 
-               for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j++) {
-                       RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer,
-                                        pSubmits[i].pCommandBuffers[j]);
-                       assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
+       for (uint32_t j = 0; j < submission->cmd_buffer_count; j++) {
+               RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, submission->cmd_buffers[j]);
+               assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
 
-                       cs_array[j] = cmd_buffer->cs;
-                       if ((cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT))
-                               can_patch = false;
+               cs_array[j] = cmd_buffer->cs;
+               if ((cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT))
+                       can_patch = false;
 
-                       cmd_buffer->status = RADV_CMD_BUFFER_STATUS_PENDING;
-               }
+               cmd_buffer->status = RADV_CMD_BUFFER_STATUS_PENDING;
+       }
 
-               for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j += advance) {
-                       struct radeon_cmdbuf *initial_preamble = (do_flush && !j) ? initial_flush_preamble_cs : initial_preamble_cs;
-                       const struct radv_winsys_bo_list *bo_list = NULL;
+       for (uint32_t j = 0; j < submission->cmd_buffer_count; j += advance) {
+               struct radeon_cmdbuf *initial_preamble = (do_flush && !j) ? initial_flush_preamble_cs : initial_preamble_cs;
+               const struct radv_winsys_bo_list *bo_list = NULL;
 
-                       advance = MIN2(max_cs_submission,
-                                      pSubmits[i].commandBufferCount - j);
+               advance = MIN2(max_cs_submission,
+                              submission->cmd_buffer_count - j);
 
-                       if (queue->device->trace_bo)
-                               *queue->device->trace_id_ptr = 0;
+               if (queue->device->trace_bo)
+                       *queue->device->trace_id_ptr = 0;
 
-                       sem_info.cs_emit_wait = j == 0;
-                       sem_info.cs_emit_signal = j + advance == pSubmits[i].commandBufferCount;
+               sem_info.cs_emit_wait = j == 0;
+               sem_info.cs_emit_signal = j + advance == submission->cmd_buffer_count;
 
-                       if (unlikely(queue->device->use_global_bo_list)) {
-                               pthread_mutex_lock(&queue->device->bo_list.mutex);
-                               bo_list = &queue->device->bo_list.list;
-                       }
+               if (unlikely(queue->device->use_global_bo_list)) {
+                       pthread_mutex_lock(&queue->device->bo_list.mutex);
+                       bo_list = &queue->device->bo_list.list;
+               }
 
-                       ret = queue->device->ws->cs_submit(ctx, queue->queue_idx, cs_array + j,
-                                                       advance, initial_preamble, continue_preamble_cs,
-                                                       &sem_info, bo_list,
-                                                       can_patch, base_fence);
+               ret = queue->device->ws->cs_submit(ctx, queue->queue_idx, cs_array + j,
+                                               advance, initial_preamble, continue_preamble_cs,
+                                               &sem_info, bo_list,
+                                               can_patch, base_fence);
 
-                       if (unlikely(queue->device->use_global_bo_list))
-                               pthread_mutex_unlock(&queue->device->bo_list.mutex);
+               if (unlikely(queue->device->use_global_bo_list))
+                       pthread_mutex_unlock(&queue->device->bo_list.mutex);
 
-                       if (ret) {
-                               radv_loge("failed to submit CS %d\n", i);
-                               abort();
-                       }
-                       fence_emitted = true;
-                       if (queue->device->trace_bo) {
-                               radv_check_gpu_hangs(queue, cs_array[j]);
-                       }
+               if (ret) {
+                       radv_loge("failed to submit CS\n");
+                       abort();
+               }
+               if (queue->device->trace_bo) {
+                       radv_check_gpu_hangs(queue, cs_array[j]);
                }
+       }
 
-               radv_free_temp_syncobjs(queue->device,
-                                       pSubmits[i].waitSemaphoreCount,
-                                       pSubmits[i].pWaitSemaphores);
-               radv_free_sem_info(&sem_info);
-               free(cs_array);
+       radv_free_temp_syncobjs(queue->device,
+                               submission->wait_semaphore_count,
+                               submission->wait_semaphores);
+       radv_free_sem_info(&sem_info);
+       free(cs_array);
+       return VK_SUCCESS;
+}
+
+/* Signals fence as soon as all the work currently put on queue is done. */
+static VkResult radv_signal_fence(struct radv_queue *queue,
+                              VkFence fence)
+{
+       return radv_queue_submit(queue, &(struct radv_queue_submission) {
+                       .fence = fence
+               });
+}
+
+VkResult radv_QueueSubmit(
+       VkQueue                                     _queue,
+       uint32_t                                    submitCount,
+       const VkSubmitInfo*                         pSubmits,
+       VkFence                                     fence)
+{
+       RADV_FROM_HANDLE(radv_queue, queue, _queue);
+       VkResult result;
+       bool fence_emitted = false;
+
+       for (uint32_t i = 0; i < submitCount; i++) {
+               if (!pSubmits[i].commandBufferCount &&
+                   !pSubmits[i].waitSemaphoreCount &&
+                   !pSubmits[i].signalSemaphoreCount)
+                       continue;
+
+               VkPipelineStageFlags wait_dst_stage_mask = 0;
+               for (unsigned j = 0; j < pSubmits[i].waitSemaphoreCount; ++j) {
+                       wait_dst_stage_mask |= pSubmits[i].pWaitDstStageMask[j];
+               }
+
+               result = radv_queue_submit(queue, &(struct radv_queue_submission) {
+                               .cmd_buffers = pSubmits[i].pCommandBuffers,
+                               .cmd_buffer_count = pSubmits[i].commandBufferCount,
+                               .wait_dst_stage_mask = wait_dst_stage_mask,
+                               .flush_caches = !fence_emitted,
+                               .wait_semaphores = pSubmits[i].pWaitSemaphores,
+                               .wait_semaphore_count = pSubmits[i].waitSemaphoreCount,
+                               .signal_semaphores = pSubmits[i].pSignalSemaphores,
+                               .signal_semaphore_count = pSubmits[i].signalSemaphoreCount,
+                               .fence = fence
+                       });
+               if (result != VK_SUCCESS)
+                       return result;
+
+               fence_emitted = true;
        }
 
-       if (fence) {
+       if (fence != VK_NULL_HANDLE) {
                if (!fence_emitted) {
                        result = radv_signal_fence(queue, fence);
                        if (result != VK_SUCCESS)
@@ -4308,17 +4347,13 @@ radv_sparse_image_opaque_bind_memory(struct radv_device *device,
        VkQueue                                     _queue,
        uint32_t                                    bindInfoCount,
        const VkBindSparseInfo*                     pBindInfo,
-       VkFence                                     _fence)
+       VkFence                                     fence)
 {
-       RADV_FROM_HANDLE(radv_fence, fence, _fence);
        RADV_FROM_HANDLE(radv_queue, queue, _queue);
-       struct radeon_winsys_fence *base_fence = fence ? fence->fence : NULL;
        bool fence_emitted = false;
        VkResult result;
-       int ret;
 
        for (uint32_t i = 0; i < bindInfoCount; ++i) {
-               struct radv_winsys_sem_info sem_info;
                for (uint32_t j = 0; j < pBindInfo[i].bufferBindCount; ++j) {
                        radv_sparse_buffer_bind_memory(queue->device,
                                                       pBindInfo[i].pBufferBinds + j);
@@ -4329,36 +4364,25 @@ radv_sparse_image_opaque_bind_memory(struct radv_device *device,
                                                             pBindInfo[i].pImageOpaqueBinds + j);
                }
 
-               VkResult result;
-               result = radv_alloc_sem_info(queue->device->instance,
-                                            &sem_info,
-                                            pBindInfo[i].waitSemaphoreCount,
-                                            pBindInfo[i].pWaitSemaphores,
-                                            pBindInfo[i].signalSemaphoreCount,
-                                            pBindInfo[i].pSignalSemaphores,
-                                            _fence);
-               if (result != VK_SUCCESS)
-                       return result;
-
-               if (pBindInfo[i].waitSemaphoreCount || pBindInfo[i].signalSemaphoreCount) {
-                       ret = queue->device->ws->cs_submit(queue->hw_ctx, queue->queue_idx,
-                                                         &queue->device->empty_cs[queue->queue_family_index],
-                                                         1, NULL, NULL,
-                                                         &sem_info, NULL,
-                                                         false, base_fence);
-                       if (ret) {
-                               radv_loge("failed to submit CS %d\n", i);
-                               abort();
-                       }
+               if (!pBindInfo[i].waitSemaphoreCount &&
+                   !pBindInfo[i].signalSemaphoreCount)
+                       continue;
 
-                       fence_emitted = true;
-               }
+               VkResult result = radv_queue_submit(queue, &(struct radv_queue_submission) {
+                               .wait_semaphores = pBindInfo[i].pWaitSemaphores,
+                               .wait_semaphore_count = pBindInfo[i].waitSemaphoreCount,
+                               .signal_semaphores = pBindInfo[i].pSignalSemaphores,
+                               .signal_semaphore_count = pBindInfo[i].signalSemaphoreCount,
+                               .fence = fence
+                       });
 
-               radv_free_sem_info(&sem_info);
+               if (result != VK_SUCCESS)
+                       return result;
 
+               fence_emitted = true;
        }
 
-       if (fence) {
+       if (fence != VK_NULL_HANDLE) {
                if (!fence_emitted) {
                        result = radv_signal_fence(queue, fence);
                        if (result != VK_SUCCESS)