From 246261f0addfc24f69ae412b9ef7e40e2c667a4a Mon Sep 17 00:00:00 2001
From: Lionel Landwerlin
Date: Mon, 30 Sep 2019 12:30:20 +0300
Subject: [PATCH] anv: prepare the driver for delayed submissions

Timeline semaphores introduce support for wait-before-signal behavior,
which means that it is now allowed to call vkQueueSubmit() with wait
semaphores not yet submitted for execution. Our kernel driver requires
all of the wait primitives to be created before calling the execbuf
ioctl. As a result, we must delay submissions in the userspace driver.

This change stores the necessary information to be able to delay a
VkSubmitInfo submission to the kernel driver.

v2: Fold count++ into array access (Jason)
    Move queue list to another patch (Jason)

v3: Document cleanup of temporary semaphores (Jason)

v4: Track semaphores of SYNC_FD type that need updating after delayed
    submission

v5: Don't forget to update sync_fd in signaled semaphores after
    submission (Jason)

Signed-off-by: Lionel Landwerlin
Reviewed-by: Jason Ekstrand
---
 src/intel/vulkan/anv_batch_chain.c | 327 +++++-----------
 src/intel/vulkan/anv_private.h     |  54 ++-
 src/intel/vulkan/anv_queue.c       | 574 +++++++++++++++++++++++------
 src/intel/vulkan/genX_query.c      |  37 +-
 4 files changed, 616 insertions(+), 376 deletions(-)

diff --git a/src/intel/vulkan/anv_batch_chain.c b/src/intel/vulkan/anv_batch_chain.c
index 5720859a674..297cb641f47 100644
--- a/src/intel/vulkan/anv_batch_chain.c
+++ b/src/intel/vulkan/anv_batch_chain.c
@@ -1047,10 +1047,6 @@ struct anv_execbuf {
    const VkAllocationCallbacks *             alloc;
    VkSystemAllocationScope                   alloc_scope;
 
-   uint32_t                                  fence_count;
-   uint32_t                                  fence_array_length;
-   struct drm_i915_gem_exec_fence *          fences;
-   struct anv_syncobj **                     syncobjs;
 };
 
 static void
@@ -1064,8 +1060,6 @@ anv_execbuf_finish(struct anv_execbuf *exec)
 {
    vk_free(exec->alloc, exec->objects);
    vk_free(exec->alloc, exec->bos);
-   vk_free(exec->alloc, exec->fences);
-   vk_free(exec->alloc, exec->syncobjs);
 }
 
 static VkResult
@@ -1195,34 +1189,6 @@ anv_execbuf_add_bo_bitset(struct anv_device *device,
    return VK_SUCCESS;
 }
 
-static VkResult
-anv_execbuf_add_syncobj(struct anv_execbuf *exec,
-                        uint32_t handle, uint32_t flags)
-{
-   assert(flags != 0);
-
-   if (exec->fence_count >= exec->fence_array_length) {
-      uint32_t new_len = MAX2(exec->fence_array_length * 2, 64);
-
-      exec->fences = vk_realloc(exec->alloc, exec->fences,
-                                new_len * sizeof(*exec->fences),
-                                8, exec->alloc_scope);
-      if (exec->fences == NULL)
-         return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
-      exec->fence_array_length = new_len;
-   }
-
-   exec->fences[exec->fence_count] = (struct drm_i915_gem_exec_fence) {
-      .handle = handle,
-      .flags = flags,
-   };
-
-   exec->fence_count++;
-
-   return VK_SUCCESS;
-}
-
 static void
 anv_cmd_buffer_process_relocs(struct anv_cmd_buffer *cmd_buffer,
                               struct anv_reloc_list *list)
@@ -1614,241 +1580,132 @@ setup_empty_execbuf(struct anv_execbuf *execbuf, struct anv_device *device)
 }
 
 VkResult
-anv_cmd_buffer_execbuf(struct anv_queue *queue,
-                       struct anv_cmd_buffer *cmd_buffer,
-                       const VkSemaphore *in_semaphores,
-                       uint32_t num_in_semaphores,
-                       const VkSemaphore *out_semaphores,
-                       uint32_t num_out_semaphores,
-                       VkFence _fence)
+anv_queue_execbuf(struct anv_queue *queue,
+                  struct anv_queue_submit *submit)
 {
-   ANV_FROM_HANDLE(anv_fence, fence, _fence);
    struct anv_device *device = queue->device;
-   UNUSED struct anv_physical_device *pdevice =
-      &device->instance->physicalDevice;
-
    struct anv_execbuf execbuf;
    anv_execbuf_init(&execbuf);
-   execbuf.alloc = &device->alloc;
-   execbuf.alloc_scope = VK_SYSTEM_ALLOCATION_SCOPE_COMMAND;
+   execbuf.alloc = submit->alloc;
+   execbuf.alloc_scope = submit->alloc_scope;
 
-   int in_fence = -1;
-   VkResult result = VK_SUCCESS;
-   for (uint32_t i = 0; i < num_in_semaphores; i++) {
-      ANV_FROM_HANDLE(anv_semaphore, semaphore, in_semaphores[i]);
-      struct anv_semaphore_impl *impl =
-         semaphore->temporary.type != ANV_SEMAPHORE_TYPE_NONE ?
-         &semaphore->temporary : &semaphore->permanent;
-
-      switch (impl->type) {
-      case ANV_SEMAPHORE_TYPE_BO:
-         assert(!pdevice->has_syncobj);
-         result = anv_execbuf_add_bo(device, &execbuf, impl->bo, NULL, 0);
-         if (result != VK_SUCCESS)
-            return result;
-         break;
+   VkResult result;
 
-      case ANV_SEMAPHORE_TYPE_SYNC_FILE:
-         assert(!pdevice->has_syncobj);
-         if (in_fence == -1) {
-            in_fence = impl->fd;
-            if (in_fence == -1)
-               return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-            impl->fd = -1;
-         } else {
-            int merge = anv_gem_sync_file_merge(device, in_fence, impl->fd);
-            if (merge == -1)
-               return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE);
-
-            close(impl->fd);
-            close(in_fence);
-            impl->fd = -1;
-            in_fence = merge;
-         }
-         break;
+   /* We lock around execbuf for three main reasons:
+    *
+    * 1) When a block pool is resized, we create a new gem handle with a
+    *    different size and, in the case of surface states, possibly a
+    *    different center offset but we re-use the same anv_bo struct when
+    *    we do so. If this happens in the middle of setting up an execbuf,
+    *    we could end up with our list of BOs out of sync with our list of
+    *    gem handles.
+    *
+    * 2) The algorithm we use for building the list of unique buffers isn't
+    *    thread-safe. While the client is supposed to synchronize around
+    *    QueueSubmit, this would be extremely difficult to debug if it ever
+    *    came up in the wild due to a broken app. It's better to play it
+    *    safe and just lock around QueueSubmit.
+    *
+    * 3) The anv_queue_execbuf function may perform relocations in
+    *    userspace. Due to the fact that the surface state buffer is shared
+    *    between batches, we can't afford to have that happen from multiple
+    *    threads at the same time. Even though the user is supposed to
+    *    ensure this doesn't happen, we play it safe as in (2) above.
+    *
+    * Since the only other things that ever take the device lock, such as
+    * block pool resize, only rarely happen, this will almost never be
+    * contended so taking a lock isn't really an expensive operation in
+    * this case.
+    */
+   pthread_mutex_lock(&device->mutex);
 
-      case ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ:
-         result = anv_execbuf_add_syncobj(&execbuf, impl->syncobj,
-                                          I915_EXEC_FENCE_WAIT);
-         if (result != VK_SUCCESS)
-            return result;
-         break;
+   for (uint32_t i = 0; i < submit->fence_bo_count; i++) {
+      int signaled;
+      struct anv_bo *bo = anv_unpack_ptr(submit->fence_bos[i], 1, &signaled);
 
-      default:
-         break;
-      }
+      result = anv_execbuf_add_bo(device, &execbuf, bo, NULL,
+                                  signaled ? EXEC_OBJECT_WRITE : 0);
+      if (result != VK_SUCCESS)
+         goto error;
    }
 
-   bool need_out_fence = false;
-   for (uint32_t i = 0; i < num_out_semaphores; i++) {
-      ANV_FROM_HANDLE(anv_semaphore, semaphore, out_semaphores[i]);
-
-      /* Under most circumstances, out fences won't be temporary. However,
-       * the spec does allow it for opaque_fd. From the Vulkan 1.0.53 spec:
-       *
-       *    "If the import is temporary, the implementation must restore the
-       *    semaphore to its prior permanent state after submitting the next
-       *    semaphore wait operation."
-       *
-       * The spec says nothing whatsoever about signal operations on
-       * temporarily imported semaphores so it appears they are allowed.
-       * There are also CTS tests that require this to work.
-       */
-      struct anv_semaphore_impl *impl =
-         semaphore->temporary.type != ANV_SEMAPHORE_TYPE_NONE ?
-         &semaphore->temporary : &semaphore->permanent;
-
-      switch (impl->type) {
-      case ANV_SEMAPHORE_TYPE_BO:
-         assert(!pdevice->has_syncobj);
-         result = anv_execbuf_add_bo(device, &execbuf, impl->bo, NULL,
-                                     EXEC_OBJECT_WRITE);
-         if (result != VK_SUCCESS)
-            return result;
-         break;
-
-      case ANV_SEMAPHORE_TYPE_SYNC_FILE:
-         assert(!pdevice->has_syncobj);
-         need_out_fence = true;
-         break;
-
-      case ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ:
-         result = anv_execbuf_add_syncobj(&execbuf, impl->syncobj,
-                                          I915_EXEC_FENCE_SIGNAL);
-         if (result != VK_SUCCESS)
-            return result;
-         break;
-
-      default:
-         break;
-      }
+   if (submit->cmd_buffer) {
+      result = setup_execbuf_for_cmd_buffer(&execbuf, submit->cmd_buffer);
+   } else if (submit->simple_bo) {
+      result = anv_execbuf_add_bo(device, &execbuf, submit->simple_bo, NULL, 0);
+      if (result != VK_SUCCESS)
+         goto error;
+
+      execbuf.execbuf = (struct drm_i915_gem_execbuffer2) {
+         .buffers_ptr = (uintptr_t) execbuf.objects,
+         .buffer_count = execbuf.bo_count,
+         .batch_start_offset = 0,
+         .batch_len = submit->simple_bo_size,
+         .flags = I915_EXEC_HANDLE_LUT | I915_EXEC_RENDER,
+         .rsvd1 = device->context_id,
+         .rsvd2 = 0,
+      };
+   } else {
+      result = setup_empty_execbuf(&execbuf, queue->device);
    }
 
-   if (fence) {
-      /* Under most circumstances, out fences won't be temporary. However,
-       * the spec does allow it for opaque_fd. From the Vulkan 1.0.53 spec:
-       *
-       *    "If the import is temporary, the implementation must restore the
-       *    semaphore to its prior permanent state after submitting the next
-       *    semaphore wait operation."
-       *
-       * The spec says nothing whatsoever about signal operations on
-       * temporarily imported semaphores so it appears they are allowed.
-       * There are also CTS tests that require this to work.
-       */
-      struct anv_fence_impl *impl =
-         fence->temporary.type != ANV_FENCE_TYPE_NONE ?
-         &fence->temporary : &fence->permanent;
-
-      switch (impl->type) {
-      case ANV_FENCE_TYPE_BO:
-         assert(!pdevice->has_syncobj_wait);
-         result = anv_execbuf_add_bo(device, &execbuf, impl->bo.bo, NULL,
-                                     EXEC_OBJECT_WRITE);
-         if (result != VK_SUCCESS)
-            return result;
-         break;
-
-      case ANV_FENCE_TYPE_SYNCOBJ:
-         result = anv_execbuf_add_syncobj(&execbuf, impl->syncobj,
-                                          I915_EXEC_FENCE_SIGNAL);
-         if (result != VK_SUCCESS)
-            return result;
-         break;
-
-      default:
-         unreachable("Invalid fence type");
-      }
-   }
+   if (result != VK_SUCCESS)
+      goto error;
 
-   if (cmd_buffer) {
-      if (unlikely(INTEL_DEBUG & DEBUG_BATCH)) {
-         struct anv_batch_bo **bo = u_vector_tail(&cmd_buffer->seen_bbos);
+   if (unlikely(INTEL_DEBUG & DEBUG_BATCH)) {
+      if (submit->cmd_buffer) {
+         struct anv_batch_bo **bo = u_vector_tail(&submit->cmd_buffer->seen_bbos);
 
-         device->cmd_buffer_being_decoded = cmd_buffer;
+         device->cmd_buffer_being_decoded = submit->cmd_buffer;
          gen_print_batch(&device->decoder_ctx, (*bo)->bo->map,
                          (*bo)->bo->size, (*bo)->bo->offset, false);
          device->cmd_buffer_being_decoded = NULL;
+      } else if (submit->simple_bo) {
+         gen_print_batch(&device->decoder_ctx, submit->simple_bo->map,
+                         submit->simple_bo->size, submit->simple_bo->offset, false);
+      } else {
+         gen_print_batch(&device->decoder_ctx,
+                         device->trivial_batch_bo->map,
+                         device->trivial_batch_bo->size,
+                         device->trivial_batch_bo->offset, false);
       }
-
-      result = setup_execbuf_for_cmd_buffer(&execbuf, cmd_buffer);
-   } else {
-      result = setup_empty_execbuf(&execbuf, device);
    }
 
-   if (result != VK_SUCCESS)
-      return result;
-
-   if (execbuf.fence_count > 0) {
+   if (submit->fence_count > 0) {
       assert(device->instance->physicalDevice.has_syncobj);
       execbuf.execbuf.flags |= I915_EXEC_FENCE_ARRAY;
-      execbuf.execbuf.num_cliprects = execbuf.fence_count;
-      execbuf.execbuf.cliprects_ptr = (uintptr_t) execbuf.fences;
+      execbuf.execbuf.num_cliprects = submit->fence_count;
+      execbuf.execbuf.cliprects_ptr = (uintptr_t)submit->fences;
    }
 
-   if (in_fence != -1) {
+   if (submit->in_fence != -1) {
       execbuf.execbuf.flags |= I915_EXEC_FENCE_IN;
-      execbuf.execbuf.rsvd2 |= (uint32_t)in_fence;
+      execbuf.execbuf.rsvd2 |= (uint32_t)submit->in_fence;
    }
 
-   if (need_out_fence)
+   if (submit->need_out_fence)
       execbuf.execbuf.flags |= I915_EXEC_FENCE_OUT;
 
-   result = anv_queue_execbuf(queue, &execbuf.execbuf, execbuf.bos);
-
-   /* Execbuf does not consume the in_fence. It's our job to close it. */
-   if (in_fence != -1)
-      close(in_fence);
-
-   for (uint32_t i = 0; i < num_in_semaphores; i++) {
-      ANV_FROM_HANDLE(anv_semaphore, semaphore, in_semaphores[i]);
-      /* From the Vulkan 1.0.53 spec:
-       *
-       *    "If the import is temporary, the implementation must restore the
-       *    semaphore to its prior permanent state after submitting the next
-       *    semaphore wait operation."
-       *
-       * This has to happen after the execbuf in case we close any syncobjs in
-       * the process.
-       */
-      anv_semaphore_reset_temporary(device, semaphore);
+   int ret = queue->device->no_hw ? 0 :
+      anv_gem_execbuffer(queue->device, &execbuf.execbuf);
+   if (ret) {
+      result = anv_queue_set_lost(queue, "execbuf2 failed: %m");
    }
 
-   if (fence && fence->permanent.type == ANV_FENCE_TYPE_BO) {
-      assert(!pdevice->has_syncobj_wait);
-      /* BO fences can't be shared, so they can't be temporary. */
-      assert(fence->temporary.type == ANV_FENCE_TYPE_NONE);
-
-      /* Once the execbuf has returned, we need to set the fence state to
-       * SUBMITTED. We can't do this before calling execbuf because
-       * anv_GetFenceStatus does take the global device lock before checking
-       * fence->state.
-       *
-       * We set the fence state to SUBMITTED regardless of whether or not the
-       * execbuf succeeds because we need to ensure that vkWaitForFences() and
-       * vkGetFenceStatus() return a valid result (VK_ERROR_DEVICE_LOST or
-       * VK_SUCCESS) in a finite amount of time even if execbuf fails.
-       */
-      fence->permanent.bo.state = ANV_BO_FENCE_STATE_SUBMITTED;
+   struct drm_i915_gem_exec_object2 *objects = execbuf.objects;
+   for (uint32_t k = 0; k < execbuf.bo_count; k++) {
+      if (execbuf.bos[k]->flags & EXEC_OBJECT_PINNED)
+         assert(execbuf.bos[k]->offset == objects[k].offset);
+      execbuf.bos[k]->offset = objects[k].offset;
    }
 
-   if (result == VK_SUCCESS && need_out_fence) {
-      assert(!pdevice->has_syncobj_wait);
-      int out_fence = execbuf.execbuf.rsvd2 >> 32;
-      for (uint32_t i = 0; i < num_out_semaphores; i++) {
-         ANV_FROM_HANDLE(anv_semaphore, semaphore, out_semaphores[i]);
-         /* Out fences can't have temporary state because that would imply
-          * that we imported a sync file and are trying to signal it.
-          */
-         assert(semaphore->temporary.type == ANV_SEMAPHORE_TYPE_NONE);
-         struct anv_semaphore_impl *impl = &semaphore->permanent;
+   if (result == VK_SUCCESS && submit->need_out_fence)
+      submit->out_fence = execbuf.execbuf.rsvd2 >> 32;
 
-         if (impl->type == ANV_SEMAPHORE_TYPE_SYNC_FILE) {
-            assert(impl->fd == -1);
-            impl->fd = dup(out_fence);
-         }
-      }
-      close(out_fence);
-   }
+ error:
+   pthread_cond_broadcast(&device->queue_submit);
+   pthread_mutex_unlock(&queue->device->mutex);
 
    anv_execbuf_finish(&execbuf);
diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
index 3aae01e6b2f..67049cc37fe 100644
--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -300,6 +300,20 @@ vk_to_isl_color(VkClearColorValue color)
    };
 }
 
+static inline void *anv_unpack_ptr(uintptr_t ptr, int bits, int *flags)
+{
+   uintptr_t mask = (1ull << bits) - 1;
+   *flags = ptr & mask;
+   return (void *) (ptr & ~mask);
+}
+
+static inline uintptr_t anv_pack_ptr(void *ptr, int bits, int flags)
+{
+   uintptr_t value = (uintptr_t) ptr;
+   uintptr_t mask = (1ull << bits) - 1;
+   return value | (mask & flags);
+}
+
 #define for_each_bit(b, dword)                           \
    for (uint32_t __dword = (dword);                      \
         (b) = __builtin_ffs(__dword) - 1, __dword;       \
@@ -1050,6 +1064,42 @@ uint32_t anv_physical_device_api_version(struct anv_physical_device *dev);
 bool anv_physical_device_extension_supported(struct anv_physical_device *dev,
                                              const char *name);
 
+struct anv_queue_submit {
+   struct anv_cmd_buffer *                   cmd_buffer;
+
+   uint32_t                                  fence_count;
+   uint32_t                                  fence_array_length;
+   struct drm_i915_gem_exec_fence *          fences;
+
+   uint32_t                                  temporary_semaphore_count;
+   uint32_t                                  temporary_semaphore_array_length;
+   struct anv_semaphore_impl *               temporary_semaphores;
+
+   /* Semaphores to be signaled with a SYNC_FD. */
+   struct anv_semaphore **                   sync_fd_semaphores;
+   uint32_t                                  sync_fd_semaphore_count;
+   uint32_t                                  sync_fd_semaphore_array_length;
+
+   int                                       in_fence;
+   bool                                      need_out_fence;
+   int                                       out_fence;
+
+   uint32_t                                  fence_bo_count;
+   uint32_t                                  fence_bo_array_length;
+   /* An array of struct anv_bo pointers, with the lower bit used as a flag
+    * to tell whether the submission will signal (write) that BO rather than
+    * just wait on it (see anv_(un)pack_ptr).
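+    *
+    * For example, anv_queue_submit_add_fence_bo() builds each entry with
+    *
+    *    fence_bos[i] = anv_pack_ptr(bo, 1, signal);
+    *
+    * and anv_queue_execbuf() recovers both values with
+    *
+    *    int signaled;
+    *    struct anv_bo *bo = anv_unpack_ptr(fence_bos[i], 1, &signaled);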
+    */
+   uintptr_t *                               fence_bos;
+
+   const VkAllocationCallbacks *             alloc;
+   VkSystemAllocationScope                   alloc_scope;
+
+   struct anv_bo *                           simple_bo;
+   uint32_t                                  simple_bo_size;
+
+   struct list_head                          link;
+};
+
 struct anv_queue {
    VK_LOADER_DATA                            _loader_data;
 
@@ -1318,9 +1368,7 @@ VkResult anv_device_wait(struct anv_device *device, struct anv_bo *bo,
 VkResult anv_queue_init(struct anv_device *device, struct anv_queue *queue);
 void anv_queue_finish(struct anv_queue *queue);
 
-VkResult anv_queue_execbuf(struct anv_queue *queue,
-                           struct drm_i915_gem_execbuffer2 *execbuf,
-                           struct anv_bo **execbuf_bos);
+VkResult anv_queue_execbuf(struct anv_queue *queue, struct anv_queue_submit *submit);
 VkResult anv_queue_submit_simple_batch(struct anv_queue *queue,
                                        struct anv_batch *batch);
 
diff --git a/src/intel/vulkan/anv_queue.c b/src/intel/vulkan/anv_queue.c
index 74f3a3c4254..91eed4eeeed 100644
--- a/src/intel/vulkan/anv_queue.c
+++ b/src/intel/vulkan/anv_queue.c
@@ -76,27 +76,54 @@ static int64_t anv_get_relative_timeout(uint64_t abs_timeout)
    return rel_timeout;
 }
 
-VkResult
-anv_queue_execbuf(struct anv_queue *queue,
-                  struct drm_i915_gem_execbuffer2 *execbuf,
-                  struct anv_bo **execbuf_bos)
+static struct anv_semaphore *anv_semaphore_ref(struct anv_semaphore *semaphore);
+static void anv_semaphore_unref(struct anv_device *device, struct anv_semaphore *semaphore);
+static void anv_semaphore_impl_cleanup(struct anv_device *device,
+                                       struct anv_semaphore_impl *impl);
+
+static void
+anv_queue_submit_free(struct anv_device *device,
+                      struct anv_queue_submit *submit)
 {
-   struct anv_device *device = queue->device;
-   int ret = device->no_hw ? 0 : anv_gem_execbuffer(device, execbuf);
-   if (ret != 0) {
-      /* We don't know the real error. */
-      return anv_queue_set_lost(queue, "execbuf2 failed: %m");
-   }
+   const VkAllocationCallbacks *alloc = submit->alloc;
+
+   for (uint32_t i = 0; i < submit->temporary_semaphore_count; i++)
+      anv_semaphore_impl_cleanup(device, &submit->temporary_semaphores[i]);
+   for (uint32_t i = 0; i < submit->sync_fd_semaphore_count; i++)
+      anv_semaphore_unref(device, submit->sync_fd_semaphores[i]);
+   /* Execbuf does not consume the in_fence. It's our job to close it. */
+   if (submit->in_fence != -1)
+      close(submit->in_fence);
+   if (submit->out_fence != -1)
+      close(submit->out_fence);
+   vk_free(alloc, submit->fences);
+   vk_free(alloc, submit->temporary_semaphores);
+   vk_free(alloc, submit->sync_fd_semaphores);
+   vk_free(alloc, submit->fence_bos);
+   vk_free(alloc, submit);
+}
 
-   struct drm_i915_gem_exec_object2 *objects =
-      (void *)(uintptr_t)execbuf->buffers_ptr;
-   for (uint32_t k = 0; k < execbuf->buffer_count; k++) {
-      if (execbuf_bos[k]->flags & EXEC_OBJECT_PINNED)
-         assert(execbuf_bos[k]->offset == objects[k].offset);
-      execbuf_bos[k]->offset = objects[k].offset;
+static VkResult
+_anv_queue_submit(struct anv_queue *queue, struct anv_queue_submit **_submit)
+{
+   struct anv_queue_submit *submit = *_submit;
+   VkResult result = anv_queue_execbuf(queue, submit);
+
+   if (result == VK_SUCCESS) {
+      /* Update signaled semaphores backed by syncfd. */
+      for (uint32_t i = 0; i < submit->sync_fd_semaphore_count; i++) {
+         struct anv_semaphore *semaphore = submit->sync_fd_semaphores[i];
+         /* Out fences can't have temporary state because that would imply
+          * that we imported a sync file and are trying to signal it.
+          */
+         assert(semaphore->temporary.type == ANV_SEMAPHORE_TYPE_NONE);
+         struct anv_semaphore_impl *impl = &semaphore->permanent;
+
+         assert(impl->type == ANV_SEMAPHORE_TYPE_SYNC_FILE);
+         impl->fd = dup(submit->out_fence);
+      }
    }
 
-   return VK_SUCCESS;
+   return result;
 }
 
 VkResult
@@ -114,69 +141,426 @@ anv_queue_finish(struct anv_queue *queue)
 {
 }
 
+static VkResult
+anv_queue_submit_add_fence_bo(struct anv_queue_submit *submit,
+                              struct anv_bo *bo,
+                              bool signal)
+{
+   if (submit->fence_bo_count >= submit->fence_bo_array_length) {
+      uint32_t new_len = MAX2(submit->fence_bo_array_length * 2, 64);
+
+      submit->fence_bos =
+         vk_realloc(submit->alloc,
+                    submit->fence_bos, new_len * sizeof(*submit->fence_bos),
+                    8, submit->alloc_scope);
+      if (submit->fence_bos == NULL)
+         return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+
+      submit->fence_bo_array_length = new_len;
+   }
+
+   /* Take advantage of the fact that anv_bo structs are allocated with an
+    * 8-byte alignment, so we can use the lowest bit to store whether this
+    * is a BO we need to signal.
+    */
+   submit->fence_bos[submit->fence_bo_count++] = anv_pack_ptr(bo, 1, signal);
+
+   return VK_SUCCESS;
+}
+
+static VkResult
+anv_queue_submit_add_syncobj(struct anv_queue_submit *submit,
+                             struct anv_device *device,
+                             uint32_t handle, uint32_t flags)
+{
+   assert(flags != 0);
+
+   if (submit->fence_count >= submit->fence_array_length) {
+      uint32_t new_len = MAX2(submit->fence_array_length * 2, 64);
+
+      submit->fences =
+         vk_realloc(submit->alloc,
+                    submit->fences, new_len * sizeof(*submit->fences),
+                    8, submit->alloc_scope);
+      if (submit->fences == NULL)
+         return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+
+      submit->fence_array_length = new_len;
+   }
+
+   submit->fences[submit->fence_count++] = (struct drm_i915_gem_exec_fence) {
+      .handle = handle,
+      .flags = flags,
+   };
+
+   return VK_SUCCESS;
+}
+
+static VkResult
+anv_queue_submit_add_sync_fd_fence(struct anv_queue_submit *submit,
+                                   struct anv_semaphore *semaphore)
+{
+   if (submit->sync_fd_semaphore_count >= submit->sync_fd_semaphore_array_length) {
+      uint32_t new_len = MAX2(submit->sync_fd_semaphore_array_length * 2, 64);
+      struct anv_semaphore **new_semaphores =
+         vk_realloc(submit->alloc, submit->sync_fd_semaphores,
+                    new_len * sizeof(*submit->sync_fd_semaphores), 8,
+                    submit->alloc_scope);
+      if (new_semaphores == NULL)
+         return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+
+      submit->sync_fd_semaphores = new_semaphores;
+      submit->sync_fd_semaphore_array_length = new_len;
+   }
+
+   submit->sync_fd_semaphores[submit->sync_fd_semaphore_count++] =
+      anv_semaphore_ref(semaphore);
+   submit->need_out_fence = true;
+
+   return VK_SUCCESS;
+}
+
+static struct anv_queue_submit *
+anv_queue_submit_alloc(struct anv_device *device)
+{
+   const VkAllocationCallbacks *alloc = &device->alloc;
+   VkSystemAllocationScope alloc_scope = VK_SYSTEM_ALLOCATION_SCOPE_COMMAND;
+
+   struct anv_queue_submit *submit = vk_zalloc(alloc, sizeof(*submit), 8, alloc_scope);
+   if (!submit)
+      return NULL;
+
+   submit->alloc = alloc;
+   submit->alloc_scope = alloc_scope;
+   submit->in_fence = -1;
+   submit->out_fence = -1;
+
+   return submit;
+}
+
 VkResult
 anv_queue_submit_simple_batch(struct anv_queue *queue,
                               struct anv_batch *batch)
 {
    struct anv_device *device = queue->device;
-   struct drm_i915_gem_execbuffer2 execbuf;
-   struct drm_i915_gem_exec_object2 exec2_objects[1];
-   struct anv_bo *bo;
-   VkResult result = VK_SUCCESS;
-   uint32_t size;
+   struct anv_queue_submit *submit = anv_queue_submit_alloc(device);
+   if (!submit)
+      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   bool has_syncobj_wait = device->instance->physicalDevice.has_syncobj_wait;
+   VkResult result;
+   uint32_t syncobj;
+   struct anv_bo *batch_bo, *sync_bo;
+
+   if (has_syncobj_wait) {
+      syncobj = anv_gem_syncobj_create(device, 0);
+      if (!syncobj) {
+         result = vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY);
+         goto err_free_submit;
+      }
+
+      result = anv_queue_submit_add_syncobj(submit, device, syncobj,
+                                            I915_EXEC_FENCE_SIGNAL);
+   } else {
+      result = anv_device_alloc_bo(device, 4096,
+                                   ANV_BO_ALLOC_EXTERNAL |
+                                   ANV_BO_ALLOC_IMPLICIT_SYNC,
+                                   &sync_bo);
+      if (result != VK_SUCCESS)
+         goto err_free_submit;
+
+      result = anv_queue_submit_add_fence_bo(submit, sync_bo, true /* signal */);
+   }
+
+   if (result != VK_SUCCESS)
+      goto err_destroy_sync_primitive;
 
    if (batch) {
-      /* Kernel driver requires 8 byte aligned batch length */
-      size = align_u32(batch->next - batch->start, 8);
-      result = anv_bo_pool_alloc(&device->batch_bo_pool, size, &bo);
+      uint32_t size = align_u32(batch->next - batch->start, 8);
+      result = anv_bo_pool_alloc(&device->batch_bo_pool, size, &batch_bo);
       if (result != VK_SUCCESS)
-         return result;
+         goto err_destroy_sync_primitive;
 
-      memcpy(bo->map, batch->start, size);
+      memcpy(batch_bo->map, batch->start, size);
       if (!device->info.has_llc)
-         gen_flush_range(bo->map, size);
-   } else {
-      size = device->trivial_batch_bo->size;
-      bo = device->trivial_batch_bo;
+         gen_flush_range(batch_bo->map, size);
+
+      submit->simple_bo = batch_bo;
+      submit->simple_bo_size = size;
    }
 
+   result = _anv_queue_submit(queue, &submit);
+
+   if (result == VK_SUCCESS) {
+      if (has_syncobj_wait) {
+         if (anv_gem_syncobj_wait(device, &syncobj, 1,
+                                  anv_get_absolute_timeout(INT64_MAX), true))
+            result = anv_device_set_lost(device, "anv_gem_syncobj_wait failed: %m");
+         anv_gem_syncobj_destroy(device, syncobj);
+      } else {
+         result = anv_device_wait(device, sync_bo,
+                                  anv_get_relative_timeout(INT64_MAX));
+         anv_device_release_bo(device, sync_bo);
+      }
+   }
+
+   if (batch)
+      anv_bo_pool_free(&device->batch_bo_pool, batch_bo);
+
+   if (submit)
+      anv_queue_submit_free(device, submit);
+
+   return result;
+
+ err_destroy_sync_primitive:
+   if (has_syncobj_wait)
+      anv_gem_syncobj_destroy(device, syncobj);
+   else
+      anv_device_release_bo(device, sync_bo);
+ err_free_submit:
+   if (submit)
+      anv_queue_submit_free(device, submit);
+
+   return result;
+}
+
+/* Transfer ownership of temporary semaphores from the VkSemaphore object to
+ * the anv_queue_submit object. Those temporary semaphores are then freed in
+ * anv_queue_submit_free() once the driver is finished with them.
+ */
+static VkResult
+maybe_transfer_temporary_semaphore(struct anv_queue_submit *submit,
+                                   struct anv_semaphore *semaphore,
+                                   struct anv_semaphore_impl **out_impl)
+{
+   struct anv_semaphore_impl *impl = &semaphore->temporary;
+
+   if (impl->type == ANV_SEMAPHORE_TYPE_NONE) {
+      *out_impl = &semaphore->permanent;
+      return VK_SUCCESS;
+   }
 
-   exec2_objects[0].handle = bo->gem_handle;
-   exec2_objects[0].relocation_count = 0;
-   exec2_objects[0].relocs_ptr = 0;
-   exec2_objects[0].alignment = 0;
-   exec2_objects[0].offset = bo->offset;
-   exec2_objects[0].flags = bo->flags;
-   exec2_objects[0].rsvd1 = 0;
-   exec2_objects[0].rsvd2 = 0;
-
-   execbuf.buffers_ptr = (uintptr_t) exec2_objects;
-   execbuf.buffer_count = 1;
-   execbuf.batch_start_offset = 0;
-   execbuf.batch_len = size;
-   execbuf.cliprects_ptr = 0;
-   execbuf.num_cliprects = 0;
-   execbuf.DR1 = 0;
-   execbuf.DR4 = 0;
-
-   execbuf.flags =
-      I915_EXEC_HANDLE_LUT | I915_EXEC_NO_RELOC | I915_EXEC_RENDER;
-   execbuf.rsvd1 = device->context_id;
-   execbuf.rsvd2 = 0;
-
-   if (unlikely(INTEL_DEBUG & DEBUG_BATCH)) {
-      gen_print_batch(&device->decoder_ctx, bo->map,
-                      bo->size, bo->offset, false);
+   /*
+    * There is a requirement to reset semaphores to their permanent state
+    * after submission. From the Vulkan 1.0.53 spec:
+    *
+    *    "If the import is temporary, the implementation must restore the
+    *    semaphore to its prior permanent state after submitting the next
+    *    semaphore wait operation."
+    *
+    * In the case where we defer the actual submission to a thread because of
+    * the wait-before-submit behavior required for timeline semaphores, we
+    * need to make copies of the temporary syncobjs to ensure they stay alive
+    * until we do the actual execbuffer ioctl.
+    */
+   if (submit->temporary_semaphore_count >= submit->temporary_semaphore_array_length) {
+      uint32_t new_len = MAX2(submit->temporary_semaphore_array_length * 2, 8);
+      /* Make sure that if the realloc fails, we still have the old semaphore
+       * array around to properly clean things up on failure.
+       */
+      struct anv_semaphore_impl *new_array =
+         vk_realloc(submit->alloc,
+                    submit->temporary_semaphores,
+                    new_len * sizeof(*submit->temporary_semaphores),
+                    8, submit->alloc_scope);
+      if (new_array == NULL)
+         return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+
+      submit->temporary_semaphores = new_array;
+      submit->temporary_semaphore_array_length = new_len;
    }
 
-   result = anv_queue_execbuf(queue, &execbuf, &bo);
+   /* Copy anv_semaphore_impl into anv_queue_submit.
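+    * The submit object now owns the temporary state and
+    * anv_queue_submit_free() will release it through
+    * anv_semaphore_impl_cleanup() once the submission is done.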
+    */
+   submit->temporary_semaphores[submit->temporary_semaphore_count++] = *impl;
+   *out_impl = &submit->temporary_semaphores[submit->temporary_semaphore_count - 1];
+
+   /* Clear the incoming semaphore */
+   impl->type = ANV_SEMAPHORE_TYPE_NONE;
+
+   return VK_SUCCESS;
+}
+
+static VkResult
+anv_queue_submit(struct anv_queue *queue,
+                 struct anv_cmd_buffer *cmd_buffer,
+                 const VkSemaphore *in_semaphores,
+                 uint32_t num_in_semaphores,
+                 const VkSemaphore *out_semaphores,
+                 uint32_t num_out_semaphores,
+                 VkFence _fence)
+{
+   ANV_FROM_HANDLE(anv_fence, fence, _fence);
+   struct anv_device *device = queue->device;
+   UNUSED struct anv_physical_device *pdevice = &device->instance->physicalDevice;
+   struct anv_queue_submit *submit = anv_queue_submit_alloc(device);
+   if (!submit)
+      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   submit->cmd_buffer = cmd_buffer;
+
+   VkResult result = VK_SUCCESS;
+
+   for (uint32_t i = 0; i < num_in_semaphores; i++) {
+      ANV_FROM_HANDLE(anv_semaphore, semaphore, in_semaphores[i]);
+      struct anv_semaphore_impl *impl;
+
+      result = maybe_transfer_temporary_semaphore(submit, semaphore, &impl);
+      if (result != VK_SUCCESS)
+         goto error;
+
+      switch (impl->type) {
+      case ANV_SEMAPHORE_TYPE_BO:
+         assert(!pdevice->has_syncobj);
+         result = anv_queue_submit_add_fence_bo(submit, impl->bo, false /* signal */);
+         if (result != VK_SUCCESS)
+            goto error;
+         break;
+
+      case ANV_SEMAPHORE_TYPE_SYNC_FILE:
+         assert(!pdevice->has_syncobj);
+         if (submit->in_fence == -1) {
+            submit->in_fence = impl->fd;
+            if (submit->in_fence == -1) {
+               result = vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE);
+               goto error;
+            }
+            impl->fd = -1;
+         } else {
+            int merge = anv_gem_sync_file_merge(device, submit->in_fence, impl->fd);
+            if (merge == -1) {
+               result = vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE);
+               goto error;
+            }
+            close(impl->fd);
+            close(submit->in_fence);
+            impl->fd = -1;
+            submit->in_fence = merge;
+         }
+         break;
+
+      case ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ: {
+         result = anv_queue_submit_add_syncobj(submit, device,
+                                               impl->syncobj,
+                                               I915_EXEC_FENCE_WAIT);
+         if (result != VK_SUCCESS)
+            goto error;
+         break;
+      }
+
+      default:
+         break;
+      }
+   }
+
+   for (uint32_t i = 0; i < num_out_semaphores; i++) {
+      ANV_FROM_HANDLE(anv_semaphore, semaphore, out_semaphores[i]);
+
+      /* Under most circumstances, out fences won't be temporary. However,
+       * the spec does allow it for opaque_fd. From the Vulkan 1.0.53 spec:
+       *
+       *    "If the import is temporary, the implementation must restore the
+       *    semaphore to its prior permanent state after submitting the next
+       *    semaphore wait operation."
+       *
+       * The spec says nothing whatsoever about signal operations on
+       * temporarily imported semaphores so it appears they are allowed.
+       * There are also CTS tests that require this to work.
+       */
+      struct anv_semaphore_impl *impl =
+         semaphore->temporary.type != ANV_SEMAPHORE_TYPE_NONE ?
+         &semaphore->temporary : &semaphore->permanent;
+
+      switch (impl->type) {
+      case ANV_SEMAPHORE_TYPE_BO:
+         assert(!pdevice->has_syncobj);
+         result = anv_queue_submit_add_fence_bo(submit, impl->bo, true /* signal */);
+         if (result != VK_SUCCESS)
+            goto error;
+         break;
+
+      case ANV_SEMAPHORE_TYPE_SYNC_FILE:
+         assert(!pdevice->has_syncobj);
+         result = anv_queue_submit_add_sync_fd_fence(submit, semaphore);
+         if (result != VK_SUCCESS)
+            goto error;
+         break;
+
+      case ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ: {
+         result = anv_queue_submit_add_syncobj(submit, device, impl->syncobj,
+                                               I915_EXEC_FENCE_SIGNAL);
+         if (result != VK_SUCCESS)
+            goto error;
+         break;
+      }
+
+      default:
+         break;
+      }
+   }
+
+   if (fence) {
+      /* Under most circumstances, out fences won't be temporary. However,
+       * the spec does allow it for opaque_fd. From the Vulkan 1.0.53 spec:
+       *
+       *    "If the import is temporary, the implementation must restore the
+       *    semaphore to its prior permanent state after submitting the next
+       *    semaphore wait operation."
+       *
+       * The spec says nothing whatsoever about signal operations on
+       * temporarily imported semaphores so it appears they are allowed.
+       * There are also CTS tests that require this to work.
+       */
+      struct anv_fence_impl *impl =
+         fence->temporary.type != ANV_FENCE_TYPE_NONE ?
+         &fence->temporary : &fence->permanent;
+
+      switch (impl->type) {
+      case ANV_FENCE_TYPE_BO:
+         result = anv_queue_submit_add_fence_bo(submit, impl->bo.bo, true /* signal */);
+         if (result != VK_SUCCESS)
+            goto error;
+         break;
+
+      case ANV_FENCE_TYPE_SYNCOBJ: {
+         /* Add the fence's syncobj to the execbuf so that it gets signaled
+          * once the batch completes.
+          */
+         result = anv_queue_submit_add_syncobj(submit, device, impl->syncobj,
+                                               I915_EXEC_FENCE_SIGNAL);
+         if (result != VK_SUCCESS)
+            goto error;
+         break;
+      }
+
+      default:
+         unreachable("Invalid fence type");
+      }
+   }
+
+   result = _anv_queue_submit(queue, &submit);
 
    if (result != VK_SUCCESS)
-      goto fail;
+      goto error;
 
-   result = anv_device_wait(device, bo, INT64_MAX);
+   if (fence && fence->permanent.type == ANV_FENCE_TYPE_BO) {
+      /* BO fences can't be shared, so they can't be temporary. */
+      assert(fence->temporary.type == ANV_FENCE_TYPE_NONE);
 
- fail:
-   if (batch)
-      anv_bo_pool_free(&device->batch_bo_pool, bo);
+      /* Once the execbuf has returned, we need to set the fence state to
+       * SUBMITTED. We can't do this before calling execbuf because
+       * anv_GetFenceStatus does take the global device lock before checking
+       * fence->state.
+       *
+       * We set the fence state to SUBMITTED regardless of whether or not the
+       * execbuf succeeds because we need to ensure that vkWaitForFences() and
+       * vkGetFenceStatus() return a valid result (VK_ERROR_DEVICE_LOST or
+       * VK_SUCCESS) in a finite amount of time even if execbuf fails.
+       */
+      fence->permanent.bo.state = ANV_BO_FENCE_STATE_SUBMITTED;
+   }
+
+ error:
+   if (submit)
+      anv_queue_submit_free(device, submit);
 
    return result;
 }
@@ -188,7 +572,6 @@ VkResult anv_QueueSubmit(
     VkFence                                     fence)
 {
    ANV_FROM_HANDLE(anv_queue, queue, _queue);
-   struct anv_device *device = queue->device;
 
    /* Query for device status prior to submitting. Technically, we don't need
    * to do this. However, if we have a client that's submitting piles of
    * garbage, we would rather the client fail to submit said garbage than
@@ -197,44 +580,17 @@ VkResult anv_QueueSubmit(
    * the kernel to kick us or we'll have to wait until the client waits on a
    * fence before we actually know whether or not we've hung.
    */
-   VkResult result = anv_device_query_status(device);
+   VkResult result = anv_device_query_status(queue->device);
    if (result != VK_SUCCESS)
       return result;
 
-   /* We lock around QueueSubmit for three main reasons:
-    *
-    * 1) When a block pool is resized, we create a new gem handle with a
-    *    different size and, in the case of surface states, possibly a
-    *    different center offset but we re-use the same anv_bo struct when
-    *    we do so. If this happens in the middle of setting up an execbuf,
-    *    we could end up with our list of BOs out of sync with our list of
-    *    gem handles.
-    *
-    * 2) The algorithm we use for building the list of unique buffers isn't
-    *    thread-safe. While the client is supposed to syncronize around
-    *    QueueSubmit, this would be extremely difficult to debug if it ever
-    *    came up in the wild due to a broken app. It's better to play it
-    *    safe and just lock around QueueSubmit.
-    *
-    * 3) The anv_cmd_buffer_execbuf function may perform relocations in
-    *    userspace. Due to the fact that the surface state buffer is shared
-    *    between batches, we can't afford to have that happen from multiple
-    *    threads at the same time. Even though the user is supposed to
-    *    ensure this doesn't happen, we play it safe as in (2) above.
-    *
-    * Since the only other things that ever take the device lock such as block
-    * pool resize only rarely happen, this will almost never be contended so
-    * taking a lock isn't really an expensive operation in this case.
-    */
-   pthread_mutex_lock(&device->mutex);
-
    if (fence && submitCount == 0) {
       /* If we don't have any command buffers, we need to submit a dummy
        * batch to give GEM something to wait on. We could, potentially,
        * come up with something more efficient but this shouldn't be a
       * common case.
       */
-      result = anv_cmd_buffer_execbuf(queue, NULL, NULL, 0, NULL, 0, fence);
+      result = anv_queue_submit(queue, NULL, NULL, 0, NULL, 0, fence);
       goto out;
    }
 
@@ -248,12 +604,12 @@ VkResult anv_QueueSubmit(
        * come up with something more efficient but this shouldn't be a
       * common case.
       */
-      result = anv_cmd_buffer_execbuf(queue, NULL,
-                                      pSubmits[i].pWaitSemaphores,
-                                      pSubmits[i].waitSemaphoreCount,
-                                      pSubmits[i].pSignalSemaphores,
-                                      pSubmits[i].signalSemaphoreCount,
-                                      submit_fence);
+      result = anv_queue_submit(queue, NULL,
+                                pSubmits[i].pWaitSemaphores,
+                                pSubmits[i].waitSemaphoreCount,
+                                pSubmits[i].pSignalSemaphores,
+                                pSubmits[i].signalSemaphoreCount,
+                                submit_fence);
       if (result != VK_SUCCESS)
          goto out;
 
@@ -285,19 +641,17 @@ VkResult anv_QueueSubmit(
          num_out_semaphores = pSubmits[i].signalSemaphoreCount;
       }
 
-      result = anv_cmd_buffer_execbuf(queue, cmd_buffer,
-                                      in_semaphores, num_in_semaphores,
-                                      out_semaphores, num_out_semaphores,
-                                      execbuf_fence);
+      result = anv_queue_submit(queue, cmd_buffer,
+                                in_semaphores, num_in_semaphores,
+                                out_semaphores, num_out_semaphores,
+                                execbuf_fence);
       if (result != VK_SUCCESS)
         goto out;
    }
 }
 
-   pthread_cond_broadcast(&device->queue_submit);
-
-out:
-   if (result != VK_SUCCESS) {
+out:
+   if (result != VK_SUCCESS && result != VK_ERROR_DEVICE_LOST) {
      /* In the case that something has gone wrong we may end up with an
       * inconsistent state from which it may not be trivial to recover.
       * For example, we might have computed address relocations and
@@ -309,12 +663,14 @@ out:
       * anyway (such us being out of memory) and return
       * VK_ERROR_DEVICE_LOST to ensure that clients do not attempt to
       * submit the same job again to this device.
+      *
+      * We skip doing this on VK_ERROR_DEVICE_LOST because
+      * anv_device_set_lost() would have been called already by a callee of
+      * anv_queue_submit().
       */
-      result = anv_device_set_lost(device, "vkQueueSubmit() failed");
+      result = anv_device_set_lost(queue->device, "vkQueueSubmit() failed");
    }
 
-   pthread_mutex_unlock(&device->mutex);
-
    return result;
 }
diff --git a/src/intel/vulkan/genX_query.c b/src/intel/vulkan/genX_query.c
index 4ff85f7ae66..0a295cebb87 100644
--- a/src/intel/vulkan/genX_query.c
+++ b/src/intel/vulkan/genX_query.c
@@ -240,38 +240,17 @@ static VkResult
 wait_for_available(struct anv_device *device,
                    struct anv_query_pool *pool, uint32_t query)
 {
-   while (true) {
+   uint64_t abs_timeout = anv_get_absolute_timeout(5 * NSEC_PER_SEC);
+
+   while (anv_gettime_ns() < abs_timeout) {
       if (query_is_available(pool, query))
         return VK_SUCCESS;
-
-      int ret = anv_gem_busy(device, pool->bo->gem_handle);
-      if (ret == 1) {
-         /* The BO is still busy, keep waiting. */
-         continue;
-      } else if (ret == -1) {
-         /* We don't know the real error. */
-         return anv_device_set_lost(device, "gem wait failed: %m");
-      } else {
-         assert(ret == 0);
-         /* The BO is no longer busy. */
-         if (query_is_available(pool, query)) {
-            return VK_SUCCESS;
-         } else {
-            VkResult status = anv_device_query_status(device);
-            if (status != VK_SUCCESS)
-               return status;
-
-            /* If we haven't seen availability yet, then we never will. This
-             * can only happen if we have a client error where they call
-             * GetQueryPoolResults on a query that they haven't submitted to
-             * the GPU yet. The spec allows us to do anything in this case,
-             * but returning VK_SUCCESS doesn't seem right and we shouldn't
-             * just keep spinning.
-             */
-            return VK_NOT_READY;
-         }
-      }
+
+      VkResult status = anv_device_query_status(device);
+      if (status != VK_SUCCESS)
+         return status;
    }
+
+   return anv_device_set_lost(device, "query timeout");
 }
 
 VkResult genX(GetQueryPoolResults)(
-- 
2.30.2
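
For reference, the pointer-packing helpers this patch adds to anv_private.h
can be exercised in isolation. Below is a minimal standalone sketch: the two
helpers are copied from the hunk above, while main() and the malloc-based
alignment check are illustrative assumptions only (the driver relies on
anv_bo structs being allocated with 8-byte alignment instead).

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

/* Helpers copied from the anv_private.h hunk above. */
static inline void *anv_unpack_ptr(uintptr_t ptr, int bits, int *flags)
{
   uintptr_t mask = (1ull << bits) - 1;
   *flags = ptr & mask;
   return (void *) (ptr & ~mask);
}

static inline uintptr_t anv_pack_ptr(void *ptr, int bits, int flags)
{
   uintptr_t value = (uintptr_t) ptr;
   uintptr_t mask = (1ull << bits) - 1;
   return value | (mask & flags);
}

int main(void)
{
   /* malloc() returns suitably aligned storage, so the low bit of the
    * pointer is guaranteed to be zero and can carry the "signal" flag,
    * just as anv_queue_submit_add_fence_bo() does for anv_bo pointers.
    */
   void *bo = malloc(32);
   assert(((uintptr_t)bo & 1) == 0);

   uintptr_t packed = anv_pack_ptr(bo, 1, 1);

   int signaled;
   void *unpacked = anv_unpack_ptr(packed, 1, &signaled);
   assert(unpacked == bo);
   assert(signaled == 1);

   free(bo);
   return 0;
}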
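
The genX_query.c hunk replaces an unbounded busy-wait with an absolute
deadline that is computed once and re-checked each iteration. A minimal
sketch of the same pattern, with clock_gettime(CLOCK_MONOTONIC) standing in
for the driver's anv_gettime_ns()/anv_get_absolute_timeout() helpers (the
names below are illustrative, not part of the driver):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define NSEC_PER_SEC 1000000000ull

/* Stand-in for the driver's anv_gettime_ns() helper. */
static uint64_t gettime_ns(void)
{
   struct timespec ts;
   clock_gettime(CLOCK_MONOTONIC, &ts);
   return (uint64_t)ts.tv_sec * NSEC_PER_SEC + (uint64_t)ts.tv_nsec;
}

/* Poll 'is_done' until it returns true or 'timeout_ns' elapses, mirroring
 * the structure of the new wait_for_available(): compute an absolute
 * deadline once, then re-check the condition on every iteration. A real
 * driver would also query device status in the loop and bail out early on
 * VK_ERROR_DEVICE_LOST.
 */
static bool poll_with_deadline(bool (*is_done)(void *), void *ctx,
                               uint64_t timeout_ns)
{
   uint64_t abs_timeout = gettime_ns() + timeout_ns;

   while (gettime_ns() < abs_timeout) {
      if (is_done(ctx))
         return true;
   }

   return false; /* timed out; the driver then marks the device lost */
}

static bool always_ready(void *ctx)
{
   (void)ctx;
   return true;
}

int main(void)
{
   bool ok = poll_with_deadline(always_ready, NULL, 5 * NSEC_PER_SEC);
   printf("%s\n", ok ? "available" : "timeout");
   return 0;
}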