From: Lionel Landwerlin Date: Wed, 28 Aug 2019 10:22:30 +0000 (+0300) Subject: anv: implement shareable timeline semaphores X-Git-Url: https://git.libre-soc.org/?p=mesa.git;a=commitdiff_plain;h=829699ba632b2b78e4de372baf42ae01095158a7 anv: implement shareable timeline semaphores This implements timeline semaphores using a new type of dma-fence stored into drm-syncobjs. We use a thread to implement delayed submissions. v2: Drop cloning of temporary semaphores and just transfer their ownership (Jason) Drain queue when dealing with binary semaphore Ensure we don't submit to the thread as long as we don't need to v3: Use __u64 not uintptr_t for kernel pointers Fix commented code for INTEL_DEBUG=bat Set DRM_I915_GEM_EXECBUFFER_EXT_TIMELINE_FENCES in timeline fence execbuf extension Add new anv_queue_set_lost() Drop multi queue stuff meant for the fake multi queue patch Rework temporary syncobj handling Don't use syncobj when not available (DeviceWaitIdle/CreateDevice) Use ANV_MULTIALLOC And a few more tweaks... 
v4: Drop drained condition helper (Lionel) Fix missing EXEC_OBJECT_WRITE on BOs we want to wait on (Jason) v5: Add missing device->lost_reported in _anv_device_report_lost (Lionel) Fix missing free on submit->simple_bo (Lionel) Don't drop setting the device in lost state on QueueSubmit error (Jason) Store submit->fence_bos as an array of uintptr_t (Jason) v6: condition device->has_thread_submit to i915 & core DRM support (Jason) v7: Fix submit->in_fence leakage on error (Jason) Keep dummy semaphore with no thread submission (Jason) v8: Move ownership of submit->out_fence to submit (Jason) v9: Don't forget to read the VkFence's syncobj binary payload (Lionel) v10: Take the mutex lock on anv_gem_close() (Jason/Lionel) v11: Fix void* -> u64 cast on 32bit (Lionel) v12: Rebase after BO backed timeline semaphore (Lionel) v13: Fix missing snippets lost after rebase (Lionel) v14: Drop update_binary usage (Lionel) v15: Use ANV_MULTIALLOC (Lionel) v16: Fix some realloc issues (Ivan) Signed-off-by: Lionel Landwerlin Reviewed-by: Jason Ekstrand (v8) Part-of: --- diff --git a/src/intel/vulkan/anv_batch_chain.c b/src/intel/vulkan/anv_batch_chain.c index 21cead1df7b..4aefa58ea1e 100644 --- a/src/intel/vulkan/anv_batch_chain.c +++ b/src/intel/vulkan/anv_batch_chain.c @@ -1091,6 +1091,8 @@ anv_cmd_buffer_add_secondary(struct anv_cmd_buffer *primary, struct anv_execbuf { struct drm_i915_gem_execbuffer2 execbuf; + struct drm_i915_gem_execbuffer_ext_timeline_fences timeline_fences; + struct drm_i915_gem_exec_object2 * objects; uint32_t bo_count; struct anv_bo ** bos; @@ -1119,6 +1121,24 @@ anv_execbuf_finish(struct anv_execbuf *exec) vk_free(exec->alloc, exec->bos); } +static void +anv_execbuf_add_ext(struct anv_execbuf *exec, + uint32_t ext_name, + struct i915_user_extension *ext) +{ + __u64 *iter = &exec->execbuf.cliprects_ptr; + + exec->execbuf.flags |= I915_EXEC_USE_EXTENSIONS; + + while (*iter != 0) { + iter = (__u64 *) &((struct i915_user_extension 
*)(uintptr_t)*iter)->next_extension; + } + + ext->name = ext_name; + + *iter = (uintptr_t) ext; +} + static VkResult anv_execbuf_add_bo_bitset(struct anv_device *device, struct anv_execbuf *exec, @@ -1754,18 +1774,30 @@ anv_queue_execbuf_locked(struct anv_queue *queue, if (submit->fence_count > 0) { assert(device->physical->has_syncobj); - execbuf.execbuf.flags |= I915_EXEC_FENCE_ARRAY; - execbuf.execbuf.num_cliprects = submit->fence_count; - execbuf.execbuf.cliprects_ptr = (uintptr_t)submit->fences; + if (device->has_thread_submit) { + execbuf.timeline_fences.fence_count = submit->fence_count; + execbuf.timeline_fences.handles_ptr = (uintptr_t)submit->fences; + execbuf.timeline_fences.values_ptr = (uintptr_t)submit->fence_values; + anv_execbuf_add_ext(&execbuf, + DRM_I915_GEM_EXECBUFFER_EXT_TIMELINE_FENCES, + &execbuf.timeline_fences.base); + } else { + execbuf.execbuf.flags |= I915_EXEC_FENCE_ARRAY; + execbuf.execbuf.num_cliprects = submit->fence_count; + execbuf.execbuf.cliprects_ptr = (uintptr_t)submit->fences; + } } if (submit->in_fence != -1) { + assert(!device->has_thread_submit); execbuf.execbuf.flags |= I915_EXEC_FENCE_IN; execbuf.execbuf.rsvd2 |= (uint32_t)submit->in_fence; } - if (submit->need_out_fence) + if (submit->need_out_fence) { + assert(!device->has_thread_submit); execbuf.execbuf.flags |= I915_EXEC_FENCE_OUT; + } if (has_perf_query) { struct anv_query_pool *query_pool = submit->cmd_buffer->perf_query_pool; diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c index 55d079e133f..be4d1909d2e 100644 --- a/src/intel/vulkan/anv_device.c +++ b/src/intel/vulkan/anv_device.c @@ -460,6 +460,9 @@ anv_physical_device_try_create(struct anv_instance *instance, if (env_var_as_boolean("ANV_QUEUE_THREAD_DISABLE", false)) device->has_exec_timeline = false; + device->has_thread_submit = + device->has_syncobj_wait_available && device->has_exec_timeline; + device->always_use_bindless = env_var_as_boolean("ANV_ALWAYS_BINDLESS", false); @@ 
-2821,6 +2824,8 @@ VkResult anv_CreateDevice( goto fail_fd; } + device->has_thread_submit = physical_device->has_thread_submit; + result = anv_queue_init(device, &device->queue); if (result != VK_SUCCESS) goto fail_context_id; @@ -3111,12 +3116,12 @@ void anv_DestroyDevice( if (!device) return; + anv_queue_finish(&device->queue); + anv_device_finish_blorp(device); anv_pipeline_cache_finish(&device->default_pipeline_cache); - anv_queue_finish(&device->queue); - #ifdef HAVE_VALGRIND /* We only need to free these to prevent valgrind errors. The backing * BO will go away in a couple of lines so we don't actually leak. @@ -3228,6 +3233,22 @@ void anv_GetDeviceQueue2( *pQueue = NULL; } +void +_anv_device_report_lost(struct anv_device *device) +{ + assert(p_atomic_read(&device->_lost) > 0); + + device->lost_reported = true; + + struct anv_queue *queue = &device->queue; + + __vk_errorf(device->physical->instance, device, + VK_DEBUG_REPORT_OBJECT_TYPE_DEVICE_EXT, + VK_ERROR_DEVICE_LOST, + queue->error_file, queue->error_line, + "%s", queue->error_msg); +} + VkResult _anv_device_set_lost(struct anv_device *device, const char *file, int line, @@ -3236,7 +3257,11 @@ _anv_device_set_lost(struct anv_device *device, VkResult err; va_list ap; + if (p_atomic_read(&device->_lost) > 0) + return VK_ERROR_DEVICE_LOST; + p_atomic_inc(&device->_lost); + device->lost_reported = true; va_start(ap, msg); err = __vk_errorv(device->physical->instance, device, @@ -3252,24 +3277,29 @@ _anv_device_set_lost(struct anv_device *device, VkResult _anv_queue_set_lost(struct anv_queue *queue, - const char *file, int line, - const char *msg, ...) + const char *file, int line, + const char *msg, ...) 
{ - VkResult err; va_list ap; - p_atomic_inc(&queue->device->_lost); + if (queue->lost) + return VK_ERROR_DEVICE_LOST; + queue->lost = true; + + queue->error_file = file; + queue->error_line = line; va_start(ap, msg); - err = __vk_errorv(queue->device->physical->instance, queue->device, - VK_DEBUG_REPORT_OBJECT_TYPE_DEVICE_EXT, - VK_ERROR_DEVICE_LOST, file, line, msg, ap); + vsnprintf(queue->error_msg, sizeof(queue->error_msg), + msg, ap); va_end(ap); + p_atomic_inc(&queue->device->_lost); + if (env_var_as_boolean("ANV_ABORT_ON_DEVICE_LOSS", false)) abort(); - return err; + return VK_ERROR_DEVICE_LOST; } VkResult diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index 54b1c730791..46f64cfd8b2 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -1082,6 +1082,7 @@ struct anv_physical_device { bool has_syncobj_wait_available; bool has_context_priority; bool has_context_isolation; + bool has_thread_submit; bool has_mem_available; bool has_mmap_offset; uint64_t gtt_size; @@ -1183,6 +1184,7 @@ struct anv_queue_submit { uint32_t fence_count; uint32_t fence_array_length; struct drm_i915_gem_exec_fence * fences; + uint64_t * fence_values; uint32_t temporary_semaphore_count; uint32_t temporary_semaphore_array_length; @@ -1194,7 +1196,10 @@ struct anv_queue_submit { uint32_t sync_fd_semaphore_array_length; /* Allocated only with non shareable timelines. */ - struct anv_timeline ** wait_timelines; + union { + struct anv_timeline ** wait_timelines; + uint32_t * wait_timeline_syncobjs; + }; uint32_t wait_timeline_count; uint32_t wait_timeline_array_length; uint64_t * wait_timeline_values; @@ -1229,14 +1234,34 @@ struct anv_queue_submit { struct anv_queue { struct vk_object_base base; - struct anv_device * device; + struct anv_device * device; - /* - * A list of struct anv_queue_submit to be submitted to i915. 
- */ - struct list_head queued_submits; + VkDeviceQueueCreateFlags flags; + + /* Set once from the device api calls. */ + bool lost_signaled; + + /* Only set once atomically by the queue */ + int lost; + int error_line; + const char * error_file; + char error_msg[80]; + + /* + * This mutex protects the variables below. + */ + pthread_mutex_t mutex; + + pthread_t thread; + pthread_cond_t cond; + + /* + * A list of struct anv_queue_submit to be submitted to i915. + */ + struct list_head queued_submits; - VkDeviceQueueCreateFlags flags; + /* Set to true to stop the submission thread */ + bool quit; }; struct anv_pipeline_cache { @@ -1330,6 +1355,7 @@ struct anv_device { int fd; bool can_chain_batches; bool robust_buffer_access; + bool has_thread_submit; struct anv_device_extension_table enabled_extensions; struct anv_device_dispatch_table dispatch; @@ -1382,6 +1408,7 @@ struct anv_device { pthread_mutex_t mutex; pthread_cond_t queue_submit; int _lost; + int lost_reported; struct gen_batch_decode_ctx decoder_ctx; /* @@ -1439,7 +1466,7 @@ anv_mocs_for_bo(const struct anv_device *device, const struct anv_bo *bo) void anv_device_init_blorp(struct anv_device *device); void anv_device_finish_blorp(struct anv_device *device); -void _anv_device_set_all_queue_lost(struct anv_device *device); +void _anv_device_report_lost(struct anv_device *device); VkResult _anv_device_set_lost(struct anv_device *device, const char *file, int line, const char *msg, ...) @@ -1451,12 +1478,17 @@ VkResult _anv_queue_set_lost(struct anv_queue *queue, #define anv_device_set_lost(dev, ...) \ _anv_device_set_lost(dev, __FILE__, __LINE__, __VA_ARGS__) #define anv_queue_set_lost(queue, ...) \ - _anv_queue_set_lost(queue, __FILE__, __LINE__, __VA_ARGS__) + (queue)->device->has_thread_submit ? 
\ + _anv_queue_set_lost(queue, __FILE__, __LINE__, __VA_ARGS__) : \ + _anv_device_set_lost(queue->device, __FILE__, __LINE__, __VA_ARGS__) static inline bool anv_device_is_lost(struct anv_device *device) { - return unlikely(p_atomic_read(&device->_lost)); + int lost = p_atomic_read(&device->_lost); + if (unlikely(lost && !device->lost_reported)) + _anv_device_report_lost(device); + return lost; } VkResult anv_device_query_status(struct anv_device *device); @@ -3176,6 +3208,7 @@ enum anv_semaphore_type { ANV_SEMAPHORE_TYPE_SYNC_FILE, ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ, ANV_SEMAPHORE_TYPE_TIMELINE, + ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ_TIMELINE, }; struct anv_timeline_point { diff --git a/src/intel/vulkan/anv_queue.c b/src/intel/vulkan/anv_queue.c index 25646d07f1a..fdf10f2c012 100644 --- a/src/intel/vulkan/anv_queue.c +++ b/src/intel/vulkan/anv_queue.c @@ -95,11 +95,16 @@ anv_queue_submit_free(struct anv_device *device, for (uint32_t i = 0; i < submit->sync_fd_semaphore_count; i++) anv_semaphore_unref(device, submit->sync_fd_semaphores[i]); /* Execbuf does not consume the in_fence. It's our job to close it. 
*/ - if (submit->in_fence != -1) + if (submit->in_fence != -1) { + assert(!device->has_thread_submit); close(submit->in_fence); - if (submit->out_fence != -1) + } + if (submit->out_fence != -1) { + assert(!device->has_thread_submit); close(submit->out_fence); + } vk_free(alloc, submit->fences); + vk_free(alloc, submit->fence_values); vk_free(alloc, submit->temporary_semaphores); vk_free(alloc, submit->wait_timelines); vk_free(alloc, submit->wait_timeline_values); @@ -349,6 +354,98 @@ anv_device_submit_deferred_locked(struct anv_device *device) return anv_queue_submit_deferred_locked(&device->queue, &advance); } +static void +anv_queue_submit_signal_fences(struct anv_device *device, + struct anv_queue_submit *submit) +{ + for (uint32_t i = 0; i < submit->fence_count; i++) { + if (submit->fences[i].flags & I915_EXEC_FENCE_SIGNAL) { + anv_gem_syncobj_timeline_signal(device, &submit->fences[i].handle, + &submit->fence_values[i], 1); + } + } +} + +static void * +anv_queue_task(void *_queue) +{ + struct anv_queue *queue = _queue; + + pthread_mutex_lock(&queue->mutex); + + while (!queue->quit) { + while (!list_is_empty(&queue->queued_submits)) { + struct anv_queue_submit *submit = + list_first_entry(&queue->queued_submits, struct anv_queue_submit, link); + list_del(&submit->link); + + pthread_mutex_unlock(&queue->mutex); + + VkResult result = VK_ERROR_DEVICE_LOST; + + /* Wait for timeline points to materialize before submitting. We need + * to do this because we're using threads to do the submit to i915. + * We could end up in a situation where the application submits to 2 + * queues with the first submit creating the dma-fence for the + * second. But because the scheduling of the submission threads might + * wakeup the second queue thread first, this would make that execbuf + * fail because the dma-fence it depends on hasn't materialized yet. + */ + if (!queue->lost && submit->wait_timeline_count > 0) { + int ret = queue->device->no_hw ? 
0 : + anv_gem_syncobj_timeline_wait( + queue->device, submit->wait_timeline_syncobjs, + submit->wait_timeline_values, submit->wait_timeline_count, + anv_get_absolute_timeout(UINT64_MAX) /* wait forever */, + true /* wait for all */, true /* wait for materialize */); + if (ret) { + result = anv_queue_set_lost(queue, "timeline timeout: %s", + strerror(errno)); + } + } + + /* Now submit */ + if (!queue->lost) { + pthread_mutex_lock(&queue->device->mutex); + result = anv_queue_execbuf_locked(queue, submit); + pthread_mutex_unlock(&queue->device->mutex); + } + + for (uint32_t i = 0; i < submit->sync_fd_semaphore_count; i++) { + struct anv_semaphore *semaphore = submit->sync_fd_semaphores[i]; + /* Out fences can't have temporary state because that would imply + * that we imported a sync file and are trying to signal it. + */ + assert(semaphore->temporary.type == ANV_SEMAPHORE_TYPE_NONE); + struct anv_semaphore_impl *impl = &semaphore->permanent; + + assert(impl->type == ANV_SEMAPHORE_TYPE_SYNC_FILE); + impl->fd = dup(submit->out_fence); + } + + if (result != VK_SUCCESS) { + /* vkQueueSubmit or some other entry point will report the + * DEVICE_LOST error at some point, but until we have emptied our + * list of execbufs we need to wake up all potential the waiters + * until one of them spots the error. + */ + anv_queue_submit_signal_fences(queue->device, submit); + } + + anv_queue_submit_free(queue->device, submit); + + pthread_mutex_lock(&queue->mutex); + } + + if (!queue->quit) + pthread_cond_wait(&queue->cond, &queue->mutex); + } + + pthread_mutex_unlock(&queue->mutex); + + return NULL; +} + static VkResult _anv_queue_submit(struct anv_queue *queue, struct anv_queue_submit **_submit, bool flush_queue) @@ -360,42 +457,92 @@ _anv_queue_submit(struct anv_queue *queue, struct anv_queue_submit **_submit, * anv_queue. 
*/ *_submit = NULL; + if (queue->device->has_thread_submit) { + pthread_mutex_lock(&queue->mutex); + pthread_cond_broadcast(&queue->cond); + list_addtail(&submit->link, &queue->queued_submits); + pthread_mutex_unlock(&queue->mutex); + return VK_SUCCESS; + } else { + pthread_mutex_lock(&queue->device->mutex); + list_addtail(&submit->link, &queue->queued_submits); + VkResult result = anv_device_submit_deferred_locked(queue->device); + if (flush_queue) { + while (result == VK_SUCCESS && !list_is_empty(&queue->queued_submits)) { + int ret = pthread_cond_wait(&queue->device->queue_submit, + &queue->device->mutex); + if (ret != 0) { + result = anv_device_set_lost(queue->device, "wait timeout"); + break; + } - pthread_mutex_lock(&queue->device->mutex); - list_addtail(&submit->link, &queue->queued_submits); - VkResult result = anv_device_submit_deferred_locked(queue->device); - if (flush_queue) { - while (result == VK_SUCCESS && !list_is_empty(&queue->queued_submits)) { - int ret = pthread_cond_wait(&queue->device->queue_submit, - &queue->device->mutex); - if (ret != 0) { - result = anv_device_set_lost(queue->device, "wait timeout"); - break; + result = anv_device_submit_deferred_locked(queue->device); } - - result = anv_device_submit_deferred_locked(queue->device); } + pthread_mutex_unlock(&queue->device->mutex); + return result; } - pthread_mutex_unlock(&queue->device->mutex); - return result; } VkResult anv_queue_init(struct anv_device *device, struct anv_queue *queue) { - vk_object_base_init(&device->vk, &queue->base, VK_OBJECT_TYPE_QUEUE); + VkResult result; + queue->device = device; queue->flags = 0; + queue->lost = false; + queue->quit = false; list_inithead(&queue->queued_submits); + /* We only need those additional thread/mutex when using a thread for + * submission. 
+ */ + if (device->has_thread_submit) { + if (pthread_mutex_init(&queue->mutex, NULL) != 0) + return vk_error(VK_ERROR_INITIALIZATION_FAILED); + + if (pthread_cond_init(&queue->cond, NULL) != 0) { + result = vk_error(VK_ERROR_INITIALIZATION_FAILED); + goto fail_mutex; + } + if (pthread_create(&queue->thread, NULL, anv_queue_task, queue)) { + result = vk_error(VK_ERROR_INITIALIZATION_FAILED); + goto fail_cond; + } + } + + vk_object_base_init(&device->vk, &queue->base, VK_OBJECT_TYPE_QUEUE); + return VK_SUCCESS; + + fail_cond: + pthread_cond_destroy(&queue->cond); + fail_mutex: + pthread_mutex_destroy(&queue->mutex); + + return result; } void anv_queue_finish(struct anv_queue *queue) { vk_object_base_finish(&queue->base); + + if (!queue->device->has_thread_submit) + return; + + pthread_mutex_lock(&queue->mutex); + pthread_cond_broadcast(&queue->cond); + queue->quit = true; + pthread_mutex_unlock(&queue->mutex); + + void *ret; + pthread_join(queue->thread, &ret); + + pthread_cond_destroy(&queue->cond); + pthread_mutex_destroy(&queue->mutex); } static VkResult @@ -427,10 +574,42 @@ anv_queue_submit_add_fence_bo(struct anv_queue_submit *submit, static VkResult anv_queue_submit_add_syncobj(struct anv_queue_submit* submit, struct anv_device *device, - uint32_t handle, uint32_t flags) + uint32_t handle, uint32_t flags, + uint64_t value) { assert(flags != 0); + if (device->has_thread_submit && (flags & I915_EXEC_FENCE_WAIT)) { + if (submit->wait_timeline_count >= submit->wait_timeline_array_length) { + uint32_t new_len = MAX2(submit->wait_timeline_array_length * 2, 64); + + uint32_t *new_wait_timeline_syncobjs = + vk_realloc(submit->alloc, + submit->wait_timeline_syncobjs, + new_len * sizeof(*submit->wait_timeline_syncobjs), + 8, submit->alloc_scope); + if (new_wait_timeline_syncobjs == NULL) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + + submit->wait_timeline_syncobjs = new_wait_timeline_syncobjs; + + uint64_t *new_wait_timeline_values = + vk_realloc(submit->alloc, + 
submit->wait_timeline_values, new_len * sizeof(*submit->wait_timeline_values), + 8, submit->alloc_scope); + if (new_wait_timeline_values == NULL) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + + submit->wait_timeline_values = new_wait_timeline_values; + submit->wait_timeline_array_length = new_len; + } + + submit->wait_timeline_syncobjs[submit->wait_timeline_count] = handle; + submit->wait_timeline_values[submit->wait_timeline_count] = value; + + submit->wait_timeline_count++; + } + if (submit->fence_count >= submit->fence_array_length) { uint32_t new_len = MAX2(submit->fence_array_length * 2, 64); struct drm_i915_gem_exec_fence *new_fences = @@ -441,13 +620,24 @@ anv_queue_submit_add_syncobj(struct anv_queue_submit* submit, return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); submit->fences = new_fences; + + uint64_t *new_fence_values = + vk_realloc(submit->alloc, + submit->fence_values, new_len * sizeof(*submit->fence_values), + 8, submit->alloc_scope); + if (new_fence_values == NULL) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + + submit->fence_values = new_fence_values; submit->fence_array_length = new_len; } - submit->fences[submit->fence_count++] = (struct drm_i915_gem_exec_fence) { + submit->fences[submit->fence_count] = (struct drm_i915_gem_exec_fence) { .handle = handle, .flags = flags, }; + submit->fence_values[submit->fence_count] = value; + submit->fence_count++; return VK_SUCCESS; } @@ -595,7 +785,7 @@ anv_queue_submit_simple_batch(struct anv_queue *queue, } result = anv_queue_submit_add_syncobj(submit, device, syncobj, - I915_EXEC_FENCE_SIGNAL); + I915_EXEC_FENCE_SIGNAL, 0); } else { result = anv_device_alloc_bo(device, 4096, ANV_BO_ALLOC_EXTERNAL | @@ -742,7 +932,6 @@ anv_queue_submit(struct anv_queue *queue, submit->cmd_buffer = cmd_buffer; VkResult result = VK_SUCCESS; - for (uint32_t i = 0; i < num_in_semaphores; i++) { ANV_FROM_HANDLE(anv_semaphore, semaphore, in_semaphores[i]); struct anv_semaphore_impl *impl; @@ -796,7 +985,8 @@ 
anv_queue_submit(struct anv_queue *queue, case ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ: { result = anv_queue_submit_add_syncobj(submit, device, impl->syncobj, - I915_EXEC_FENCE_WAIT); + I915_EXEC_FENCE_WAIT, + 0); if (result != VK_SUCCESS) goto error; break; @@ -810,6 +1000,15 @@ anv_queue_submit(struct anv_queue *queue, goto error; break; + case ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ_TIMELINE: + result = anv_queue_submit_add_syncobj(submit, device, + impl->syncobj, + I915_EXEC_FENCE_WAIT, + in_values ? in_values[i] : 0); + if (result != VK_SUCCESS) + goto error; + break; + default: break; } @@ -850,7 +1049,8 @@ anv_queue_submit(struct anv_queue *queue, case ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ: { result = anv_queue_submit_add_syncobj(submit, device, impl->syncobj, - I915_EXEC_FENCE_SIGNAL); + I915_EXEC_FENCE_SIGNAL, + 0); if (result != VK_SUCCESS) goto error; break; @@ -864,6 +1064,14 @@ anv_queue_submit(struct anv_queue *queue, goto error; break; + case ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ_TIMELINE: + result = anv_queue_submit_add_syncobj(submit, device, impl->syncobj, + I915_EXEC_FENCE_SIGNAL, + out_values ? out_values[i] : 0); + if (result != VK_SUCCESS) + goto error; + break; + default: break; } @@ -893,6 +1101,7 @@ anv_queue_submit(struct anv_queue *queue, switch (impl->type) { case ANV_FENCE_TYPE_BO: + assert(!device->has_thread_submit); result = anv_queue_submit_add_fence_bo(submit, impl->bo.bo, true /* signal */); if (result != VK_SUCCESS) goto error; @@ -904,8 +1113,11 @@ anv_queue_submit(struct anv_queue *queue, * also reset the fence's syncobj so that they don't contain a * signaled dma-fence. 
*/ + anv_gem_syncobj_reset(device, impl->syncobj); + result = anv_queue_submit_add_syncobj(submit, device, impl->syncobj, - I915_EXEC_FENCE_SIGNAL); + I915_EXEC_FENCE_SIGNAL, + 0); if (result != VK_SUCCESS) goto error; break; @@ -921,6 +1133,7 @@ anv_queue_submit(struct anv_queue *queue, goto error; if (fence && fence->permanent.type == ANV_FENCE_TYPE_BO) { + assert(!device->has_thread_submit); /* If we have permanent BO fence, the only type of temporary possible * would be BO_WSI (because BO fences are not shareable). The Vulkan spec * also requires that the fence passed to vkQueueSubmit() be : @@ -1291,16 +1504,34 @@ VkResult anv_GetFenceStatus( } case ANV_FENCE_TYPE_SYNCOBJ: { - int ret = anv_gem_syncobj_wait(device, &impl->syncobj, 1, 0, true); - if (ret == -1) { - if (errno == ETIME) { - return VK_NOT_READY; + if (device->has_thread_submit) { + uint64_t binary_value = 0; + int ret = anv_gem_syncobj_timeline_wait(device, &impl->syncobj, + &binary_value, 1, 0, + true /* wait_all */, + false /* wait_materialize */); + if (ret == -1) { + if (errno == ETIME) { + return VK_NOT_READY; + } else { + /* We don't know the real error. */ + return anv_device_set_lost(device, "drm_syncobj_wait failed: %m"); + } } else { - /* We don't know the real error. */ - return anv_device_set_lost(device, "drm_syncobj_wait failed: %m"); + return VK_SUCCESS; } } else { - return VK_SUCCESS; + int ret = anv_gem_syncobj_wait(device, &impl->syncobj, 1, 0, false); + if (ret == -1) { + if (errno == ETIME) { + return VK_NOT_READY; + } else { + /* We don't know the real error. */ + return anv_device_set_lost(device, "drm_syncobj_wait failed: %m"); + } + } else { + return VK_SUCCESS; + } } } @@ -1334,11 +1565,11 @@ anv_wait_for_syncobj_fences(struct anv_device *device, syncobjs[i] = impl->syncobj; } + int ret = 0; /* The gem_syncobj_wait ioctl may return early due to an inherent - * limitation in the way it computes timeouts. 
Loop until we've actually + * limitation in the way it computes timeouts. Loop until we've actually * passed the timeout. */ - int ret; do { ret = anv_gem_syncobj_wait(device, syncobjs, fenceCount, abs_timeout_ns, waitAll); @@ -1496,6 +1727,8 @@ anv_wait_for_fences(struct anv_device *device, switch (impl->type) { case ANV_FENCE_TYPE_BO: + assert(!device->physical->has_syncobj_wait); + /* fall-through */ case ANV_FENCE_TYPE_WSI_BO: result = anv_wait_for_bo_fences(device, 1, &pFences[i], true, abs_timeout); @@ -1695,6 +1928,31 @@ VkResult anv_ImportFenceFdKHR( return VK_SUCCESS; } +/* The sideband payload of the DRM syncobj was incremented when the + * application called vkQueueSubmit(). Here we wait for a fence with the same + * value to materialize so that we can export it (typically as a SyncFD). + */ +static VkResult +wait_syncobj_materialize(struct anv_device *device, + uint32_t syncobj, + int *fd) +{ + if (!device->has_thread_submit) + return VK_SUCCESS; + + uint64_t binary_value = 0; + /* We might need to wait until the fence materializes before we can + * export to a sync FD when we use a thread for submission. 
+ */ + if (anv_gem_syncobj_timeline_wait(device, &syncobj, &binary_value, 1, + anv_get_absolute_timeout(5ull * NSEC_PER_SEC), + true /* wait_all */, + true /* wait_materialize */)) + return anv_device_set_lost(device, "anv_gem_syncobj_timeline_wait failed: %m"); + + return VK_SUCCESS; +} + VkResult anv_GetFenceFdKHR( VkDevice _device, const VkFenceGetFdInfoKHR* pGetFdInfo, @@ -1721,6 +1979,10 @@ VkResult anv_GetFenceFdKHR( } case VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT: { + VkResult result = wait_syncobj_materialize(device, impl->syncobj, pFd); + if (result != VK_SUCCESS) + return result; + int fd = anv_gem_syncobj_export_sync_file(device, impl->syncobj); if (fd < 0) return vk_error(VK_ERROR_TOO_MANY_OBJECTS); @@ -1794,8 +2056,24 @@ timeline_semaphore_create(struct anv_device *device, struct anv_semaphore_impl *impl, uint64_t initial_value) { - impl->type = ANV_SEMAPHORE_TYPE_TIMELINE; - anv_timeline_init(device, &impl->timeline, initial_value); + if (device->has_thread_submit) { + impl->type = ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ_TIMELINE; + impl->syncobj = anv_gem_syncobj_create(device, 0); + if (!impl->syncobj) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + if (initial_value) { + if (anv_gem_syncobj_timeline_signal(device, + &impl->syncobj, + &initial_value, 1)) { + anv_gem_syncobj_destroy(device, impl->syncobj); + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + } + } + } else { + impl->type = ANV_SEMAPHORE_TYPE_TIMELINE; + anv_timeline_init(device, &impl->timeline, initial_value); + } + return VK_SUCCESS; } @@ -1824,7 +2102,7 @@ VkResult anv_CreateSemaphore( const VkExportSemaphoreCreateInfo *export = vk_find_struct_const(pCreateInfo->pNext, EXPORT_SEMAPHORE_CREATE_INFO); - VkExternalSemaphoreHandleTypeFlags handleTypes = + VkExternalSemaphoreHandleTypeFlags handleTypes = export ? 
export->handleTypes : 0; VkResult result; @@ -1839,8 +2117,10 @@ VkResult anv_CreateSemaphore( } } else if (handleTypes & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT) { assert(handleTypes == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT); - assert(sem_type == VK_SEMAPHORE_TYPE_BINARY_KHR); - result = binary_semaphore_create(device, &semaphore->permanent, true); + if (sem_type == VK_SEMAPHORE_TYPE_BINARY_KHR) + result = binary_semaphore_create(device, &semaphore->permanent, true); + else + result = timeline_semaphore_create(device, &semaphore->permanent, timeline_value); if (result != VK_SUCCESS) { vk_free2(&device->vk.alloc, pAllocator, semaphore); return result; @@ -1897,6 +2177,7 @@ anv_semaphore_impl_cleanup(struct anv_device *device, break; case ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ: + case ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ_TIMELINE: anv_gem_syncobj_destroy(device, impl->syncobj); break; @@ -1964,8 +2245,10 @@ void anv_GetPhysicalDeviceExternalSemaphoreProperties( switch (pExternalSemaphoreInfo->handleType) { case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT: - /* Timeline semaphores are not exportable. */ - if (sem_type == VK_SEMAPHORE_TYPE_TIMELINE_KHR) + /* Timeline semaphores are not exportable, unless we have threaded + * submission. + */ + if (sem_type == VK_SEMAPHORE_TYPE_TIMELINE_KHR && !device->has_thread_submit) break; pExternalSemaphoreProperties->exportFromImportedHandleTypes = VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT; @@ -2014,7 +2297,15 @@ VkResult anv_ImportSemaphoreFdKHR( switch (pImportSemaphoreFdInfo->handleType) { case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT: if (device->physical->has_syncobj) { - new_impl.type = ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ; + /* When importing non temporarily, reuse the semaphore's existing + * type. The Linux/DRM implementation allows interchangeable use of + * binary & timeline semaphores and we have no way to differentiate + * them. 
+ */ + if (pImportSemaphoreFdInfo->flags & VK_SEMAPHORE_IMPORT_TEMPORARY_BIT) + new_impl.type = ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ; + else + new_impl.type = semaphore->permanent.type; new_impl.syncobj = anv_gem_syncobj_fd_to_handle(device, fd); if (!new_impl.syncobj) @@ -2168,9 +2459,13 @@ VkResult anv_GetSemaphoreFdKHR( } case ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ: - if (pGetFdInfo->handleType == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT) + if (pGetFdInfo->handleType == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT) { + VkResult result = wait_syncobj_materialize(device, impl->syncobj, pFd); + if (result != VK_SUCCESS) + return result; + fd = anv_gem_syncobj_export_sync_file(device, impl->syncobj); - else { + } else { assert(pGetFdInfo->handleType == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT); fd = anv_gem_syncobj_handle_to_fd(device, impl->syncobj); } @@ -2179,6 +2474,14 @@ VkResult anv_GetSemaphoreFdKHR( *pFd = fd; break; + case ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ_TIMELINE: + assert(pGetFdInfo->handleType == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT); + fd = anv_gem_syncobj_handle_to_fd(device, impl->syncobj); + if (fd < 0) + return vk_error(VK_ERROR_TOO_MANY_OBJECTS); + *pFd = fd; + break; + default: return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE); } @@ -2217,6 +2520,15 @@ VkResult anv_GetSemaphoreCounterValue( return VK_SUCCESS; } + case ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ_TIMELINE: { + int ret = anv_gem_syncobj_timeline_query(device, &impl->syncobj, pValue, 1); + + if (ret != 0) + return anv_device_set_lost(device, "unable to query timeline syncobj"); + + return VK_SUCCESS; + } + default: unreachable("Invalid semaphore type"); } @@ -2236,8 +2548,8 @@ anv_timeline_wait_locked(struct anv_device *device, .tv_nsec = abs_timeout_ns % NSEC_PER_SEC, }; - int ret = pthread_cond_timedwait(&device->queue_submit, - &device->mutex, &abstime); + UNUSED int ret = pthread_cond_timedwait(&device->queue_submit, + &device->mutex, &abstime); assert(ret != EINVAL); if 
(anv_gettime_ns() >= abs_timeout_ns && timeline->highest_pending < serial) @@ -2336,24 +2648,22 @@ VkResult anv_WaitSemaphores( uint64_t timeout) { ANV_FROM_HANDLE(anv_device, device, _device); + uint32_t *handles; + struct anv_timeline **timelines; + uint64_t *values; - if (device->no_hw) - return VK_SUCCESS; + ANV_MULTIALLOC(ma); - struct anv_timeline **timelines = - vk_alloc(&device->vk.alloc, - pWaitInfo->semaphoreCount * sizeof(*timelines), - 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); - if (!timelines) - return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + anv_multialloc_add(&ma, &values, pWaitInfo->semaphoreCount); + if (device->has_thread_submit) { + anv_multialloc_add(&ma, &handles, pWaitInfo->semaphoreCount); + } else { + anv_multialloc_add(&ma, &timelines, pWaitInfo->semaphoreCount); + } - uint64_t *values = vk_alloc(&device->vk.alloc, - pWaitInfo->semaphoreCount * sizeof(*values), - 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); - if (!values) { - vk_free(&device->vk.alloc, timelines); + if (!anv_multialloc_alloc(&ma, &device->vk.alloc, + VK_SYSTEM_ALLOCATION_SCOPE_COMMAND)) return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); - } uint32_t handle_count = 0; for (uint32_t i = 0; i < pWaitInfo->semaphoreCount; i++) { @@ -2362,24 +2672,40 @@ VkResult anv_WaitSemaphores( semaphore->temporary.type != ANV_SEMAPHORE_TYPE_NONE ? 
&semaphore->temporary : &semaphore->permanent; - assert(impl->type == ANV_SEMAPHORE_TYPE_TIMELINE); - if (pWaitInfo->pValues[i] == 0) continue; - timelines[handle_count] = &impl->timeline; + if (device->has_thread_submit) { + assert(impl->type == ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ_TIMELINE); + handles[handle_count] = impl->syncobj; + } else { + assert(impl->type == ANV_SEMAPHORE_TYPE_TIMELINE); + timelines[handle_count] = &impl->timeline; + } values[handle_count] = pWaitInfo->pValues[i]; handle_count++; } VkResult result = VK_SUCCESS; if (handle_count > 0) { - result = anv_timelines_wait(device, timelines, values, handle_count, - !(pWaitInfo->flags & VK_SEMAPHORE_WAIT_ANY_BIT_KHR), - anv_get_absolute_timeout(timeout)); + if (device->has_thread_submit) { + int ret = + anv_gem_syncobj_timeline_wait(device, + handles, values, handle_count, + anv_get_absolute_timeout(timeout), + !(pWaitInfo->flags & VK_SEMAPHORE_WAIT_ANY_BIT_KHR), + false); + if (ret != 0) + result = errno == ETIME ? VK_TIMEOUT : + anv_device_set_lost(device, "unable to wait on timeline syncobj"); + } else { + result = + anv_timelines_wait(device, timelines, values, handle_count, + !(pWaitInfo->flags & VK_SEMAPHORE_WAIT_ANY_BIT_KHR), + anv_get_absolute_timeout(timeout)); + } } - vk_free(&device->vk.alloc, timelines); vk_free(&device->vk.alloc, values); return result; @@ -2414,6 +2740,20 @@ VkResult anv_SignalSemaphore( return result; } + case ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ_TIMELINE: { + /* Timeline semaphores are created with a value of 0, so signaling on 0 + * is a waste of time. + */ + if (pSignalInfo->value == 0) + return VK_SUCCESS; + + int ret = anv_gem_syncobj_timeline_signal(device, &impl->syncobj, + &pSignalInfo->value, 1); + + return ret == 0 ? 
VK_SUCCESS : + anv_device_set_lost(device, "unable to signal timeline syncobj"); + } + default: unreachable("Invalid semaphore type"); } diff --git a/src/intel/vulkan/anv_wsi.c b/src/intel/vulkan/anv_wsi.c index 75bf4feadd3..cbe5bb02914 100644 --- a/src/intel/vulkan/anv_wsi.c +++ b/src/intel/vulkan/anv_wsi.c @@ -299,10 +299,62 @@ VkResult anv_QueuePresentKHR( } } - return wsi_common_queue_present(&queue->device->physical->wsi_device, - anv_device_to_handle(queue->device), - _queue, 0, - pPresentInfo); + if (device->has_thread_submit && + pPresentInfo->waitSemaphoreCount > 0) { + /* Make sure all of the dependency semaphores have materialized when + * using a threaded submission. + */ + uint32_t *syncobjs = vk_alloc(&device->vk.alloc, + sizeof(*syncobjs) * pPresentInfo->waitSemaphoreCount, 8, + VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); + + if (!syncobjs) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + + uint32_t wait_count = 0; + for (uint32_t i = 0; i < pPresentInfo->waitSemaphoreCount; i++) { + ANV_FROM_HANDLE(anv_semaphore, semaphore, pPresentInfo->pWaitSemaphores[i]); + struct anv_semaphore_impl *impl = + semaphore->temporary.type != ANV_SEMAPHORE_TYPE_NONE ? 
+ &semaphore->temporary : &semaphore->permanent; + + if (impl->type == ANV_SEMAPHORE_TYPE_DUMMY) + continue; + assert(impl->type == ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ); + syncobjs[wait_count++] = impl->syncobj; + } + + int ret = 0; + if (wait_count > 0) { + ret = + anv_gem_syncobj_wait(device, syncobjs, wait_count, + anv_get_absolute_timeout(INT64_MAX), + true /* wait_all */); + } + + vk_free(&device->vk.alloc, syncobjs); + + if (ret) + return vk_error(VK_ERROR_DEVICE_LOST); + } + + VkResult result = wsi_common_queue_present(&device->physical->wsi_device, + anv_device_to_handle(queue->device), + _queue, 0, + pPresentInfo); + + for (uint32_t i = 0; i < pPresentInfo->waitSemaphoreCount; i++) { + ANV_FROM_HANDLE(anv_semaphore, semaphore, pPresentInfo->pWaitSemaphores[i]); + /* From the Vulkan 1.0.53 spec: + * + * "If the import is temporary, the implementation must restore the + * semaphore to its prior permanent state after submitting the next + * semaphore wait operation." + */ + anv_semaphore_reset_temporary(queue->device, semaphore); + } + + return result; } VkResult anv_GetDeviceGroupPresentCapabilitiesKHR(