diff --git a/src/intel/vulkan/anv_queue.c b/src/intel/vulkan/anv_queue.c
index 2a8ed2eb4ed..fdf10f2c012 100644
--- a/src/intel/vulkan/anv_queue.c
+++ b/src/intel/vulkan/anv_queue.c
@@ -25,89 +25,1143 @@
  * This file implements VkQueue, VkFence, and VkSemaphore
  */
 
+#include <errno.h>
 #include <fcntl.h>
 #include <unistd.h>
-#include <sys/eventfd.h>
+
+#include "util/os_file.h"
 
 #include "anv_private.h"
 #include "vk_util.h"
 
-#include "genxml/gen7_pack.h"
+#include "genxml/gen7_pack.h"
+
+uint64_t anv_gettime_ns(void)
+{
+   struct timespec current;
+   clock_gettime(CLOCK_MONOTONIC, &current);
+   return (uint64_t)current.tv_sec * NSEC_PER_SEC + current.tv_nsec;
+}
+
+uint64_t anv_get_absolute_timeout(uint64_t timeout)
+{
+   if (timeout == 0)
+      return 0;
+   uint64_t current_time = anv_gettime_ns();
+   uint64_t max_timeout = (uint64_t) INT64_MAX - current_time;
+
+   timeout = MIN2(max_timeout, timeout);
+
+   return (current_time + timeout);
+}
+
+static int64_t anv_get_relative_timeout(uint64_t abs_timeout)
+{
+   uint64_t now = anv_gettime_ns();
+
+   /* We don't want negative timeouts.
+    *
+    * DRM_IOCTL_I915_GEM_WAIT uses a signed 64 bit timeout and is
+    * supposed to block indefinitely for timeouts < 0.  Unfortunately,
+    * this was broken for a couple of kernel releases.  Since there's
+    * no way to know whether or not the kernel we're using is one of
+    * the broken ones, the best we can do is to clamp the timeout to
+    * INT64_MAX.  This limits the maximum timeout from 584 years to
+    * 292 years - likely not a big deal.
+    */
+   if (abs_timeout < now)
+      return 0;
+
+   uint64_t rel_timeout = abs_timeout - now;
+   if (rel_timeout > (uint64_t) INT64_MAX)
+      rel_timeout = INT64_MAX;
+
+   return rel_timeout;
+}
+
+static struct anv_semaphore *anv_semaphore_ref(struct anv_semaphore *semaphore);
+static void anv_semaphore_unref(struct anv_device *device, struct anv_semaphore *semaphore);
+static void anv_semaphore_impl_cleanup(struct anv_device *device,
+                                       struct anv_semaphore_impl *impl);
+
+static void
+anv_queue_submit_free(struct anv_device *device,
+                      struct anv_queue_submit *submit)
+{
+   const VkAllocationCallbacks *alloc = submit->alloc;
+
+   for (uint32_t i = 0; i < submit->temporary_semaphore_count; i++)
+      anv_semaphore_impl_cleanup(device, &submit->temporary_semaphores[i]);
+   for (uint32_t i = 0; i < submit->sync_fd_semaphore_count; i++)
+      anv_semaphore_unref(device, submit->sync_fd_semaphores[i]);
+   /* Execbuf does not consume the in_fence.  It's our job to close it.
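+    * With a submission thread, sync FDs never reach this path; hence the
+    * has_thread_submit asserts below.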
+    */
+   if (submit->in_fence != -1) {
+      assert(!device->has_thread_submit);
+      close(submit->in_fence);
+   }
+   if (submit->out_fence != -1) {
+      assert(!device->has_thread_submit);
+      close(submit->out_fence);
+   }
+   vk_free(alloc, submit->fences);
+   vk_free(alloc, submit->fence_values);
+   vk_free(alloc, submit->temporary_semaphores);
+   vk_free(alloc, submit->wait_timelines);
+   vk_free(alloc, submit->wait_timeline_values);
+   vk_free(alloc, submit->signal_timelines);
+   vk_free(alloc, submit->signal_timeline_values);
+   vk_free(alloc, submit->fence_bos);
+   vk_free(alloc, submit);
+}
+
+static bool
+anv_queue_submit_ready_locked(struct anv_queue_submit *submit)
+{
+   for (uint32_t i = 0; i < submit->wait_timeline_count; i++) {
+      if (submit->wait_timeline_values[i] > submit->wait_timelines[i]->highest_pending)
+         return false;
+   }
+
+   return true;
+}
+
+static VkResult
+anv_timeline_init(struct anv_device *device,
+                  struct anv_timeline *timeline,
+                  uint64_t initial_value)
+{
+   timeline->highest_past =
+      timeline->highest_pending = initial_value;
+   list_inithead(&timeline->points);
+   list_inithead(&timeline->free_points);
+
+   return VK_SUCCESS;
+}
+
+static void
+anv_timeline_finish(struct anv_device *device,
+                    struct anv_timeline *timeline)
+{
+   list_for_each_entry_safe(struct anv_timeline_point, point,
+                            &timeline->free_points, link) {
+      list_del(&point->link);
+      anv_device_release_bo(device, point->bo);
+      vk_free(&device->vk.alloc, point);
+   }
+   list_for_each_entry_safe(struct anv_timeline_point, point,
+                            &timeline->points, link) {
+      list_del(&point->link);
+      anv_device_release_bo(device, point->bo);
+      vk_free(&device->vk.alloc, point);
+   }
+}
+
+static VkResult
+anv_timeline_add_point_locked(struct anv_device *device,
+                              struct anv_timeline *timeline,
+                              uint64_t value,
+                              struct anv_timeline_point **point)
+{
+   VkResult result = VK_SUCCESS;
+
+   if (list_is_empty(&timeline->free_points)) {
+      *point =
+         vk_zalloc(&device->vk.alloc, sizeof(**point),
+                   8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+      if (!(*point))
+         result = vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+      if (result == VK_SUCCESS) {
+         result = anv_device_alloc_bo(device, 4096,
+                                      ANV_BO_ALLOC_EXTERNAL |
+                                      ANV_BO_ALLOC_IMPLICIT_SYNC,
+                                      0 /* explicit_address */,
+                                      &(*point)->bo);
+         if (result != VK_SUCCESS)
+            vk_free(&device->vk.alloc, *point);
+      }
+   } else {
+      *point = list_first_entry(&timeline->free_points,
+                                struct anv_timeline_point, link);
+      list_del(&(*point)->link);
+   }
+
+   if (result == VK_SUCCESS) {
+      (*point)->serial = value;
+      list_addtail(&(*point)->link, &timeline->points);
+   }
+
+   return result;
+}
+
+static VkResult
+anv_timeline_gc_locked(struct anv_device *device,
+                       struct anv_timeline *timeline)
+{
+   list_for_each_entry_safe(struct anv_timeline_point, point,
+                            &timeline->points, link) {
+      /* timeline->highest_pending is only incremented once submission has
+       * happened.  If this point has a greater serial, it means the point
+       * hasn't been submitted yet.
+       */
+      if (point->serial > timeline->highest_pending)
+         return VK_SUCCESS;
+
+      /* If someone is waiting on this time point, consider it busy and don't
+       * try to recycle it.  There's a slim possibility that it's no longer
+       * busy by the time we look at it but we would be recycling it out from
+       * under a waiter and that can lead to weird races.
+       *
+       * We walk the list in-order so if this time point is still busy so is
+       * every following time point.
+       */
+      assert(point->waiting >= 0);
+      if (point->waiting)
+         return VK_SUCCESS;
+
+      /* Garbage collect any signaled point.
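+       * (anv_device_bo_busy() below returns VK_NOT_READY while the GPU
+       * still holds the BO and VK_SUCCESS once it is idle.)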
+       */
+      VkResult result = anv_device_bo_busy(device, point->bo);
+      if (result == VK_NOT_READY) {
+         /* We walk the list in-order so if this time point is still busy so
+          * is every following time point.
+          */
+         return VK_SUCCESS;
+      } else if (result != VK_SUCCESS) {
+         return result;
+      }
+
+      assert(timeline->highest_past < point->serial);
+      timeline->highest_past = point->serial;
+
+      list_del(&point->link);
+      list_add(&point->link, &timeline->free_points);
+   }
+
+   return VK_SUCCESS;
+}
+
+static VkResult anv_queue_submit_add_fence_bo(struct anv_queue_submit *submit,
+                                              struct anv_bo *bo,
+                                              bool signal);
+
+static VkResult
+anv_queue_submit_timeline_locked(struct anv_queue *queue,
+                                 struct anv_queue_submit *submit)
+{
+   VkResult result;
+
+   for (uint32_t i = 0; i < submit->wait_timeline_count; i++) {
+      struct anv_timeline *timeline = submit->wait_timelines[i];
+      uint64_t wait_value = submit->wait_timeline_values[i];
+
+      if (timeline->highest_past >= wait_value)
+         continue;
+
+      list_for_each_entry(struct anv_timeline_point, point, &timeline->points, link) {
+         if (point->serial < wait_value)
+            continue;
+         result = anv_queue_submit_add_fence_bo(submit, point->bo, false);
+         if (result != VK_SUCCESS)
+            return result;
+         break;
+      }
+   }
+   for (uint32_t i = 0; i < submit->signal_timeline_count; i++) {
+      struct anv_timeline *timeline = submit->signal_timelines[i];
+      uint64_t signal_value = submit->signal_timeline_values[i];
+      struct anv_timeline_point *point;
+
+      result = anv_timeline_add_point_locked(queue->device, timeline,
+                                             signal_value, &point);
+      if (result != VK_SUCCESS)
+         return result;
+
+      result = anv_queue_submit_add_fence_bo(submit, point->bo, true);
+      if (result != VK_SUCCESS)
+         return result;
+   }
+
+   result = anv_queue_execbuf_locked(queue, submit);
+
+   if (result == VK_SUCCESS) {
+      /* Update the pending values in the timeline objects. */
+      for (uint32_t i = 0; i < submit->signal_timeline_count; i++) {
+         struct anv_timeline *timeline = submit->signal_timelines[i];
+         uint64_t signal_value = submit->signal_timeline_values[i];
+
+         assert(signal_value > timeline->highest_pending);
+         timeline->highest_pending = signal_value;
+      }
+
+      /* Update signaled semaphores backed by syncfd. */
+      for (uint32_t i = 0; i < submit->sync_fd_semaphore_count; i++) {
+         struct anv_semaphore *semaphore = submit->sync_fd_semaphores[i];
+         /* Out fences can't have temporary state because that would imply
+          * that we imported a sync file and are trying to signal it.
+          */
+         assert(semaphore->temporary.type == ANV_SEMAPHORE_TYPE_NONE);
+         struct anv_semaphore_impl *impl = &semaphore->permanent;
+
+         assert(impl->type == ANV_SEMAPHORE_TYPE_SYNC_FILE);
+         impl->fd = os_dupfd_cloexec(submit->out_fence);
+      }
+   } else {
+      /* Unblock any waiters by signaling the points; the application will
+       * get a device-lost error code.
+       */
+      for (uint32_t i = 0; i < submit->signal_timeline_count; i++) {
+         struct anv_timeline *timeline = submit->signal_timelines[i];
+         uint64_t signal_value = submit->signal_timeline_values[i];
+
+         assert(signal_value > timeline->highest_pending);
+         timeline->highest_past = timeline->highest_pending = signal_value;
+      }
+   }
+
+   return result;
+}
+
+static VkResult
+anv_queue_submit_deferred_locked(struct anv_queue *queue, uint32_t *advance)
+{
+   VkResult result = VK_SUCCESS;
+
+   /* Go through all the queued submissions and submit them until we find
+    * one that's waiting on a point that hasn't materialized yet.
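+    *
+    * Submissions are processed in order, so stopping at the first unready
+    * entry preserves the queue's submission ordering.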
+    */
+   list_for_each_entry_safe(struct anv_queue_submit, submit,
+                            &queue->queued_submits, link) {
+      if (!anv_queue_submit_ready_locked(submit))
+         break;
+
+      (*advance)++;
+      list_del(&submit->link);
+
+      result = anv_queue_submit_timeline_locked(queue, submit);
+
+      anv_queue_submit_free(queue->device, submit);
+
+      if (result != VK_SUCCESS)
+         break;
+   }
+
+   return result;
+}
+
+static VkResult
+anv_device_submit_deferred_locked(struct anv_device *device)
+{
+   uint32_t advance = 0;
+   return anv_queue_submit_deferred_locked(&device->queue, &advance);
+}
+
+static void
+anv_queue_submit_signal_fences(struct anv_device *device,
+                               struct anv_queue_submit *submit)
+{
+   for (uint32_t i = 0; i < submit->fence_count; i++) {
+      if (submit->fences[i].flags & I915_EXEC_FENCE_SIGNAL) {
+         anv_gem_syncobj_timeline_signal(device, &submit->fences[i].handle,
+                                         &submit->fence_values[i], 1);
+      }
+   }
+}
+
+static void *
+anv_queue_task(void *_queue)
+{
+   struct anv_queue *queue = _queue;
+
+   pthread_mutex_lock(&queue->mutex);
+
+   while (!queue->quit) {
+      while (!list_is_empty(&queue->queued_submits)) {
+         struct anv_queue_submit *submit =
+            list_first_entry(&queue->queued_submits, struct anv_queue_submit, link);
+         list_del(&submit->link);
+
+         pthread_mutex_unlock(&queue->mutex);
+
+         VkResult result = VK_ERROR_DEVICE_LOST;
+
+         /* Wait for timeline points to materialize before submitting.  We
+          * need to do this because we're using threads to do the submit to
+          * i915.  We could end up in a situation where the application
+          * submits to 2 queues with the first submit creating the dma-fence
+          * for the second.  But because the scheduling of the submission
+          * threads might wake up the second queue thread first, this would
+          * make that execbuf fail because the dma-fence it depends on hasn't
+          * materialized yet.
+          */
+         if (!queue->lost && submit->wait_timeline_count > 0) {
+            int ret = queue->device->no_hw ? 0 :
+               anv_gem_syncobj_timeline_wait(
+                  queue->device, submit->wait_timeline_syncobjs,
+                  submit->wait_timeline_values, submit->wait_timeline_count,
+                  anv_get_absolute_timeout(UINT64_MAX) /* wait forever */,
+                  true /* wait for all */, true /* wait for materialize */);
+            if (ret) {
+               result = anv_queue_set_lost(queue, "timeline timeout: %s",
+                                           strerror(errno));
+            }
+         }
+
+         /* Now submit */
+         if (!queue->lost) {
+            pthread_mutex_lock(&queue->device->mutex);
+            result = anv_queue_execbuf_locked(queue, submit);
+            pthread_mutex_unlock(&queue->device->mutex);
+         }
+
+         for (uint32_t i = 0; i < submit->sync_fd_semaphore_count; i++) {
+            struct anv_semaphore *semaphore = submit->sync_fd_semaphores[i];
+            /* Out fences can't have temporary state because that would imply
+             * that we imported a sync file and are trying to signal it.
+             */
+            assert(semaphore->temporary.type == ANV_SEMAPHORE_TYPE_NONE);
+            struct anv_semaphore_impl *impl = &semaphore->permanent;
+
+            assert(impl->type == ANV_SEMAPHORE_TYPE_SYNC_FILE);
+            impl->fd = dup(submit->out_fence);
+         }
+
+         if (result != VK_SUCCESS) {
+            /* vkQueueSubmit or some other entry point will report the
+             * DEVICE_LOST error at some point, but until we have emptied our
+             * list of execbufs we need to wake up all the potential waiters
+             * until one of them spots the error.
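+             * Signaling the fences here lets blocked waiters return instead
+             * of hanging on work that will never execute.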
+ */ + anv_queue_submit_signal_fences(queue->device, submit); + } + + anv_queue_submit_free(queue->device, submit); + + pthread_mutex_lock(&queue->mutex); + } + + if (!queue->quit) + pthread_cond_wait(&queue->cond, &queue->mutex); + } + + pthread_mutex_unlock(&queue->mutex); + + return NULL; +} + +static VkResult +_anv_queue_submit(struct anv_queue *queue, struct anv_queue_submit **_submit, + bool flush_queue) +{ + struct anv_queue_submit *submit = *_submit; + + /* Wait before signal behavior means we might keep alive the + * anv_queue_submit object a bit longer, so transfer the ownership to the + * anv_queue. + */ + *_submit = NULL; + if (queue->device->has_thread_submit) { + pthread_mutex_lock(&queue->mutex); + pthread_cond_broadcast(&queue->cond); + list_addtail(&submit->link, &queue->queued_submits); + pthread_mutex_unlock(&queue->mutex); + return VK_SUCCESS; + } else { + pthread_mutex_lock(&queue->device->mutex); + list_addtail(&submit->link, &queue->queued_submits); + VkResult result = anv_device_submit_deferred_locked(queue->device); + if (flush_queue) { + while (result == VK_SUCCESS && !list_is_empty(&queue->queued_submits)) { + int ret = pthread_cond_wait(&queue->device->queue_submit, + &queue->device->mutex); + if (ret != 0) { + result = anv_device_set_lost(queue->device, "wait timeout"); + break; + } + + result = anv_device_submit_deferred_locked(queue->device); + } + } + pthread_mutex_unlock(&queue->device->mutex); + return result; + } +} + +VkResult +anv_queue_init(struct anv_device *device, struct anv_queue *queue) +{ + VkResult result; + + queue->device = device; + queue->flags = 0; + queue->lost = false; + queue->quit = false; + + list_inithead(&queue->queued_submits); + + /* We only need those additional thread/mutex when using a thread for + * submission. 
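+ * A rough sketch of the two paths (names as in this file):
+ *
+ *    has_thread_submit:  queue->mutex/cond protect queued_submits and
+ *                        anv_queue_task() drains them;
+ *    otherwise:          device->mutex protects them and
+ *                        anv_device_submit_deferred_locked() drains them
+ *                        inline (see _anv_queue_submit() above).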
+ */ + if (device->has_thread_submit) { + if (pthread_mutex_init(&queue->mutex, NULL) != 0) + return vk_error(VK_ERROR_INITIALIZATION_FAILED); + + if (pthread_cond_init(&queue->cond, NULL) != 0) { + result = vk_error(VK_ERROR_INITIALIZATION_FAILED); + goto fail_mutex; + } + if (pthread_create(&queue->thread, NULL, anv_queue_task, queue)) { + result = vk_error(VK_ERROR_INITIALIZATION_FAILED); + goto fail_cond; + } + } + + vk_object_base_init(&device->vk, &queue->base, VK_OBJECT_TYPE_QUEUE); + + return VK_SUCCESS; + + fail_cond: + pthread_cond_destroy(&queue->cond); + fail_mutex: + pthread_mutex_destroy(&queue->mutex); + + return result; +} + +void +anv_queue_finish(struct anv_queue *queue) +{ + vk_object_base_finish(&queue->base); + + if (!queue->device->has_thread_submit) + return; + + pthread_mutex_lock(&queue->mutex); + pthread_cond_broadcast(&queue->cond); + queue->quit = true; + pthread_mutex_unlock(&queue->mutex); + + void *ret; + pthread_join(queue->thread, &ret); + + pthread_cond_destroy(&queue->cond); + pthread_mutex_destroy(&queue->mutex); +} + +static VkResult +anv_queue_submit_add_fence_bo(struct anv_queue_submit *submit, + struct anv_bo *bo, + bool signal) +{ + if (submit->fence_bo_count >= submit->fence_bo_array_length) { + uint32_t new_len = MAX2(submit->fence_bo_array_length * 2, 64); + uintptr_t *new_fence_bos = + vk_realloc(submit->alloc, + submit->fence_bos, new_len * sizeof(*submit->fence_bos), + 8, submit->alloc_scope); + if (new_fence_bos == NULL) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + + submit->fence_bos = new_fence_bos; + submit->fence_bo_array_length = new_len; + } + + /* Take advantage that anv_bo are allocated at 8 byte alignement so we can + * use the lowest bit to store whether this is a BO we need to signal. + */ + submit->fence_bos[submit->fence_bo_count++] = anv_pack_ptr(bo, 1, signal); + + return VK_SUCCESS; +} + +static VkResult +anv_queue_submit_add_syncobj(struct anv_queue_submit* submit, + struct anv_device *device, + uint32_t handle, uint32_t flags, + uint64_t value) +{ + assert(flags != 0); + + if (device->has_thread_submit && (flags & I915_EXEC_FENCE_WAIT)) { + if (submit->wait_timeline_count >= submit->wait_timeline_array_length) { + uint32_t new_len = MAX2(submit->wait_timeline_array_length * 2, 64); + + uint32_t *new_wait_timeline_syncobjs = + vk_realloc(submit->alloc, + submit->wait_timeline_syncobjs, + new_len * sizeof(*submit->wait_timeline_syncobjs), + 8, submit->alloc_scope); + if (new_wait_timeline_syncobjs == NULL) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + + submit->wait_timeline_syncobjs = new_wait_timeline_syncobjs; + + uint64_t *new_wait_timeline_values = + vk_realloc(submit->alloc, + submit->wait_timeline_values, new_len * sizeof(*submit->wait_timeline_values), + 8, submit->alloc_scope); + if (new_wait_timeline_values == NULL) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + + submit->wait_timeline_values = new_wait_timeline_values; + submit->wait_timeline_array_length = new_len; + } + + submit->wait_timeline_syncobjs[submit->wait_timeline_count] = handle; + submit->wait_timeline_values[submit->wait_timeline_count] = value; + + submit->wait_timeline_count++; + } + + if (submit->fence_count >= submit->fence_array_length) { + uint32_t new_len = MAX2(submit->fence_array_length * 2, 64); + struct drm_i915_gem_exec_fence *new_fences = + vk_realloc(submit->alloc, + submit->fences, new_len * sizeof(*submit->fences), + 8, submit->alloc_scope); + if (new_fences == NULL) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + + 
submit->fences = new_fences; + + uint64_t *new_fence_values = + vk_realloc(submit->alloc, + submit->fence_values, new_len * sizeof(*submit->fence_values), + 8, submit->alloc_scope); + if (new_fence_values == NULL) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + + submit->fence_values = new_fence_values; + submit->fence_array_length = new_len; + } + + submit->fences[submit->fence_count] = (struct drm_i915_gem_exec_fence) { + .handle = handle, + .flags = flags, + }; + submit->fence_values[submit->fence_count] = value; + submit->fence_count++; + + return VK_SUCCESS; +} + +static VkResult +anv_queue_submit_add_sync_fd_fence(struct anv_queue_submit *submit, + struct anv_semaphore *semaphore) +{ + if (submit->sync_fd_semaphore_count >= submit->sync_fd_semaphore_array_length) { + uint32_t new_len = MAX2(submit->sync_fd_semaphore_array_length * 2, 64); + struct anv_semaphore **new_semaphores = + vk_realloc(submit->alloc, submit->sync_fd_semaphores, + new_len * sizeof(*submit->sync_fd_semaphores), 8, + submit->alloc_scope); + if (new_semaphores == NULL) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + + submit->sync_fd_semaphores = new_semaphores; + } + + submit->sync_fd_semaphores[submit->sync_fd_semaphore_count++] = + anv_semaphore_ref(semaphore); + submit->need_out_fence = true; + + return VK_SUCCESS; +} + +static VkResult +anv_queue_submit_add_timeline_wait(struct anv_queue_submit* submit, + struct anv_device *device, + struct anv_timeline *timeline, + uint64_t value) +{ + if (submit->wait_timeline_count >= submit->wait_timeline_array_length) { + uint32_t new_len = MAX2(submit->wait_timeline_array_length * 2, 64); + struct anv_timeline **new_wait_timelines = + vk_realloc(submit->alloc, + submit->wait_timelines, new_len * sizeof(*submit->wait_timelines), + 8, submit->alloc_scope); + if (new_wait_timelines == NULL) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + + submit->wait_timelines = new_wait_timelines; + + uint64_t *new_wait_timeline_values = + vk_realloc(submit->alloc, + submit->wait_timeline_values, new_len * sizeof(*submit->wait_timeline_values), + 8, submit->alloc_scope); + if (new_wait_timeline_values == NULL) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + + submit->wait_timeline_values = new_wait_timeline_values; + + submit->wait_timeline_array_length = new_len; + } + + submit->wait_timelines[submit->wait_timeline_count] = timeline; + submit->wait_timeline_values[submit->wait_timeline_count] = value; + + submit->wait_timeline_count++; + + return VK_SUCCESS; +} + +static VkResult +anv_queue_submit_add_timeline_signal(struct anv_queue_submit* submit, + struct anv_device *device, + struct anv_timeline *timeline, + uint64_t value) +{ + assert(timeline->highest_pending < value); + + if (submit->signal_timeline_count >= submit->signal_timeline_array_length) { + uint32_t new_len = MAX2(submit->signal_timeline_array_length * 2, 64); + struct anv_timeline **new_signal_timelines = + vk_realloc(submit->alloc, + submit->signal_timelines, new_len * sizeof(*submit->signal_timelines), + 8, submit->alloc_scope); + if (new_signal_timelines == NULL) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + + submit->signal_timelines = new_signal_timelines; + + uint64_t *new_signal_timeline_values = + vk_realloc(submit->alloc, + submit->signal_timeline_values, new_len * sizeof(*submit->signal_timeline_values), + 8, submit->alloc_scope); + if (new_signal_timeline_values == NULL) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + + submit->signal_timeline_values = new_signal_timeline_values; + + 
submit->signal_timeline_array_length = new_len; + } + + submit->signal_timelines[submit->signal_timeline_count] = timeline; + submit->signal_timeline_values[submit->signal_timeline_count] = value; + + submit->signal_timeline_count++; + + return VK_SUCCESS; +} + +static struct anv_queue_submit * +anv_queue_submit_alloc(struct anv_device *device, int perf_query_pass) +{ + const VkAllocationCallbacks *alloc = &device->vk.alloc; + VkSystemAllocationScope alloc_scope = VK_SYSTEM_ALLOCATION_SCOPE_DEVICE; + + struct anv_queue_submit *submit = vk_zalloc(alloc, sizeof(*submit), 8, alloc_scope); + if (!submit) + return NULL; + + submit->alloc = alloc; + submit->alloc_scope = alloc_scope; + submit->in_fence = -1; + submit->out_fence = -1; + submit->perf_query_pass = perf_query_pass; + + return submit; +} VkResult -anv_device_execbuf(struct anv_device *device, - struct drm_i915_gem_execbuffer2 *execbuf, - struct anv_bo **execbuf_bos) +anv_queue_submit_simple_batch(struct anv_queue *queue, + struct anv_batch *batch) +{ + if (queue->device->no_hw) + return VK_SUCCESS; + + struct anv_device *device = queue->device; + struct anv_queue_submit *submit = anv_queue_submit_alloc(device, -1); + if (!submit) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + + bool has_syncobj_wait = device->physical->has_syncobj_wait; + VkResult result; + uint32_t syncobj; + struct anv_bo *batch_bo, *sync_bo; + + if (has_syncobj_wait) { + syncobj = anv_gem_syncobj_create(device, 0); + if (!syncobj) { + result = vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY); + goto err_free_submit; + } + + result = anv_queue_submit_add_syncobj(submit, device, syncobj, + I915_EXEC_FENCE_SIGNAL, 0); + } else { + result = anv_device_alloc_bo(device, 4096, + ANV_BO_ALLOC_EXTERNAL | + ANV_BO_ALLOC_IMPLICIT_SYNC, + 0 /* explicit_address */, + &sync_bo); + if (result != VK_SUCCESS) + goto err_free_submit; + + result = anv_queue_submit_add_fence_bo(submit, sync_bo, true /* signal */); + } + + if (result != VK_SUCCESS) + goto err_destroy_sync_primitive; + + if (batch) { + uint32_t size = align_u32(batch->next - batch->start, 8); + result = anv_bo_pool_alloc(&device->batch_bo_pool, size, &batch_bo); + if (result != VK_SUCCESS) + goto err_destroy_sync_primitive; + + memcpy(batch_bo->map, batch->start, size); + if (!device->info.has_llc) + gen_flush_range(batch_bo->map, size); + + submit->simple_bo = batch_bo; + submit->simple_bo_size = size; + } + + result = _anv_queue_submit(queue, &submit, true); + + if (result == VK_SUCCESS) { + if (has_syncobj_wait) { + if (anv_gem_syncobj_wait(device, &syncobj, 1, + anv_get_absolute_timeout(INT64_MAX), true)) + result = anv_device_set_lost(device, "anv_gem_syncobj_wait failed: %m"); + anv_gem_syncobj_destroy(device, syncobj); + } else { + result = anv_device_wait(device, sync_bo, + anv_get_relative_timeout(INT64_MAX)); + anv_device_release_bo(device, sync_bo); + } + } + + if (batch) + anv_bo_pool_free(&device->batch_bo_pool, batch_bo); + + if (submit) + anv_queue_submit_free(device, submit); + + return result; + + err_destroy_sync_primitive: + if (has_syncobj_wait) + anv_gem_syncobj_destroy(device, syncobj); + else + anv_device_release_bo(device, sync_bo); + err_free_submit: + if (submit) + anv_queue_submit_free(device, submit); + + return result; +} + +/* Transfer ownership of temporary semaphores from the VkSemaphore object to + * the anv_queue_submit object. Those temporary semaphores are then freed in + * anv_queue_submit_free() once the driver is finished with them. 
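+ * This implements the Vulkan rule that a temporary import only applies to
+ * the next wait; the spec language is quoted in
+ * maybe_transfer_temporary_semaphore() below.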
+ */ +static VkResult +maybe_transfer_temporary_semaphore(struct anv_queue_submit *submit, + struct anv_semaphore *semaphore, + struct anv_semaphore_impl **out_impl) { - int ret = device->no_hw ? 0 : anv_gem_execbuffer(device, execbuf); - if (ret != 0) { - /* We don't know the real error. */ - return anv_device_set_lost(device, "execbuf2 failed: %m"); + struct anv_semaphore_impl *impl = &semaphore->temporary; + + if (impl->type == ANV_SEMAPHORE_TYPE_NONE) { + *out_impl = &semaphore->permanent; + return VK_SUCCESS; } - struct drm_i915_gem_exec_object2 *objects = - (void *)(uintptr_t)execbuf->buffers_ptr; - for (uint32_t k = 0; k < execbuf->buffer_count; k++) { - if (execbuf_bos[k]->flags & EXEC_OBJECT_PINNED) - assert(execbuf_bos[k]->offset == objects[k].offset); - execbuf_bos[k]->offset = objects[k].offset; + /* BO backed timeline semaphores cannot be temporary. */ + assert(impl->type != ANV_SEMAPHORE_TYPE_TIMELINE); + + /* + * There is a requirement to reset semaphore to their permanent state after + * submission. From the Vulkan 1.0.53 spec: + * + * "If the import is temporary, the implementation must restore the + * semaphore to its prior permanent state after submitting the next + * semaphore wait operation." + * + * In the case we defer the actual submission to a thread because of the + * wait-before-submit behavior required for timeline semaphores, we need to + * make copies of the temporary syncobj to ensure they stay alive until we + * do the actual execbuffer ioctl. + */ + if (submit->temporary_semaphore_count >= submit->temporary_semaphore_array_length) { + uint32_t new_len = MAX2(submit->temporary_semaphore_array_length * 2, 8); + /* Make sure that if the realloc fails, we still have the old semaphore + * array around to properly clean things up on failure. + */ + struct anv_semaphore_impl *new_array = + vk_realloc(submit->alloc, + submit->temporary_semaphores, + new_len * sizeof(*submit->temporary_semaphores), + 8, submit->alloc_scope); + if (new_array == NULL) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + + submit->temporary_semaphores = new_array; + submit->temporary_semaphore_array_length = new_len; } + /* Copy anv_semaphore_impl into anv_queue_submit. 
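+ * Ownership of the underlying syncobj/fd moves with the copy; the source
+ * impl is cleared below so it cannot be freed twice.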
*/ + submit->temporary_semaphores[submit->temporary_semaphore_count++] = *impl; + *out_impl = &submit->temporary_semaphores[submit->temporary_semaphore_count - 1]; + + /* Clear the incoming semaphore */ + impl->type = ANV_SEMAPHORE_TYPE_NONE; + return VK_SUCCESS; } -VkResult -anv_device_submit_simple_batch(struct anv_device *device, - struct anv_batch *batch) +static VkResult +anv_queue_submit(struct anv_queue *queue, + struct anv_cmd_buffer *cmd_buffer, + const VkSemaphore *in_semaphores, + const uint64_t *in_values, + uint32_t num_in_semaphores, + const VkSemaphore *out_semaphores, + const uint64_t *out_values, + uint32_t num_out_semaphores, + struct anv_bo *wsi_signal_bo, + VkFence _fence, + int perf_query_pass) { - struct drm_i915_gem_execbuffer2 execbuf; - struct drm_i915_gem_exec_object2 exec2_objects[1]; - struct anv_bo bo, *exec_bos[1]; + ANV_FROM_HANDLE(anv_fence, fence, _fence); + struct anv_device *device = queue->device; + UNUSED struct anv_physical_device *pdevice = device->physical; + struct anv_queue_submit *submit = anv_queue_submit_alloc(device, perf_query_pass); + if (!submit) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + + submit->cmd_buffer = cmd_buffer; + VkResult result = VK_SUCCESS; - uint32_t size; + for (uint32_t i = 0; i < num_in_semaphores; i++) { + ANV_FROM_HANDLE(anv_semaphore, semaphore, in_semaphores[i]); + struct anv_semaphore_impl *impl; - /* Kernel driver requires 8 byte aligned batch length */ - size = align_u32(batch->next - batch->start, 8); - result = anv_bo_pool_alloc(&device->batch_bo_pool, &bo, size); - if (result != VK_SUCCESS) - return result; + result = maybe_transfer_temporary_semaphore(submit, semaphore, &impl); + if (result != VK_SUCCESS) + goto error; + + switch (impl->type) { + case ANV_SEMAPHORE_TYPE_BO: + assert(!pdevice->has_syncobj); + result = anv_queue_submit_add_fence_bo(submit, impl->bo, false /* signal */); + if (result != VK_SUCCESS) + goto error; + break; + + case ANV_SEMAPHORE_TYPE_WSI_BO: + /* When using a window-system buffer as a semaphore, always enable + * EXEC_OBJECT_WRITE. This gives us a WaR hazard with the display or + * compositor's read of the buffer and enforces that we don't start + * rendering until they are finished. This is exactly the + * synchronization we want with vkAcquireNextImage. + */ + result = anv_queue_submit_add_fence_bo(submit, impl->bo, true /* signal */); + if (result != VK_SUCCESS) + goto error; + break; + + case ANV_SEMAPHORE_TYPE_SYNC_FILE: + assert(!pdevice->has_syncobj); + if (submit->in_fence == -1) { + submit->in_fence = impl->fd; + if (submit->in_fence == -1) { + result = vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE); + goto error; + } + impl->fd = -1; + } else { + int merge = anv_gem_sync_file_merge(device, submit->in_fence, impl->fd); + if (merge == -1) { + result = vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE); + goto error; + } + close(impl->fd); + close(submit->in_fence); + impl->fd = -1; + submit->in_fence = merge; + } + break; + + case ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ: { + result = anv_queue_submit_add_syncobj(submit, device, + impl->syncobj, + I915_EXEC_FENCE_WAIT, + 0); + if (result != VK_SUCCESS) + goto error; + break; + } + + case ANV_SEMAPHORE_TYPE_TIMELINE: + result = anv_queue_submit_add_timeline_wait(submit, device, + &impl->timeline, + in_values ? in_values[i] : 0); + if (result != VK_SUCCESS) + goto error; + break; + + case ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ_TIMELINE: + result = anv_queue_submit_add_syncobj(submit, device, + impl->syncobj, + I915_EXEC_FENCE_WAIT, + in_values ? 
in_values[i] : 0); + if (result != VK_SUCCESS) + goto error; + break; + + default: + break; + } + } + + for (uint32_t i = 0; i < num_out_semaphores; i++) { + ANV_FROM_HANDLE(anv_semaphore, semaphore, out_semaphores[i]); + + /* Under most circumstances, out fences won't be temporary. However, + * the spec does allow it for opaque_fd. From the Vulkan 1.0.53 spec: + * + * "If the import is temporary, the implementation must restore the + * semaphore to its prior permanent state after submitting the next + * semaphore wait operation." + * + * The spec says nothing whatsoever about signal operations on + * temporarily imported semaphores so it appears they are allowed. + * There are also CTS tests that require this to work. + */ + struct anv_semaphore_impl *impl = + semaphore->temporary.type != ANV_SEMAPHORE_TYPE_NONE ? + &semaphore->temporary : &semaphore->permanent; + + switch (impl->type) { + case ANV_SEMAPHORE_TYPE_BO: + assert(!pdevice->has_syncobj); + result = anv_queue_submit_add_fence_bo(submit, impl->bo, true /* signal */); + if (result != VK_SUCCESS) + goto error; + break; + + case ANV_SEMAPHORE_TYPE_SYNC_FILE: + assert(!pdevice->has_syncobj); + result = anv_queue_submit_add_sync_fd_fence(submit, semaphore); + if (result != VK_SUCCESS) + goto error; + break; + + case ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ: { + result = anv_queue_submit_add_syncobj(submit, device, impl->syncobj, + I915_EXEC_FENCE_SIGNAL, + 0); + if (result != VK_SUCCESS) + goto error; + break; + } + + case ANV_SEMAPHORE_TYPE_TIMELINE: + result = anv_queue_submit_add_timeline_signal(submit, device, + &impl->timeline, + out_values ? out_values[i] : 0); + if (result != VK_SUCCESS) + goto error; + break; + + case ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ_TIMELINE: + result = anv_queue_submit_add_syncobj(submit, device, impl->syncobj, + I915_EXEC_FENCE_SIGNAL, + out_values ? out_values[i] : 0); + if (result != VK_SUCCESS) + goto error; + break; + + default: + break; + } + } + + if (wsi_signal_bo) { + result = anv_queue_submit_add_fence_bo(submit, wsi_signal_bo, true /* signal */); + if (result != VK_SUCCESS) + goto error; + } + + if (fence) { + /* Under most circumstances, out fences won't be temporary. However, + * the spec does allow it for opaque_fd. From the Vulkan 1.0.53 spec: + * + * "If the import is temporary, the implementation must restore the + * semaphore to its prior permanent state after submitting the next + * semaphore wait operation." + * + * The spec says nothing whatsoever about signal operations on + * temporarily imported semaphores so it appears they are allowed. + * There are also CTS tests that require this to work. + */ + struct anv_fence_impl *impl = + fence->temporary.type != ANV_FENCE_TYPE_NONE ? + &fence->temporary : &fence->permanent; + + switch (impl->type) { + case ANV_FENCE_TYPE_BO: + assert(!device->has_thread_submit); + result = anv_queue_submit_add_fence_bo(submit, impl->bo.bo, true /* signal */); + if (result != VK_SUCCESS) + goto error; + break; + + case ANV_FENCE_TYPE_SYNCOBJ: { + /* + * For the same reason we reset the signaled binary syncobj above, + * also reset the fence's syncobj so that they don't contain a + * signaled dma-fence. 
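+ * Otherwise a stale signal left over from a previous use of the fence
+ * could make a later vkWaitForFences() return before the new submission
+ * completes.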
+ */ + anv_gem_syncobj_reset(device, impl->syncobj); + + result = anv_queue_submit_add_syncobj(submit, device, impl->syncobj, + I915_EXEC_FENCE_SIGNAL, + 0); + if (result != VK_SUCCESS) + goto error; + break; + } + + default: + unreachable("Invalid fence type"); + } + } - memcpy(bo.map, batch->start, size); - if (!device->info.has_llc) - gen_flush_range(bo.map, size); - - exec_bos[0] = &bo; - exec2_objects[0].handle = bo.gem_handle; - exec2_objects[0].relocation_count = 0; - exec2_objects[0].relocs_ptr = 0; - exec2_objects[0].alignment = 0; - exec2_objects[0].offset = bo.offset; - exec2_objects[0].flags = bo.flags; - exec2_objects[0].rsvd1 = 0; - exec2_objects[0].rsvd2 = 0; - - execbuf.buffers_ptr = (uintptr_t) exec2_objects; - execbuf.buffer_count = 1; - execbuf.batch_start_offset = 0; - execbuf.batch_len = size; - execbuf.cliprects_ptr = 0; - execbuf.num_cliprects = 0; - execbuf.DR1 = 0; - execbuf.DR4 = 0; - - execbuf.flags = - I915_EXEC_HANDLE_LUT | I915_EXEC_NO_RELOC | I915_EXEC_RENDER; - execbuf.rsvd1 = device->context_id; - execbuf.rsvd2 = 0; - - result = anv_device_execbuf(device, &execbuf, exec_bos); + result = _anv_queue_submit(queue, &submit, false); if (result != VK_SUCCESS) - goto fail; + goto error; + + if (fence && fence->permanent.type == ANV_FENCE_TYPE_BO) { + assert(!device->has_thread_submit); + /* If we have permanent BO fence, the only type of temporary possible + * would be BO_WSI (because BO fences are not shareable). The Vulkan spec + * also requires that the fence passed to vkQueueSubmit() be : + * + * * unsignaled + * * not be associated with any other queue command that has not yet + * completed execution on that queue + * + * So the only acceptable type for the temporary is NONE. + */ + assert(fence->temporary.type == ANV_FENCE_TYPE_NONE); - result = anv_device_wait(device, &bo, INT64_MAX); + /* Once the execbuf has returned, we need to set the fence state to + * SUBMITTED. We can't do this before calling execbuf because + * anv_GetFenceStatus does take the global device lock before checking + * fence->state. + * + * We set the fence state to SUBMITTED regardless of whether or not the + * execbuf succeeds because we need to ensure that vkWaitForFences() and + * vkGetFenceStatus() return a valid result (VK_ERROR_DEVICE_LOST or + * VK_SUCCESS) in a finite amount of time even if execbuf fails. + */ + fence->permanent.bo.state = ANV_BO_FENCE_STATE_SUBMITTED; + } - fail: - anv_bo_pool_free(&device->batch_bo_pool, &bo); + error: + if (submit) + anv_queue_submit_free(device, submit); return result; } @@ -119,7 +1173,9 @@ VkResult anv_QueueSubmit( VkFence fence) { ANV_FROM_HANDLE(anv_queue, queue, _queue); - struct anv_device *device = queue->device; + + if (queue->device->no_hw) + return VK_SUCCESS; /* Query for device status prior to submitting. Technically, we don't need * to do this. However, if we have a client that's submitting piles of @@ -128,44 +1184,18 @@ VkResult anv_QueueSubmit( * the kernel to kick us or we'll have to wait until the client waits on a * fence before we actually know whether or not we've hung. */ - VkResult result = anv_device_query_status(device); + VkResult result = anv_device_query_status(queue->device); if (result != VK_SUCCESS) return result; - /* We lock around QueueSubmit for three main reasons: - * - * 1) When a block pool is resized, we create a new gem handle with a - * different size and, in the case of surface states, possibly a - * different center offset but we re-use the same anv_bo struct when - * we do so. 
If this happens in the middle of setting up an execbuf, - * we could end up with our list of BOs out of sync with our list of - * gem handles. - * - * 2) The algorithm we use for building the list of unique buffers isn't - * thread-safe. While the client is supposed to syncronize around - * QueueSubmit, this would be extremely difficult to debug if it ever - * came up in the wild due to a broken app. It's better to play it - * safe and just lock around QueueSubmit. - * - * 3) The anv_cmd_buffer_execbuf function may perform relocations in - * userspace. Due to the fact that the surface state buffer is shared - * between batches, we can't afford to have that happen from multiple - * threads at the same time. Even though the user is supposed to - * ensure this doesn't happen, we play it safe as in (2) above. - * - * Since the only other things that ever take the device lock such as block - * pool resize only rarely happen, this will almost never be contended so - * taking a lock isn't really an expensive operation in this case. - */ - pthread_mutex_lock(&device->mutex); - if (fence && submitCount == 0) { /* If we don't have any command buffers, we need to submit a dummy * batch to give GEM something to wait on. We could, potentially, * come up with something more efficient but this shouldn't be a * common case. */ - result = anv_cmd_buffer_execbuf(device, NULL, NULL, 0, NULL, 0, fence); + result = anv_queue_submit(queue, NULL, NULL, NULL, 0, NULL, NULL, 0, + NULL, fence, -1); goto out; } @@ -173,18 +1203,42 @@ VkResult anv_QueueSubmit( /* Fence for this submit. NULL for all but the last one */ VkFence submit_fence = (i == submitCount - 1) ? fence : VK_NULL_HANDLE; + const struct wsi_memory_signal_submit_info *mem_signal_info = + vk_find_struct_const(pSubmits[i].pNext, + WSI_MEMORY_SIGNAL_SUBMIT_INFO_MESA); + struct anv_bo *wsi_signal_bo = + mem_signal_info && mem_signal_info->memory != VK_NULL_HANDLE ? + anv_device_memory_from_handle(mem_signal_info->memory)->bo : NULL; + + const VkTimelineSemaphoreSubmitInfoKHR *timeline_info = + vk_find_struct_const(pSubmits[i].pNext, + TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR); + const VkPerformanceQuerySubmitInfoKHR *perf_info = + vk_find_struct_const(pSubmits[i].pNext, + PERFORMANCE_QUERY_SUBMIT_INFO_KHR); + const uint64_t *wait_values = + timeline_info && timeline_info->waitSemaphoreValueCount ? + timeline_info->pWaitSemaphoreValues : NULL; + const uint64_t *signal_values = + timeline_info && timeline_info->signalSemaphoreValueCount ? + timeline_info->pSignalSemaphoreValues : NULL; + if (pSubmits[i].commandBufferCount == 0) { /* If we don't have any command buffers, we need to submit a dummy * batch to give GEM something to wait on. We could, potentially, * come up with something more efficient but this shouldn't be a * common case. 
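 * The dummy submit below still carries the wait/signal semaphores, the
 * WSI signal BO and the fence, which is all GEM needs to wait on here.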
*/ - result = anv_cmd_buffer_execbuf(device, NULL, - pSubmits[i].pWaitSemaphores, - pSubmits[i].waitSemaphoreCount, - pSubmits[i].pSignalSemaphores, - pSubmits[i].signalSemaphoreCount, - submit_fence); + result = anv_queue_submit(queue, NULL, + pSubmits[i].pWaitSemaphores, + wait_values, + pSubmits[i].waitSemaphoreCount, + pSubmits[i].pSignalSemaphores, + signal_values, + pSubmits[i].signalSemaphoreCount, + wsi_signal_bo, + submit_fence, + -1); if (result != VK_SUCCESS) goto out; @@ -203,32 +1257,34 @@ VkResult anv_QueueSubmit( submit_fence : VK_NULL_HANDLE; const VkSemaphore *in_semaphores = NULL, *out_semaphores = NULL; + const uint64_t *in_values = NULL, *out_values = NULL; uint32_t num_in_semaphores = 0, num_out_semaphores = 0; if (j == 0) { /* Only the first batch gets the in semaphores */ in_semaphores = pSubmits[i].pWaitSemaphores; + in_values = wait_values; num_in_semaphores = pSubmits[i].waitSemaphoreCount; } if (j == pSubmits[i].commandBufferCount - 1) { /* Only the last batch gets the out semaphores */ out_semaphores = pSubmits[i].pSignalSemaphores; + out_values = signal_values; num_out_semaphores = pSubmits[i].signalSemaphoreCount; } - result = anv_cmd_buffer_execbuf(device, cmd_buffer, - in_semaphores, num_in_semaphores, - out_semaphores, num_out_semaphores, - execbuf_fence); + result = anv_queue_submit(queue, cmd_buffer, + in_semaphores, in_values, num_in_semaphores, + out_semaphores, out_values, num_out_semaphores, + wsi_signal_bo, execbuf_fence, + perf_info ? perf_info->counterPassIndex : 0); if (result != VK_SUCCESS) goto out; } } - pthread_cond_broadcast(&device->queue_submit); - out: - if (result != VK_SUCCESS) { + if (result != VK_SUCCESS && result != VK_ERROR_DEVICE_LOST) { /* In the case that something has gone wrong we may end up with an * inconsistent state from which it may not be trivial to recover. * For example, we might have computed address relocations and @@ -240,12 +1296,14 @@ out: * anyway (such us being out of memory) and return * VK_ERROR_DEVICE_LOST to ensure that clients do not attempt to * submit the same job again to this device. + * + * We skip doing this on VK_ERROR_DEVICE_LOST because + * anv_device_set_lost() would have been called already by a callee of + * anv_queue_submit(). 
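+ * The device is therefore already marked lost and the error will reach
+ * the application through the usual status queries.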
*/ - result = anv_device_set_lost(device, "vkQueueSubmit() failed"); + result = anv_device_set_lost(queue->device, "vkQueueSubmit() failed"); } - pthread_mutex_unlock(&device->mutex); - return result; } @@ -254,7 +1312,10 @@ VkResult anv_QueueWaitIdle( { ANV_FROM_HANDLE(anv_queue, queue, _queue); - return anv_DeviceWaitIdle(anv_device_to_handle(queue->device)); + if (anv_device_is_lost(queue->device)) + return VK_ERROR_DEVICE_LOST; + + return anv_queue_submit_simple_batch(queue, NULL); } VkResult anv_CreateFence( @@ -268,12 +1329,14 @@ VkResult anv_CreateFence( assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_FENCE_CREATE_INFO); - fence = vk_zalloc2(&device->alloc, pAllocator, sizeof(*fence), 8, + fence = vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*fence), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (fence == NULL) return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); - if (device->instance->physicalDevice.has_syncobj_wait) { + vk_object_base_init(&device->vk, &fence->base, VK_OBJECT_TYPE_FENCE); + + if (device->physical->has_syncobj_wait) { fence->permanent.type = ANV_FENCE_TYPE_SYNCOBJ; uint32_t create_flags = 0; @@ -286,8 +1349,8 @@ VkResult anv_CreateFence( } else { fence->permanent.type = ANV_FENCE_TYPE_BO; - VkResult result = anv_bo_pool_alloc(&device->batch_bo_pool, - &fence->permanent.bo.bo, 4096); + VkResult result = anv_bo_pool_alloc(&device->batch_bo_pool, 4096, + &fence->permanent.bo.bo); if (result != VK_SUCCESS) return result; @@ -313,7 +1376,11 @@ anv_fence_impl_cleanup(struct anv_device *device, break; case ANV_FENCE_TYPE_BO: - anv_bo_pool_free(&device->batch_bo_pool, &impl->bo.bo); + anv_bo_pool_free(&device->batch_bo_pool, impl->bo.bo); + break; + + case ANV_FENCE_TYPE_WSI_BO: + anv_device_release_bo(device, impl->bo.bo); break; case ANV_FENCE_TYPE_SYNCOBJ: @@ -331,6 +1398,16 @@ anv_fence_impl_cleanup(struct anv_device *device, impl->type = ANV_FENCE_TYPE_NONE; } +void +anv_fence_reset_temporary(struct anv_device *device, + struct anv_fence *fence) +{ + if (fence->temporary.type == ANV_FENCE_TYPE_NONE) + return; + + anv_fence_impl_cleanup(device, &fence->temporary); +} + void anv_DestroyFence( VkDevice _device, VkFence _fence, @@ -345,7 +1422,8 @@ void anv_DestroyFence( anv_fence_impl_cleanup(device, &fence->temporary); anv_fence_impl_cleanup(device, &fence->permanent); - vk_free2(&device->alloc, pAllocator, fence); + vk_object_base_finish(&fence->base); + vk_free2(&device->vk.alloc, pAllocator, fence); } VkResult anv_ResetFences( @@ -365,8 +1443,7 @@ VkResult anv_ResetFences( * first restored. The remaining operations described therefore * operate on the restored payload. 
*/ - if (fence->temporary.type != ANV_FENCE_TYPE_NONE) - anv_fence_impl_cleanup(device, &fence->temporary); + anv_fence_reset_temporary(device, fence); struct anv_fence_impl *impl = &fence->permanent; @@ -403,8 +1480,7 @@ VkResult anv_GetFenceStatus( switch (impl->type) { case ANV_FENCE_TYPE_BO: - /* BO fences don't support import/export */ - assert(fence->temporary.type == ANV_FENCE_TYPE_NONE); + case ANV_FENCE_TYPE_WSI_BO: switch (impl->bo.state) { case ANV_BO_FENCE_STATE_RESET: /* If it hasn't even been sent off to the GPU yet, it's not ready */ @@ -415,7 +1491,7 @@ VkResult anv_GetFenceStatus( return VK_SUCCESS; case ANV_BO_FENCE_STATE_SUBMITTED: { - VkResult result = anv_device_bo_busy(device, &impl->bo.bo); + VkResult result = anv_device_bo_busy(device, impl->bo.bo); if (result == VK_SUCCESS) { impl->bo.state = ANV_BO_FENCE_STATE_SIGNALED; return VK_SUCCESS; @@ -428,16 +1504,34 @@ VkResult anv_GetFenceStatus( } case ANV_FENCE_TYPE_SYNCOBJ: { - int ret = anv_gem_syncobj_wait(device, &impl->syncobj, 1, 0, true); - if (ret == -1) { - if (errno == ETIME) { - return VK_NOT_READY; + if (device->has_thread_submit) { + uint64_t binary_value = 0; + int ret = anv_gem_syncobj_timeline_wait(device, &impl->syncobj, + &binary_value, 1, 0, + true /* wait_all */, + false /* wait_materialize */); + if (ret == -1) { + if (errno == ETIME) { + return VK_NOT_READY; + } else { + /* We don't know the real error. */ + return anv_device_set_lost(device, "drm_syncobj_wait failed: %m"); + } } else { - /* We don't know the real error. */ - return anv_device_set_lost(device, "drm_syncobj_wait failed: %m"); + return VK_SUCCESS; } } else { - return VK_SUCCESS; + int ret = anv_gem_syncobj_wait(device, &impl->syncobj, 1, 0, false); + if (ret == -1) { + if (errno == ETIME) { + return VK_NOT_READY; + } else { + /* We don't know the real error. */ + return anv_device_set_lost(device, "drm_syncobj_wait failed: %m"); + } + } else { + return VK_SUCCESS; + } } } @@ -446,53 +1540,6 @@ VkResult anv_GetFenceStatus( } } -#define NSEC_PER_SEC 1000000000 -#define INT_TYPE_MAX(type) ((1ull << (sizeof(type) * 8 - 1)) - 1) - -static uint64_t -gettime_ns(void) -{ - struct timespec current; - clock_gettime(CLOCK_MONOTONIC, ¤t); - return (uint64_t)current.tv_sec * NSEC_PER_SEC + current.tv_nsec; -} - -static uint64_t anv_get_absolute_timeout(uint64_t timeout) -{ - if (timeout == 0) - return 0; - uint64_t current_time = gettime_ns(); - uint64_t max_timeout = (uint64_t) INT64_MAX - current_time; - - timeout = MIN2(max_timeout, timeout); - - return (current_time + timeout); -} - -static int64_t anv_get_relative_timeout(uint64_t abs_timeout) -{ - uint64_t now = gettime_ns(); - - /* We don't want negative timeouts. - * - * DRM_IOCTL_I915_GEM_WAIT uses a signed 64 bit timeout and is - * supposed to block indefinitely timeouts < 0. Unfortunately, - * this was broken for a couple of kernel releases. Since there's - * no way to know whether or not the kernel we're using is one of - * the broken ones, the best we can do is to clamp the timeout to - * INT64_MAX. This limits the maximum timeout from 584 years to - * 292 years - likely not a big deal. 
- */ - if (abs_timeout < now) - return 0; - - uint64_t rel_timeout = abs_timeout - now; - if (rel_timeout > (uint64_t) INT64_MAX) - rel_timeout = INT64_MAX; - - return rel_timeout; -} - static VkResult anv_wait_for_syncobj_fences(struct anv_device *device, uint32_t fenceCount, @@ -500,7 +1547,7 @@ anv_wait_for_syncobj_fences(struct anv_device *device, bool waitAll, uint64_t abs_timeout_ns) { - uint32_t *syncobjs = vk_zalloc(&device->alloc, + uint32_t *syncobjs = vk_zalloc(&device->vk.alloc, sizeof(*syncobjs) * fenceCount, 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); if (!syncobjs) @@ -518,17 +1565,17 @@ anv_wait_for_syncobj_fences(struct anv_device *device, syncobjs[i] = impl->syncobj; } + int ret = 0; /* The gem_syncobj_wait ioctl may return early due to an inherent - * limitation in the way it computes timeouts. Loop until we've actually + * limitation in the way it computes timeouts. Loop until we've actually * passed the timeout. */ - int ret; do { ret = anv_gem_syncobj_wait(device, syncobjs, fenceCount, abs_timeout_ns, waitAll); - } while (ret == -1 && errno == ETIME && gettime_ns() < abs_timeout_ns); + } while (ret == -1 && errno == ETIME && anv_gettime_ns() < abs_timeout_ns); - vk_free(&device->alloc, syncobjs); + vk_free(&device->vk.alloc, syncobjs); if (ret == -1) { if (errno == ETIME) { @@ -557,13 +1604,11 @@ anv_wait_for_bo_fences(struct anv_device *device, for (uint32_t i = 0; i < fenceCount; i++) { ANV_FROM_HANDLE(anv_fence, fence, pFences[i]); - /* This function assumes that all fences are BO fences and that they - * have no temporary state. Since BO fences will never be exported, - * this should be a safe assumption. - */ - assert(fence->permanent.type == ANV_FENCE_TYPE_BO); - assert(fence->temporary.type == ANV_FENCE_TYPE_NONE); - struct anv_fence_impl *impl = &fence->permanent; + struct anv_fence_impl *impl = + fence->temporary.type != ANV_FENCE_TYPE_NONE ? + &fence->temporary : &fence->permanent; + assert(impl->type == ANV_FENCE_TYPE_BO || + impl->type == ANV_FENCE_TYPE_WSI_BO); switch (impl->bo.state) { case ANV_BO_FENCE_STATE_RESET: @@ -589,7 +1634,7 @@ anv_wait_for_bo_fences(struct anv_device *device, /* These are the fences we really care about. Go ahead and wait * on it until we hit a timeout. */ - result = anv_device_wait(device, &impl->bo.bo, + result = anv_device_wait(device, impl->bo.bo, anv_get_relative_timeout(abs_timeout_ns)); switch (result) { case VK_SUCCESS: @@ -634,11 +1679,11 @@ anv_wait_for_bo_fences(struct anv_device *device, .tv_nsec = abs_timeout_ns % NSEC_PER_SEC, }; - MAYBE_UNUSED int ret; + ASSERTED int ret; ret = pthread_cond_timedwait(&device->queue_submit, &device->mutex, &abstime); assert(ret != EINVAL); - if (gettime_ns() >= abs_timeout_ns) { + if (anv_gettime_ns() >= abs_timeout_ns) { pthread_mutex_unlock(&device->mutex); result = VK_TIMEOUT; goto done; @@ -658,12 +1703,9 @@ done: static VkResult anv_wait_for_wsi_fence(struct anv_device *device, - const VkFence _fence, + struct anv_fence_impl *impl, uint64_t abs_timeout) { - ANV_FROM_HANDLE(anv_fence, fence, _fence); - struct anv_fence_impl *impl = &fence->permanent; - return impl->fence_wsi->wait(impl->fence_wsi, abs_timeout); } @@ -679,8 +1721,15 @@ anv_wait_for_fences(struct anv_device *device, if (fenceCount <= 1 || waitAll) { for (uint32_t i = 0; i < fenceCount; i++) { ANV_FROM_HANDLE(anv_fence, fence, pFences[i]); - switch (fence->permanent.type) { + struct anv_fence_impl *impl = + fence->temporary.type != ANV_FENCE_TYPE_NONE ? 
+ &fence->temporary : &fence->permanent; + + switch (impl->type) { case ANV_FENCE_TYPE_BO: + assert(!device->physical->has_syncobj_wait); + /* fall-through */ + case ANV_FENCE_TYPE_WSI_BO: result = anv_wait_for_bo_fences(device, 1, &pFences[i], true, abs_timeout); break; @@ -689,7 +1738,7 @@ anv_wait_for_fences(struct anv_device *device, true, abs_timeout); break; case ANV_FENCE_TYPE_WSI: - result = anv_wait_for_wsi_fence(device, pFences[i], abs_timeout); + result = anv_wait_for_wsi_fence(device, impl, abs_timeout); break; case ANV_FENCE_TYPE_NONE: result = VK_SUCCESS; @@ -704,7 +1753,7 @@ anv_wait_for_fences(struct anv_device *device, if (anv_wait_for_fences(device, 1, &pFences[i], true, 0) == VK_SUCCESS) return VK_SUCCESS; } - } while (gettime_ns() < abs_timeout); + } while (anv_gettime_ns() < abs_timeout); result = VK_TIMEOUT; } return result; @@ -714,7 +1763,10 @@ static bool anv_all_fences_syncobj(uint32_t fenceCount, const VkFence *pFences) { for (uint32_t i = 0; i < fenceCount; ++i) { ANV_FROM_HANDLE(anv_fence, fence, pFences[i]); - if (fence->permanent.type != ANV_FENCE_TYPE_SYNCOBJ) + struct anv_fence_impl *impl = + fence->temporary.type != ANV_FENCE_TYPE_NONE ? + &fence->temporary : &fence->permanent; + if (impl->type != ANV_FENCE_TYPE_SYNCOBJ) return false; } return true; @@ -724,7 +1776,11 @@ static bool anv_all_fences_bo(uint32_t fenceCount, const VkFence *pFences) { for (uint32_t i = 0; i < fenceCount; ++i) { ANV_FROM_HANDLE(anv_fence, fence, pFences[i]); - if (fence->permanent.type != ANV_FENCE_TYPE_BO) + struct anv_fence_impl *impl = + fence->temporary.type != ANV_FENCE_TYPE_NONE ? + &fence->temporary : &fence->permanent; + if (impl->type != ANV_FENCE_TYPE_BO && + impl->type != ANV_FENCE_TYPE_WSI_BO) return false; } return true; @@ -739,6 +1795,9 @@ VkResult anv_WaitForFences( { ANV_FROM_HANDLE(anv_device, device, _device); + if (device->no_hw) + return VK_SUCCESS; + if (anv_device_is_lost(device)) return VK_ERROR_DEVICE_LOST; @@ -757,8 +1816,8 @@ VkResult anv_WaitForFences( void anv_GetPhysicalDeviceExternalFenceProperties( VkPhysicalDevice physicalDevice, - const VkPhysicalDeviceExternalFenceInfoKHR* pExternalFenceInfo, - VkExternalFencePropertiesKHR* pExternalFenceProperties) + const VkPhysicalDeviceExternalFenceInfo* pExternalFenceInfo, + VkExternalFenceProperties* pExternalFenceProperties) { ANV_FROM_HANDLE(anv_physical_device, device, physicalDevice); @@ -813,24 +1872,35 @@ VkResult anv_ImportFenceFdKHR( break; - case VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT: + case VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT: { /* Sync files are a bit tricky. Because we want to continue using the * syncobj implementation of WaitForFences, we don't use the sync file * directly but instead import it into a syncobj. */ new_impl.type = ANV_FENCE_TYPE_SYNCOBJ; - new_impl.syncobj = anv_gem_syncobj_create(device, 0); + /* "If handleType is VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT, the + * special value -1 for fd is treated like a valid sync file descriptor + * referring to an object that has already signaled. The import + * operation will succeed and the VkFence will have a temporarily + * imported payload as if a valid file descriptor had been provided." 
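+       *
+       * The DRM_SYNCOBJ_CREATE_SIGNALED flag below gives exactly that
+       * behavior.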
+       */
+      uint32_t create_flags = 0;
+      if (fd == -1)
+         create_flags |= DRM_SYNCOBJ_CREATE_SIGNALED;
+
+      new_impl.syncobj = anv_gem_syncobj_create(device, create_flags);
       if (!new_impl.syncobj)
          return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
 
-      if (anv_gem_syncobj_import_sync_file(device, new_impl.syncobj, fd)) {
+      if (fd != -1 &&
+          anv_gem_syncobj_import_sync_file(device, new_impl.syncobj, fd)) {
          anv_gem_syncobj_destroy(device, new_impl.syncobj);
-         return vk_errorf(device->instance, NULL,
-                          VK_ERROR_INVALID_EXTERNAL_HANDLE,
+         return vk_errorf(device, NULL, VK_ERROR_INVALID_EXTERNAL_HANDLE,
                           "syncobj sync file import failed: %m");
       }
       break;
+   }
 
    default:
       return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE);
@@ -858,6 +1928,31 @@ VkResult anv_ImportFenceFdKHR(
    return VK_SUCCESS;
 }
 
+/* The sideband payload of the DRM syncobj was incremented when the
+ * application called vkQueueSubmit().  Here we wait for a fence with the
+ * same value to materialize so that we can export it (typically as a
+ * SyncFD).
+ */
+static VkResult
+wait_syncobj_materialize(struct anv_device *device,
+                         uint32_t syncobj,
+                         int *fd)
+{
+   if (!device->has_thread_submit)
+      return VK_SUCCESS;
+
+   uint64_t binary_value = 0;
+   /* We might need to wait until the fence materializes before we can
+    * export to a sync FD when we use a thread for submission.
+    */
+   if (anv_gem_syncobj_timeline_wait(device, &syncobj, &binary_value, 1,
+                                     anv_get_absolute_timeout(5ull * NSEC_PER_SEC),
+                                     true /* wait_all */,
+                                     true /* wait_materialize */))
+      return anv_device_set_lost(device, "anv_gem_syncobj_timeline_wait failed: %m");
+
+   return VK_SUCCESS;
+}
+
 VkResult anv_GetFenceFdKHR(
     VkDevice _device,
     const VkFenceGetFdInfoKHR* pGetFdInfo,
@@ -884,6 +1979,10 @@ VkResult anv_GetFenceFdKHR(
    }
 
    case VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT: {
+      VkResult result = wait_syncobj_materialize(device, impl->syncobj, pFd);
+      if (result != VK_SUCCESS)
+         return result;
+
       int fd = anv_gem_syncobj_export_sync_file(device, impl->syncobj);
       if (fd < 0)
          return vk_error(VK_ERROR_TOO_MANY_OBJECTS);
@@ -911,6 +2010,73 @@ VkResult anv_GetFenceFdKHR(
 
 // Queue semaphore functions
 
+static VkSemaphoreTypeKHR
+get_semaphore_type(const void *pNext, uint64_t *initial_value)
+{
+   const VkSemaphoreTypeCreateInfoKHR *type_info =
+      vk_find_struct_const(pNext, SEMAPHORE_TYPE_CREATE_INFO_KHR);
+
+   if (!type_info)
+      return VK_SEMAPHORE_TYPE_BINARY_KHR;
+
+   if (initial_value)
+      *initial_value = type_info->initialValue;
+   return type_info->semaphoreType;
+}
+
+static VkResult
+binary_semaphore_create(struct anv_device *device,
+                        struct anv_semaphore_impl *impl,
+                        bool exportable)
+{
+   if (device->physical->has_syncobj) {
+      impl->type = ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ;
+      impl->syncobj = anv_gem_syncobj_create(device, 0);
+      if (!impl->syncobj)
+         return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+      return VK_SUCCESS;
+   } else {
+      impl->type = ANV_SEMAPHORE_TYPE_BO;
+      VkResult result =
+         anv_device_alloc_bo(device, 4096,
+                             ANV_BO_ALLOC_EXTERNAL |
+                             ANV_BO_ALLOC_IMPLICIT_SYNC,
+                             0 /* explicit_address */,
+                             &impl->bo);
+      /* If we're going to use this as a fence, we need to *not* have the
+       * EXEC_OBJECT_ASYNC bit set.
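+       * EXEC_OBJECT_ASYNC opts a BO out of implicit synchronization, which
+       * is exactly what BO-backed fence waits on this path rely on.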
+ */ + assert(!(impl->bo->flags & EXEC_OBJECT_ASYNC)); + return result; + } +} + +static VkResult +timeline_semaphore_create(struct anv_device *device, + struct anv_semaphore_impl *impl, + uint64_t initial_value) +{ + if (device->has_thread_submit) { + impl->type = ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ_TIMELINE; + impl->syncobj = anv_gem_syncobj_create(device, 0); + if (!impl->syncobj) + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + if (initial_value) { + if (anv_gem_syncobj_timeline_signal(device, + &impl->syncobj, + &initial_value, 1)) { + anv_gem_syncobj_destroy(device, impl->syncobj); + return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); + } + } + } else { + impl->type = ANV_SEMAPHORE_TYPE_TIMELINE; + anv_timeline_init(device, &impl->timeline, initial_value); + } + + return VK_SUCCESS; +} + VkResult anv_CreateSemaphore( VkDevice _device, const VkSemaphoreCreateInfo* pCreateInfo, @@ -922,54 +2088,60 @@ VkResult anv_CreateSemaphore( assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO); - semaphore = vk_alloc2(&device->alloc, pAllocator, sizeof(*semaphore), 8, - VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + uint64_t timeline_value = 0; + VkSemaphoreTypeKHR sem_type = get_semaphore_type(pCreateInfo->pNext, &timeline_value); + + semaphore = vk_alloc(&device->vk.alloc, sizeof(*semaphore), 8, + VK_SYSTEM_ALLOCATION_SCOPE_DEVICE); if (semaphore == NULL) return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); - const VkExportSemaphoreCreateInfoKHR *export = + vk_object_base_init(&device->vk, &semaphore->base, VK_OBJECT_TYPE_SEMAPHORE); + + p_atomic_set(&semaphore->refcount, 1); + + const VkExportSemaphoreCreateInfo *export = vk_find_struct_const(pCreateInfo->pNext, EXPORT_SEMAPHORE_CREATE_INFO); - VkExternalSemaphoreHandleTypeFlagsKHR handleTypes = + VkExternalSemaphoreHandleTypeFlags handleTypes = export ? export->handleTypes : 0; + VkResult result; if (handleTypes == 0) { - /* The DRM execbuffer ioctl always execute in-oder so long as you stay - * on the same ring. Since we don't expose the blit engine as a DMA - * queue, a dummy no-op semaphore is a perfectly valid implementation. 
- */ - semaphore->permanent.type = ANV_SEMAPHORE_TYPE_DUMMY; + if (sem_type == VK_SEMAPHORE_TYPE_BINARY_KHR) + result = binary_semaphore_create(device, &semaphore->permanent, false); + else + result = timeline_semaphore_create(device, &semaphore->permanent, timeline_value); + if (result != VK_SUCCESS) { + vk_free2(&device->vk.alloc, pAllocator, semaphore); + return result; + } } else if (handleTypes & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT) { assert(handleTypes == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT); - if (device->instance->physicalDevice.has_syncobj) { + if (sem_type == VK_SEMAPHORE_TYPE_BINARY_KHR) + result = binary_semaphore_create(device, &semaphore->permanent, true); + else + result = timeline_semaphore_create(device, &semaphore->permanent, timeline_value); + if (result != VK_SUCCESS) { + vk_free2(&device->vk.alloc, pAllocator, semaphore); + return result; + } + } else if (handleTypes & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT) { + assert(handleTypes == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT); + assert(sem_type == VK_SEMAPHORE_TYPE_BINARY_KHR); + if (device->physical->has_syncobj) { semaphore->permanent.type = ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ; semaphore->permanent.syncobj = anv_gem_syncobj_create(device, 0); if (!semaphore->permanent.syncobj) { - vk_free2(&device->alloc, pAllocator, semaphore); + vk_free2(&device->vk.alloc, pAllocator, semaphore); return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); } } else { - semaphore->permanent.type = ANV_SEMAPHORE_TYPE_BO; - VkResult result = anv_bo_cache_alloc(device, &device->bo_cache, - 4096, ANV_BO_EXTERNAL, - &semaphore->permanent.bo); - if (result != VK_SUCCESS) { - vk_free2(&device->alloc, pAllocator, semaphore); - return result; - } - - /* If we're going to use this as a fence, we need to *not* have the - * EXEC_OBJECT_ASYNC bit set. 
- */ - assert(!(semaphore->permanent.bo->flags & EXEC_OBJECT_ASYNC)); + semaphore->permanent.type = ANV_SEMAPHORE_TYPE_SYNC_FILE; + semaphore->permanent.fd = -1; } - } else if (handleTypes & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT) { - assert(handleTypes == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT); - - semaphore->permanent.type = ANV_SEMAPHORE_TYPE_SYNC_FILE; - semaphore->permanent.fd = -1; } else { assert(!"Unknown handle type"); - vk_free2(&device->alloc, pAllocator, semaphore); + vk_free2(&device->vk.alloc, pAllocator, semaphore); return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE); } @@ -991,14 +2163,21 @@ anv_semaphore_impl_cleanup(struct anv_device *device, break; case ANV_SEMAPHORE_TYPE_BO: - anv_bo_cache_release(device, &device->bo_cache, impl->bo); + case ANV_SEMAPHORE_TYPE_WSI_BO: + anv_device_release_bo(device, impl->bo); break; case ANV_SEMAPHORE_TYPE_SYNC_FILE: - close(impl->fd); + if (impl->fd >= 0) + close(impl->fd); + break; + + case ANV_SEMAPHORE_TYPE_TIMELINE: + anv_timeline_finish(device, &impl->timeline); break; case ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ: + case ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ_TIMELINE: anv_gem_syncobj_destroy(device, impl->syncobj); break; @@ -1019,6 +2198,27 @@ anv_semaphore_reset_temporary(struct anv_device *device, anv_semaphore_impl_cleanup(device, &semaphore->temporary); } +static struct anv_semaphore * +anv_semaphore_ref(struct anv_semaphore *semaphore) +{ + assert(semaphore->refcount); + p_atomic_inc(&semaphore->refcount); + return semaphore; +} + +static void +anv_semaphore_unref(struct anv_device *device, struct anv_semaphore *semaphore) +{ + if (!p_atomic_dec_zero(&semaphore->refcount)) + return; + + anv_semaphore_impl_cleanup(device, &semaphore->temporary); + anv_semaphore_impl_cleanup(device, &semaphore->permanent); + + vk_object_base_finish(&semaphore->base); + vk_free(&device->vk.alloc, semaphore); +} + void anv_DestroySemaphore( VkDevice _device, VkSemaphore _semaphore, @@ -1030,21 +2230,26 @@ void anv_DestroySemaphore( if (semaphore == NULL) return; - anv_semaphore_impl_cleanup(device, &semaphore->temporary); - anv_semaphore_impl_cleanup(device, &semaphore->permanent); - - vk_free2(&device->alloc, pAllocator, semaphore); + anv_semaphore_unref(device, semaphore); } void anv_GetPhysicalDeviceExternalSemaphoreProperties( VkPhysicalDevice physicalDevice, - const VkPhysicalDeviceExternalSemaphoreInfoKHR* pExternalSemaphoreInfo, - VkExternalSemaphorePropertiesKHR* pExternalSemaphoreProperties) + const VkPhysicalDeviceExternalSemaphoreInfo* pExternalSemaphoreInfo, + VkExternalSemaphoreProperties* pExternalSemaphoreProperties) { ANV_FROM_HANDLE(anv_physical_device, device, physicalDevice); + VkSemaphoreTypeKHR sem_type = + get_semaphore_type(pExternalSemaphoreInfo->pNext, NULL); + switch (pExternalSemaphoreInfo->handleType) { case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT: + /* Timeline semaphores are not exportable, unless we have threaded + * submission. 
+ */
+ if (sem_type == VK_SEMAPHORE_TYPE_TIMELINE_KHR && !device->has_thread_submit)
+ break;
 pExternalSemaphoreProperties->exportFromImportedHandleTypes =
 VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT;
 pExternalSemaphoreProperties->compatibleHandleTypes =
@@ -1055,16 +2260,18 @@ void anv_GetPhysicalDeviceExternalSemaphoreProperties(
 return;

 case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT:
- if (device->has_exec_fence) {
- pExternalSemaphoreProperties->exportFromImportedHandleTypes = 0;
- pExternalSemaphoreProperties->compatibleHandleTypes =
- VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT;
- pExternalSemaphoreProperties->externalSemaphoreFeatures =
- VK_EXTERNAL_SEMAPHORE_FEATURE_EXPORTABLE_BIT |
- VK_EXTERNAL_SEMAPHORE_FEATURE_IMPORTABLE_BIT;
- return;
- }
- break;
+ if (sem_type == VK_SEMAPHORE_TYPE_TIMELINE_KHR)
+ break;
+ if (!device->has_exec_fence)
+ break;
+ pExternalSemaphoreProperties->exportFromImportedHandleTypes =
+ VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT;
+ pExternalSemaphoreProperties->compatibleHandleTypes =
+ VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT;
+ pExternalSemaphoreProperties->externalSemaphoreFeatures =
+ VK_EXTERNAL_SEMAPHORE_FEATURE_EXPORTABLE_BIT |
+ VK_EXTERNAL_SEMAPHORE_FEATURE_IMPORTABLE_BIT;
+ return;

 default:
 break;
@@ -1089,8 +2296,16 @@ VkResult anv_ImportSemaphoreFdKHR(
 switch (pImportSemaphoreFdInfo->handleType) {
 case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT:
- if (device->instance->physicalDevice.has_syncobj) {
- new_impl.type = ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ;
+ if (device->physical->has_syncobj) {
+ /* When importing non-temporarily, reuse the semaphore's existing
+ * type. The Linux/DRM implementation allows binary & timeline
+ * semaphores to be used interchangeably, and we have no way to
+ * differentiate them.
+ */
+ if (pImportSemaphoreFdInfo->flags & VK_SEMAPHORE_IMPORT_TEMPORARY_BIT)
+ new_impl.type = ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ;
+ else
+ new_impl.type = semaphore->permanent.type;

 new_impl.syncobj = anv_gem_syncobj_fd_to_handle(device, fd);
 if (!new_impl.syncobj)
@@ -1098,15 +2313,17 @@ VkResult anv_ImportSemaphoreFdKHR(
 } else {
 new_impl.type = ANV_SEMAPHORE_TYPE_BO;
- VkResult result = anv_bo_cache_import(device, &device->bo_cache,
- fd, ANV_BO_EXTERNAL,
- &new_impl.bo);
+ VkResult result = anv_device_import_bo(device, fd,
+ ANV_BO_ALLOC_EXTERNAL |
+ ANV_BO_ALLOC_IMPLICIT_SYNC,
+ 0 /* client_address */,
+ &new_impl.bo);
 if (result != VK_SUCCESS)
 return result;

 if (new_impl.bo->size < 4096) {
- anv_bo_cache_release(device, &device->bo_cache, new_impl.bo);
- return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR);
+ anv_device_release_bo(device, new_impl.bo);
+ return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE);
 }

 /* If we're going to use this as a fence, we need to *not* have the
@@ -1128,10 +2345,38 @@ VkResult anv_ImportSemaphoreFdKHR(
 break;

 case VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT:
- new_impl = (struct anv_semaphore_impl) {
- .type = ANV_SEMAPHORE_TYPE_SYNC_FILE,
- .fd = fd,
- };
+ if (device->physical->has_syncobj) {
+ uint32_t create_flags = 0;
+
+ if (fd == -1)
+ create_flags |= DRM_SYNCOBJ_CREATE_SIGNALED;
+
+ new_impl = (struct anv_semaphore_impl) {
+ .type = ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ,
+ .syncobj = anv_gem_syncobj_create(device, create_flags),
+ };
+
+ if (!new_impl.syncobj)
+ return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ if (fd != -1) {
+ if (anv_gem_syncobj_import_sync_file(device, new_impl.syncobj, fd)) {
+ anv_gem_syncobj_destroy(device, new_impl.syncobj);
+ return vk_errorf(device, NULL, VK_ERROR_INVALID_EXTERNAL_HANDLE,
+ "syncobj sync file import failed: %m");
+ }
+ /* Ownership of the FD is transferred to Anv. Since we don't need it
+ * anymore because the associated fence has been put into a syncobj,
+ * we must close the FD.
+ */
+ close(fd);
+ }
+ } else {
+ new_impl = (struct anv_semaphore_impl) {
+ .type = ANV_SEMAPHORE_TYPE_SYNC_FILE,
+ .fd = fd,
+ };
+ }
 break;

 default:
@@ -1167,25 +2412,20 @@ VkResult anv_GetSemaphoreFdKHR(
 switch (impl->type) {
 case ANV_SEMAPHORE_TYPE_BO:
- result = anv_bo_cache_export(device, &device->bo_cache, impl->bo, pFd);
+ result = anv_device_export_bo(device, impl->bo, pFd);
 if (result != VK_SUCCESS)
 return result;
 break;

- case ANV_SEMAPHORE_TYPE_SYNC_FILE:
- /* There are two reasons why this could happen:
- *
- * 1) The user is trying to export without submitting something that
- * signals the semaphore. If this is the case, it's their bug so
- * what we return here doesn't matter.
- *
- * 2) The kernel didn't give us a file descriptor. The most likely
- * reason for this is running out of file descriptors.
+ case ANV_SEMAPHORE_TYPE_SYNC_FILE: {
+ /* There's a potential race here with vkQueueSubmit if you are trying
+ * to export a semaphore FD while the queue submit is still happening.
+ * This can happen if we see all dependencies get resolved via timeline
+ * semaphore waits completing before the execbuf completes and we
+ * process the resulting out fence. To work around this, take a lock
+ * around grabbing the fd.
 */
- if (impl->fd < 0)
- return vk_error(VK_ERROR_TOO_MANY_OBJECTS);
-
- *pFd = impl->fd;
+ pthread_mutex_lock(&device->mutex);

 /* From the Vulkan 1.0.53 spec:
 *
 * "Export operations have the same transference as the specified
@@ -1197,10 +2437,45 @@ VkResult anv_GetSemaphoreFdKHR(
 * considered to have been waited on and no longer has a sync file
 * attached.
*/ + int fd = impl->fd; impl->fd = -1; + + pthread_mutex_unlock(&device->mutex); + + /* There are two reasons why this could happen: + * + * 1) The user is trying to export without submitting something that + * signals the semaphore. If this is the case, it's their bug so + * what we return here doesn't matter. + * + * 2) The kernel didn't give us a file descriptor. The most likely + * reason for this is running out of file descriptors. + */ + if (fd < 0) + return vk_error(VK_ERROR_TOO_MANY_OBJECTS); + + *pFd = fd; return VK_SUCCESS; + } case ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ: + if (pGetFdInfo->handleType == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT) { + VkResult result = wait_syncobj_materialize(device, impl->syncobj, pFd); + if (result != VK_SUCCESS) + return result; + + fd = anv_gem_syncobj_export_sync_file(device, impl->syncobj); + } else { + assert(pGetFdInfo->handleType == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT); + fd = anv_gem_syncobj_handle_to_fd(device, impl->syncobj); + } + if (fd < 0) + return vk_error(VK_ERROR_TOO_MANY_OBJECTS); + *pFd = fd; + break; + + case ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ_TIMELINE: + assert(pGetFdInfo->handleType == VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT); fd = anv_gem_syncobj_handle_to_fd(device, impl->syncobj); if (fd < 0) return vk_error(VK_ERROR_TOO_MANY_OBJECTS); @@ -1223,3 +2498,263 @@ VkResult anv_GetSemaphoreFdKHR( return VK_SUCCESS; } + +VkResult anv_GetSemaphoreCounterValue( + VkDevice _device, + VkSemaphore _semaphore, + uint64_t* pValue) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_semaphore, semaphore, _semaphore); + + struct anv_semaphore_impl *impl = + semaphore->temporary.type != ANV_SEMAPHORE_TYPE_NONE ? + &semaphore->temporary : &semaphore->permanent; + + switch (impl->type) { + case ANV_SEMAPHORE_TYPE_TIMELINE: { + pthread_mutex_lock(&device->mutex); + anv_timeline_gc_locked(device, &impl->timeline); + *pValue = impl->timeline.highest_past; + pthread_mutex_unlock(&device->mutex); + return VK_SUCCESS; + } + + case ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ_TIMELINE: { + int ret = anv_gem_syncobj_timeline_query(device, &impl->syncobj, pValue, 1); + + if (ret != 0) + return anv_device_set_lost(device, "unable to query timeline syncobj"); + + return VK_SUCCESS; + } + + default: + unreachable("Invalid semaphore type"); + } +} + +static VkResult +anv_timeline_wait_locked(struct anv_device *device, + struct anv_timeline *timeline, + uint64_t serial, uint64_t abs_timeout_ns) +{ + /* Wait on the queue_submit condition variable until the timeline has a + * time point pending that's at least as high as serial. + */ + while (timeline->highest_pending < serial) { + struct timespec abstime = { + .tv_sec = abs_timeout_ns / NSEC_PER_SEC, + .tv_nsec = abs_timeout_ns % NSEC_PER_SEC, + }; + + UNUSED int ret = pthread_cond_timedwait(&device->queue_submit, + &device->mutex, &abstime); + assert(ret != EINVAL); + if (anv_gettime_ns() >= abs_timeout_ns && + timeline->highest_pending < serial) + return VK_TIMEOUT; + } + + while (1) { + VkResult result = anv_timeline_gc_locked(device, timeline); + if (result != VK_SUCCESS) + return result; + + if (timeline->highest_past >= serial) + return VK_SUCCESS; + + /* If we got here, our earliest time point has a busy BO */ + struct anv_timeline_point *point = + list_first_entry(&timeline->points, + struct anv_timeline_point, link); + + /* Drop the lock while we wait. 
*/
+ point->waiting++;
+ pthread_mutex_unlock(&device->mutex);
+
+ result = anv_device_wait(device, point->bo,
+ anv_get_relative_timeout(abs_timeout_ns));
+
+ /* Pick the mutex back up */
+ pthread_mutex_lock(&device->mutex);
+ point->waiting--;
+
+ /* This covers both VK_TIMEOUT and VK_ERROR_DEVICE_LOST */
+ if (result != VK_SUCCESS)
+ return result;
+ }
+}
+
+static VkResult
+anv_timelines_wait(struct anv_device *device,
+ struct anv_timeline **timelines,
+ const uint64_t *serials,
+ uint32_t n_timelines,
+ bool wait_all,
+ uint64_t abs_timeout_ns)
+{
+ if (!wait_all && n_timelines > 1) {
+ pthread_mutex_lock(&device->mutex);
+
+ while (1) {
+ VkResult result;
+ for (uint32_t i = 0; i < n_timelines; i++) {
+ result =
+ anv_timeline_wait_locked(device, timelines[i], serials[i], 0);
+ if (result != VK_TIMEOUT)
+ break;
+ }
+
+ if (result != VK_TIMEOUT ||
+ anv_gettime_ns() >= abs_timeout_ns) {
+ pthread_mutex_unlock(&device->mutex);
+ return result;
+ }
+
+ /* If none of them are ready, do a short wait so we don't completely
+ * spin while holding the lock. The 10us is completely arbitrary.
+ */
+ uint64_t abs_short_wait_ns =
+ anv_get_absolute_timeout(
+ MIN2((anv_gettime_ns() - abs_timeout_ns) / 10, 10 * 1000));
+ struct timespec abstime = {
+ .tv_sec = abs_short_wait_ns / NSEC_PER_SEC,
+ .tv_nsec = abs_short_wait_ns % NSEC_PER_SEC,
+ };
+ ASSERTED int ret;
+ ret = pthread_cond_timedwait(&device->queue_submit,
+ &device->mutex, &abstime);
+ assert(ret != EINVAL);
+ }
+ } else {
+ VkResult result = VK_SUCCESS;
+ pthread_mutex_lock(&device->mutex);
+ for (uint32_t i = 0; i < n_timelines; i++) {
+ result =
+ anv_timeline_wait_locked(device, timelines[i],
+ serials[i], abs_timeout_ns);
+ if (result != VK_SUCCESS)
+ break;
+ }
+ pthread_mutex_unlock(&device->mutex);
+ return result;
+ }
+}
+
+VkResult anv_WaitSemaphores(
+ VkDevice _device,
+ const VkSemaphoreWaitInfoKHR* pWaitInfo,
+ uint64_t timeout)
+{
+ ANV_FROM_HANDLE(anv_device, device, _device);
+ uint32_t *handles;
+ struct anv_timeline **timelines;
+ uint64_t *values;
+
+ ANV_MULTIALLOC(ma);
+
+ anv_multialloc_add(&ma, &values, pWaitInfo->semaphoreCount);
+ if (device->has_thread_submit) {
+ anv_multialloc_add(&ma, &handles, pWaitInfo->semaphoreCount);
+ } else {
+ anv_multialloc_add(&ma, &timelines, pWaitInfo->semaphoreCount);
+ }
+
+ if (!anv_multialloc_alloc(&ma, &device->vk.alloc,
+ VK_SYSTEM_ALLOCATION_SCOPE_COMMAND))
+ return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+
+ uint32_t handle_count = 0;
+ for (uint32_t i = 0; i < pWaitInfo->semaphoreCount; i++) {
+ ANV_FROM_HANDLE(anv_semaphore, semaphore, pWaitInfo->pSemaphores[i]);
+ struct anv_semaphore_impl *impl =
+ semaphore->temporary.type != ANV_SEMAPHORE_TYPE_NONE ?
+ &semaphore->temporary : &semaphore->permanent;
+
+ if (pWaitInfo->pValues[i] == 0)
+ continue;
+
+ if (device->has_thread_submit) {
+ assert(impl->type == ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ_TIMELINE);
+ handles[handle_count] = impl->syncobj;
+ } else {
+ assert(impl->type == ANV_SEMAPHORE_TYPE_TIMELINE);
+ timelines[handle_count] = &impl->timeline;
+ }
+ values[handle_count] = pWaitInfo->pValues[i];
+ handle_count++;
+ }
+
+ VkResult result = VK_SUCCESS;
+ if (handle_count > 0) {
+ if (device->has_thread_submit) {
+ int ret =
+ anv_gem_syncobj_timeline_wait(device,
+ handles, values, handle_count,
+ anv_get_absolute_timeout(timeout),
+ !(pWaitInfo->flags & VK_SEMAPHORE_WAIT_ANY_BIT_KHR),
+ false);
+ if (ret != 0)
+ result = errno == ETIME ?
VK_TIMEOUT : + anv_device_set_lost(device, "unable to wait on timeline syncobj"); + } else { + result = + anv_timelines_wait(device, timelines, values, handle_count, + !(pWaitInfo->flags & VK_SEMAPHORE_WAIT_ANY_BIT_KHR), + anv_get_absolute_timeout(timeout)); + } + } + + vk_free(&device->vk.alloc, values); + + return result; +} + +VkResult anv_SignalSemaphore( + VkDevice _device, + const VkSemaphoreSignalInfoKHR* pSignalInfo) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + ANV_FROM_HANDLE(anv_semaphore, semaphore, pSignalInfo->semaphore); + + struct anv_semaphore_impl *impl = + semaphore->temporary.type != ANV_SEMAPHORE_TYPE_NONE ? + &semaphore->temporary : &semaphore->permanent; + + switch (impl->type) { + case ANV_SEMAPHORE_TYPE_TIMELINE: { + pthread_mutex_lock(&device->mutex); + + VkResult result = anv_timeline_gc_locked(device, &impl->timeline); + + assert(pSignalInfo->value > impl->timeline.highest_pending); + + impl->timeline.highest_pending = impl->timeline.highest_past = pSignalInfo->value; + + if (result == VK_SUCCESS) + result = anv_device_submit_deferred_locked(device); + + pthread_cond_broadcast(&device->queue_submit); + pthread_mutex_unlock(&device->mutex); + return result; + } + + case ANV_SEMAPHORE_TYPE_DRM_SYNCOBJ_TIMELINE: { + /* Timeline semaphores are created with a value of 0, so signaling on 0 + * is a waste of time. + */ + if (pSignalInfo->value == 0) + return VK_SUCCESS; + + int ret = anv_gem_syncobj_timeline_signal(device, &impl->syncobj, + &pSignalInfo->value, 1); + + return ret == 0 ? VK_SUCCESS : + anv_device_set_lost(device, "unable to signal timeline syncobj"); + } + + default: + unreachable("Invalid semaphore type"); + } +}
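
Editorial sketches (not part of the patch): the examples below illustrate the
user-facing Vulkan behavior the changes above implement. Any helper names
introduced here are hypothetical, and each snippet is a minimal sketch rather
than anv code.

The fd == -1 special case added to anv_ImportFenceFdKHR corresponds to an
application importing an already-signaled sync file, which is why the syncobj
is created with DRM_SYNCOBJ_CREATE_SIGNALED. A minimal sketch, assuming a
valid device and fence and that the VK_KHR_external_fence_fd entry point has
been resolved:

   #include <vulkan/vulkan.h>

   static VkResult import_signaled_sync_fd(VkDevice device, VkFence fence)
   {
      const VkImportFenceFdInfoKHR import_info = {
         .sType = VK_STRUCTURE_TYPE_IMPORT_FENCE_FD_INFO_KHR,
         .fence = fence,
         /* Sync FD handles have copy transference, so the spec requires
          * the temporary-import flag here.
          */
         .flags = VK_FENCE_IMPORT_TEMPORARY_BIT,
         .handleType = VK_EXTERNAL_FENCE_HANDLE_TYPE_SYNC_FD_BIT,
         .fd = -1, /* treated as a valid, already-signaled sync file */
      };
      return vkImportFenceFdKHR(device, &import_info);
   }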
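
wait_syncobj_materialize() exists because, with a submission thread, a binary
syncobj may not yet carry a fence at the moment the application asks to
export it. At the libdrm level this maps onto the WAIT_AVAILABLE flavor of
the timeline wait; a sketch under the assumption that
anv_gem_syncobj_timeline_wait() wraps DRM_IOCTL_SYNCOBJ_TIMELINE_WAIT (the
helper name below is made up):

   #include <stdint.h>
   #include <xf86drm.h>

   static int wait_fence_materialize(int drm_fd, uint32_t syncobj,
                                     int64_t abs_timeout_ns)
   {
      uint64_t point = 0; /* binary payload */

      /* WAIT_AVAILABLE returns once a fence is attached to the syncobj,
       * without also waiting for that fence to signal.
       */
      return drmSyncobjTimelineWait(drm_fd, &syncobj, &point, 1,
                                    abs_timeout_ns,
                                    DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL |
                                    DRM_SYNCOBJ_WAIT_FLAGS_WAIT_AVAILABLE,
                                    NULL /* first_signaled */);
   }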
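
anv_semaphore_ref()/anv_semaphore_unref() make semaphores refcounted so that
deferred or threaded submission can keep one alive after the application
calls vkDestroySemaphore. The same pattern in portable C11 atomics (the
"object" type is a hypothetical stand-in; mesa's p_atomic_* helpers behave
equivalently):

   #include <stdatomic.h>
   #include <stdlib.h>

   struct object {
      atomic_int refcount; /* set to 1 on creation */
   };

   static struct object *object_ref(struct object *obj)
   {
      atomic_fetch_add(&obj->refcount, 1);
      return obj;
   }

   static void object_unref(struct object *obj)
   {
      /* The analogue of p_atomic_dec_zero(): only the thread that drops
       * the count to zero frees the object.
       */
      if (atomic_fetch_sub(&obj->refcount, 1) == 1)
         free(obj);
   }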
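
The mutex taken in anv_GetSemaphoreFdKHR guards the export path an
application drives as shown below; note that exporting with copy transference
also resets the semaphore's payload, per the spec text quoted in the diff. A
sketch, with error handling reduced to a sentinel:

   #include <vulkan/vulkan.h>

   static int export_semaphore_sync_fd(VkDevice device, VkSemaphore sem)
   {
      const VkSemaphoreGetFdInfoKHR get_info = {
         .sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR,
         .semaphore = sem,
         .handleType = VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_SYNC_FD_BIT,
      };
      int fd = -1;

      if (vkGetSemaphoreFdKHR(device, &get_info, &fd) != VK_SUCCESS)
         return -1;
      return fd; /* the caller owns the FD: close() it or pass it on */
   }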
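
Finally, anv_CreateSemaphore, anv_SignalSemaphore, anv_WaitSemaphores and
anv_GetSemaphoreCounterValue together back VK_KHR_timeline_semaphore. A
minimal host-side round trip, assuming a device exposing the
timelineSemaphore feature (the KHR entry points are aliases of the Vulkan 1.2
core ones):

   #include <stdint.h>
   #include <vulkan/vulkan.h>

   static void timeline_round_trip(VkDevice device)
   {
      const VkSemaphoreTypeCreateInfoKHR type_info = {
         .sType = VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO_KHR,
         .semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE_KHR,
         .initialValue = 0, /* what get_semaphore_type() reads back */
      };
      const VkSemaphoreCreateInfo create_info = {
         .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO,
         .pNext = &type_info,
      };
      VkSemaphore sem;
      vkCreateSemaphore(device, &create_info, NULL, &sem);

      /* Host-side signal: ends up in anv_SignalSemaphore. */
      const VkSemaphoreSignalInfoKHR signal_info = {
         .sType = VK_STRUCTURE_TYPE_SEMAPHORE_SIGNAL_INFO_KHR,
         .semaphore = sem,
         .value = 1,
      };
      vkSignalSemaphoreKHR(device, &signal_info);

      /* Host-side wait: ends up in anv_WaitSemaphores. */
      const uint64_t wait_value = 1;
      const VkSemaphoreWaitInfoKHR wait_info = {
         .sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO_KHR,
         .semaphoreCount = 1,
         .pSemaphores = &sem,
         .pValues = &wait_value,
      };
      vkWaitSemaphoresKHR(device, &wait_info, UINT64_MAX);

      uint64_t value = 0;
      vkGetSemaphoreCounterValueKHR(device, sem, &value); /* >= 1 here */

      vkDestroySemaphore(device, sem, NULL);
   }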