From: Samuel Pitoiset Date: Mon, 13 Jul 2020 11:59:48 +0000 (+0200) Subject: radv: track and report if a logical device is lost X-Git-Url: https://git.libre-soc.org/?p=mesa.git;a=commitdiff_plain;h=d26f62c667099fc3d30a3155335ca4f0e73c8d88 radv: track and report if a logical device is lost This currently covers two situations where it's obvious that the GPU hung: 1) when wait-of-idle doesn't finish in a finite time 2) when a CS submission is cancelled by the kernel There is still probably some other situations that aren't yet handled. According to the Vulkan spec, some operations should return VK_ERROR_DEVICE_LOST when the corresponding logical device is known to be lost. Signed-off-by: Samuel Pitoiset Reviewed-by: Bas Nieuwenhuizen Part-of: --- diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c index dda535d5538..5b93083913b 100644 --- a/src/amd/vulkan/radv_device.c +++ b/src/amd/vulkan/radv_device.c @@ -2590,6 +2590,25 @@ static void radv_device_finish_border_color(struct radv_device *device) } } +VkResult +_radv_device_set_lost(struct radv_device *device, + const char *file, int line, + const char *msg, ...) +{ + VkResult err; + va_list ap; + + p_atomic_inc(&device->lost); + + va_start(ap, msg); + err = __vk_errorv(device->physical_device->instance, device, + VK_DEBUG_REPORT_OBJECT_TYPE_DEVICE_EXT, + VK_ERROR_DEVICE_LOST, file, line, msg, ap); + va_end(ap); + + return err; +} + VkResult radv_CreateDevice( VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo* pCreateInfo, @@ -4503,7 +4522,7 @@ fail: * VK_ERROR_DEVICE_LOST to ensure the clients do not attempt * to submit the same job again to this device. */ - result = VK_ERROR_DEVICE_LOST; + result = radv_device_set_lost(queue->device, "vkQueueSubmit() failed"); } radv_free_temp_syncobjs(queue->device, @@ -4724,6 +4743,9 @@ VkResult radv_QueueSubmit( uint32_t fence_idx = 0; bool flushed_caches = false; + if (radv_device_is_lost(queue->device)) + return VK_ERROR_DEVICE_LOST; + if (fence != VK_NULL_HANDLE) { for (uint32_t i = 0; i < submitCount; ++i) if (radv_submit_has_effects(pSubmits + i)) @@ -4793,6 +4815,9 @@ VkResult radv_QueueWaitIdle( { RADV_FROM_HANDLE(radv_queue, queue, _queue); + if (radv_device_is_lost(queue->device)) + return VK_ERROR_DEVICE_LOST; + pthread_mutex_lock(&queue->pending_mutex); while (!list_is_empty(&queue->pending_submissions)) { pthread_cond_wait(&queue->device->timeline_cond, &queue->pending_mutex); @@ -4802,9 +4827,10 @@ VkResult radv_QueueWaitIdle( if (!queue->device->ws->ctx_wait_idle(queue->hw_ctx, radv_queue_family_to_ring(queue->queue_family_index), queue->queue_idx)) { - return vk_errorf(queue->device->instance, VK_ERROR_DEVICE_LOST, - "Failed to wait for a '%s' queue to be idle. " - "GPU hang ?", radv_get_queue_family_name(queue)); + return radv_device_set_lost(queue->device, + "Failed to wait for a '%s' queue " + "to be idle. GPU hang ?", + radv_get_queue_family_name(queue)); } return VK_SUCCESS; @@ -5471,6 +5497,9 @@ static bool radv_sparse_bind_has_effects(const VkBindSparseInfo *info) VkResult result; uint32_t fence_idx = 0; + if (radv_device_is_lost(queue->device)) + return VK_ERROR_DEVICE_LOST; + if (fence != VK_NULL_HANDLE) { for (uint32_t i = 0; i < bindInfoCount; ++i) if (radv_sparse_bind_has_effects(pBindInfo + i)) @@ -5653,6 +5682,10 @@ VkResult radv_WaitForFences( uint64_t timeout) { RADV_FROM_HANDLE(radv_device, device, _device); + + if (radv_device_is_lost(device)) + return VK_ERROR_DEVICE_LOST; + timeout = radv_get_absolute_timeout(timeout); if (device->always_use_syncobj && @@ -5809,6 +5842,9 @@ VkResult radv_GetFenceStatus(VkDevice _device, VkFence _fence) fence->temporary.kind != RADV_FENCE_NONE ? &fence->temporary : &fence->permanent; + if (radv_device_is_lost(device)) + return VK_ERROR_DEVICE_LOST; + switch (part->kind) { case RADV_FENCE_NONE: break; @@ -6134,6 +6170,9 @@ radv_GetSemaphoreCounterValue(VkDevice _device, RADV_FROM_HANDLE(radv_device, device, _device); RADV_FROM_HANDLE(radv_semaphore, semaphore, _semaphore); + if (radv_device_is_lost(device)) + return VK_ERROR_DEVICE_LOST; + struct radv_semaphore_part *part = semaphore->temporary.kind != RADV_SEMAPHORE_NONE ? &semaphore->temporary : &semaphore->permanent; @@ -6191,6 +6230,10 @@ radv_WaitSemaphores(VkDevice _device, uint64_t timeout) { RADV_FROM_HANDLE(radv_device, device, _device); + + if (radv_device_is_lost(device)) + return VK_ERROR_DEVICE_LOST; + uint64_t abs_timeout = radv_get_absolute_timeout(timeout); if (radv_semaphore_from_handle(pWaitInfo->pSemaphores[0])->permanent.kind == RADV_SEMAPHORE_TIMELINE) @@ -6327,8 +6370,12 @@ VkResult radv_GetEventStatus( VkDevice _device, VkEvent _event) { + RADV_FROM_HANDLE(radv_device, device, _device); RADV_FROM_HANDLE(radv_event, event, _event); + if (radv_device_is_lost(device)) + return VK_ERROR_DEVICE_LOST; + if (*event->map == 1) return VK_EVENT_SET; return VK_EVENT_RESET; diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h index 22235e530c1..1f5c4403b4e 100644 --- a/src/amd/vulkan/radv_private.h +++ b/src/amd/vulkan/radv_private.h @@ -851,8 +851,25 @@ struct radv_device { bool overallocation_disallowed; uint64_t allocated_memory_size[VK_MAX_MEMORY_HEAPS]; mtx_t overallocation_mutex; + + /* Track the number of device loss occurs. */ + int lost; }; +VkResult _radv_device_set_lost(struct radv_device *device, + const char *file, int line, + const char *msg, ...) + radv_printflike(4, 5); + +#define radv_device_set_lost(dev, ...) \ + _radv_device_set_lost(dev, __FILE__, __LINE__, __VA_ARGS__) + +static inline bool +radv_device_is_lost(const struct radv_device *device) +{ + return unlikely(p_atomic_read(&device->lost)); +} + struct radv_device_memory { struct vk_object_base base; struct radeon_winsys_bo *bo; diff --git a/src/amd/vulkan/radv_query.c b/src/amd/vulkan/radv_query.c index 1331028abb0..feeb5d84512 100644 --- a/src/amd/vulkan/radv_query.c +++ b/src/amd/vulkan/radv_query.c @@ -1368,6 +1368,9 @@ VkResult radv_GetQueryPoolResults( char *data = pData; VkResult result = VK_SUCCESS; + if (radv_device_is_lost(device)) + return VK_ERROR_DEVICE_LOST; + for(unsigned i = 0; i < queryCount; ++i, data += stride) { char *dest = data; unsigned query = firstQuery + i;