This currently covers two situations where it's obvious that
the GPU hung:
1) when wait-for-idle doesn't finish in a finite time
2) when a CS submission is cancelled by the kernel
There are probably still other situations that aren't handled yet.
According to the Vulkan spec, some operations should return
VK_ERROR_DEVICE_LOST when the corresponding logical device is
known to be lost.
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/5878>
+VkResult
+_radv_device_set_lost(struct radv_device *device,
+ const char *file, int line, /* call site; supplied by the radv_device_set_lost() macro */
+ const char *msg, ...) /* printf-style format string followed by its arguments */
+{
+ VkResult err;
+ va_list ap;
+
+ p_atomic_inc(&device->lost); /* bump the loss counter that radv_device_is_lost() reads */
+
+ va_start(ap, msg);
+ err = __vk_errorv(device->physical_device->instance, device,
+ VK_DEBUG_REPORT_OBJECT_TYPE_DEVICE_EXT,
+ VK_ERROR_DEVICE_LOST, file, line, msg, ap); /* hand VK_ERROR_DEVICE_LOST, the call site and the message to the error reporter */
+ va_end(ap);
+
+ return err; /* value produced by __vk_errorv; presumably VK_ERROR_DEVICE_LOST - confirm against its definition */
+}
+
VkResult radv_CreateDevice(
VkPhysicalDevice physicalDevice,
const VkDeviceCreateInfo* pCreateInfo,
VkResult radv_CreateDevice(
VkPhysicalDevice physicalDevice,
const VkDeviceCreateInfo* pCreateInfo,
* VK_ERROR_DEVICE_LOST to ensure the clients do not attempt
* to submit the same job again to this device.
*/
* VK_ERROR_DEVICE_LOST to ensure the clients do not attempt
* to submit the same job again to this device.
*/
- result = VK_ERROR_DEVICE_LOST;
+ result = radv_device_set_lost(queue->device, "vkQueueSubmit() failed");
}
radv_free_temp_syncobjs(queue->device,
}
radv_free_temp_syncobjs(queue->device,
uint32_t fence_idx = 0;
bool flushed_caches = false;
uint32_t fence_idx = 0;
bool flushed_caches = false;
+ if (radv_device_is_lost(queue->device))
+ return VK_ERROR_DEVICE_LOST;
+
if (fence != VK_NULL_HANDLE) {
for (uint32_t i = 0; i < submitCount; ++i)
if (radv_submit_has_effects(pSubmits + i))
if (fence != VK_NULL_HANDLE) {
for (uint32_t i = 0; i < submitCount; ++i)
if (radv_submit_has_effects(pSubmits + i))
{
RADV_FROM_HANDLE(radv_queue, queue, _queue);
{
RADV_FROM_HANDLE(radv_queue, queue, _queue);
+ if (radv_device_is_lost(queue->device))
+ return VK_ERROR_DEVICE_LOST;
+
pthread_mutex_lock(&queue->pending_mutex);
while (!list_is_empty(&queue->pending_submissions)) {
pthread_cond_wait(&queue->device->timeline_cond, &queue->pending_mutex);
pthread_mutex_lock(&queue->pending_mutex);
while (!list_is_empty(&queue->pending_submissions)) {
pthread_cond_wait(&queue->device->timeline_cond, &queue->pending_mutex);
if (!queue->device->ws->ctx_wait_idle(queue->hw_ctx,
radv_queue_family_to_ring(queue->queue_family_index),
queue->queue_idx)) {
if (!queue->device->ws->ctx_wait_idle(queue->hw_ctx,
radv_queue_family_to_ring(queue->queue_family_index),
queue->queue_idx)) {
- return vk_errorf(queue->device->instance, VK_ERROR_DEVICE_LOST,
- "Failed to wait for a '%s' queue to be idle. "
- "GPU hang ?", radv_get_queue_family_name(queue));
+ return radv_device_set_lost(queue->device,
+ "Failed to wait for a '%s' queue "
+ "to be idle. GPU hang ?",
+ radv_get_queue_family_name(queue));
VkResult result;
uint32_t fence_idx = 0;
VkResult result;
uint32_t fence_idx = 0;
+ if (radv_device_is_lost(queue->device))
+ return VK_ERROR_DEVICE_LOST;
+
if (fence != VK_NULL_HANDLE) {
for (uint32_t i = 0; i < bindInfoCount; ++i)
if (radv_sparse_bind_has_effects(pBindInfo + i))
if (fence != VK_NULL_HANDLE) {
for (uint32_t i = 0; i < bindInfoCount; ++i)
if (radv_sparse_bind_has_effects(pBindInfo + i))
uint64_t timeout)
{
RADV_FROM_HANDLE(radv_device, device, _device);
uint64_t timeout)
{
RADV_FROM_HANDLE(radv_device, device, _device);
+
+ if (radv_device_is_lost(device))
+ return VK_ERROR_DEVICE_LOST;
+
timeout = radv_get_absolute_timeout(timeout);
if (device->always_use_syncobj &&
timeout = radv_get_absolute_timeout(timeout);
if (device->always_use_syncobj &&
fence->temporary.kind != RADV_FENCE_NONE ?
&fence->temporary : &fence->permanent;
fence->temporary.kind != RADV_FENCE_NONE ?
&fence->temporary : &fence->permanent;
+ if (radv_device_is_lost(device))
+ return VK_ERROR_DEVICE_LOST;
+
switch (part->kind) {
case RADV_FENCE_NONE:
break;
switch (part->kind) {
case RADV_FENCE_NONE:
break;
RADV_FROM_HANDLE(radv_device, device, _device);
RADV_FROM_HANDLE(radv_semaphore, semaphore, _semaphore);
RADV_FROM_HANDLE(radv_device, device, _device);
RADV_FROM_HANDLE(radv_semaphore, semaphore, _semaphore);
+ if (radv_device_is_lost(device))
+ return VK_ERROR_DEVICE_LOST;
+
struct radv_semaphore_part *part =
semaphore->temporary.kind != RADV_SEMAPHORE_NONE ? &semaphore->temporary : &semaphore->permanent;
struct radv_semaphore_part *part =
semaphore->temporary.kind != RADV_SEMAPHORE_NONE ? &semaphore->temporary : &semaphore->permanent;
uint64_t timeout)
{
RADV_FROM_HANDLE(radv_device, device, _device);
uint64_t timeout)
{
RADV_FROM_HANDLE(radv_device, device, _device);
+
+ if (radv_device_is_lost(device))
+ return VK_ERROR_DEVICE_LOST;
+
uint64_t abs_timeout = radv_get_absolute_timeout(timeout);
if (radv_semaphore_from_handle(pWaitInfo->pSemaphores[0])->permanent.kind == RADV_SEMAPHORE_TIMELINE)
uint64_t abs_timeout = radv_get_absolute_timeout(timeout);
if (radv_semaphore_from_handle(pWaitInfo->pSemaphores[0])->permanent.kind == RADV_SEMAPHORE_TIMELINE)
VkDevice _device,
VkEvent _event)
{
VkDevice _device,
VkEvent _event)
{
+ RADV_FROM_HANDLE(radv_device, device, _device);
RADV_FROM_HANDLE(radv_event, event, _event);
RADV_FROM_HANDLE(radv_event, event, _event);
+ if (radv_device_is_lost(device))
+ return VK_ERROR_DEVICE_LOST;
+
if (*event->map == 1)
return VK_EVENT_SET;
return VK_EVENT_RESET;
if (*event->map == 1)
return VK_EVENT_SET;
return VK_EVENT_RESET;
bool overallocation_disallowed;
uint64_t allocated_memory_size[VK_MAX_MEMORY_HEAPS];
mtx_t overallocation_mutex;
bool overallocation_disallowed;
uint64_t allocated_memory_size[VK_MAX_MEMORY_HEAPS];
mtx_t overallocation_mutex;
+
+ /* Track the number of times the device has been lost. */
+ int lost;
+VkResult _radv_device_set_lost(struct radv_device *device,
+ const char *file, int line,
+ const char *msg, ...)
+ radv_printflike(4, 5); /* msg is parameter 4 and its variadic arguments start at 5 */
+ /* Wrapper that passes the caller's __FILE__ and __LINE__ to _radv_device_set_lost(). */
+#define radv_device_set_lost(dev, ...) \
+ _radv_device_set_lost(dev, __FILE__, __LINE__, __VA_ARGS__)
+
+static inline bool
+radv_device_is_lost(const struct radv_device *device)
+{
+ return unlikely(p_atomic_read(&device->lost)); /* true once the loss counter is non-zero; unlikely() is presumably a branch hint marking this as the cold path */
+}
+
struct radv_device_memory {
struct vk_object_base base;
struct radeon_winsys_bo *bo;
struct radv_device_memory {
struct vk_object_base base;
struct radeon_winsys_bo *bo;
char *data = pData;
VkResult result = VK_SUCCESS;
char *data = pData;
VkResult result = VK_SUCCESS;
+ if (radv_device_is_lost(device))
+ return VK_ERROR_DEVICE_LOST;
+
for(unsigned i = 0; i < queryCount; ++i, data += stride) {
char *dest = data;
unsigned query = firstQuery + i;
for(unsigned i = 0; i < queryCount; ++i, data += stride) {
char *dest = data;
unsigned query = firstQuery + i;