anv: Query the kernel for reset status

author Jason Ekstrand <jason.ekstrand@intel.com>
Mon, 27 Mar 2017 23:03:57 +0000 (16:03 -0700)
committer Jason Ekstrand <jason.ekstrand@intel.com>
Wed, 5 Apr 2017 01:33:52 +0000 (18:33 -0700)
diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c

index 45a6e333685c9fa2149e972020361b88a5d70422..9e860d51896a141086b5aa4c7d9161db8ef34e6a 100644 (file)
--- a/src/intel/vulkan/anv_device.c
+++ b/src/intel/vulkan/anv_device.c
@@ -888,8 +888,6 @@ anv_device_submit_simple_batch(struct anv_device *device,
     struct anv_bo bo, *exec_bos[1];
     VkResult result = VK_SUCCESS;
     uint32_t size;
-   int64_t timeout;
-   int ret;
  
     /* Kernel driver requires 8 byte aligned batch length */
     size = align_u32(batch->next - batch->start, 8);
@@ -929,14 +927,7 @@ anv_device_submit_simple_batch(struct anv_device *device,
     if (result != VK_SUCCESS)
        goto fail;
  
-   timeout = INT64_MAX;
-   ret = anv_gem_wait(device, bo.gem_handle, &timeout);
-   if (ret != 0) {
-      /* We don't know the real error. */
-      device->lost = true;
-      result = vk_errorf(VK_ERROR_DEVICE_LOST, "execbuf2 failed: %m");
-      goto fail;
-   }
+   result = anv_device_wait(device, &bo, INT64_MAX);
  
   fail:
     anv_bo_pool_free(&device->batch_bo_pool, &bo);
@@ -1268,6 +1259,58 @@ anv_device_execbuf(struct anv_device *device,
     return VK_SUCCESS;
  }
  
+VkResult
+anv_device_query_status(struct anv_device *device)
+{
+   /* Report DEVICE_LOST if the device is, or is found to be, hung.  Most
+    * callers already check device->lost, so this early-out is rarely taken;
+    * however, it is cheap and potentially lets us avoid an ioctl.
+    */
+   if (unlikely(device->lost))
+      return VK_ERROR_DEVICE_LOST;
+
+   uint32_t active, pending;
+   int ret = anv_gem_gpu_get_reset_stats(device, &active, &pending);
+   if (ret == -1) {
+      /* The query itself failed; we don't know the real error (%m only). */
+      device->lost = true;
+      return vk_errorf(VK_ERROR_DEVICE_LOST, "get_reset_stats failed: %m");
+   }
+
+   if (active) {
+      device->lost = true;
+      return vk_errorf(VK_ERROR_DEVICE_LOST,
+                       "GPU hung on one of our command buffers");
+   } else if (pending) {
+      device->lost = true;
+      return vk_errorf(VK_ERROR_DEVICE_LOST,
+                       "GPU hung with commands in-flight");
+   }
+
+   return VK_SUCCESS;
+}
+
+VkResult
+anv_device_wait(struct anv_device *device, struct anv_bo *bo,
+                int64_t timeout)
+{
+   int ret = anv_gem_wait(device, bo->gem_handle, &timeout);
+   if (ret == -1 && errno == ETIME) {
+      return VK_TIMEOUT; /* wait timed out; not treated as device loss */
+   } else if (ret == -1) {
+      /* Any other failure: we don't know the real error, so mark it lost. */
+      device->lost = true;
+      return vk_errorf(VK_ERROR_DEVICE_LOST, "gem wait failed: %m");
+   }
+
+   /* The wait succeeded, but query the device status before reporting it.
+    * If the BO we waited on got caught in a GPU hang we don't want to return
+    * VK_SUCCESS to the client because it clearly doesn't have valid data.
+    * This most likely costs an ioctl, but we just did one to wait anyway.
+    */
+   return anv_device_query_status(device);
+}
+
  VkResult anv_QueueSubmit(
      VkQueue                                     _queue,
      uint32_t                                    submitCount,
@@ -1277,10 +1320,17 @@ VkResult anv_QueueSubmit(
     ANV_FROM_HANDLE(anv_queue, queue, _queue);
     ANV_FROM_HANDLE(anv_fence, fence, _fence);
     struct anv_device *device = queue->device;
-   if (unlikely(device->lost))
-      return VK_ERROR_DEVICE_LOST;
  
-   VkResult result = VK_SUCCESS;
+   /* Query for device status prior to submitting.  Technically, we don't need
+    * to do this.  However, if we have a client that's submitting piles of
+    * garbage, we would rather break as early as possible to keep the GPU
+    * hanging contained.  If we don't check here, we'll either be waiting for
+    * the kernel to kick us or we'll have to wait until the client waits on a
+    * fence before we actually know whether or not we've hung.
+    */
+   VkResult result = anv_device_query_status(device);
+   if (result != VK_SUCCESS)
+      return result;
  
     /* We lock around QueueSubmit for three main reasons:
      *
@@ -1806,9 +1856,6 @@ VkResult anv_GetFenceStatus(
     if (unlikely(device->lost))
        return VK_ERROR_DEVICE_LOST;
  
-   int64_t t = 0;
-   int ret;
-
     switch (fence->state) {
     case ANV_FENCE_STATE_RESET:
        /* If it hasn't even been sent off to the GPU yet, it's not ready */
@@ -1818,15 +1865,18 @@ VkResult anv_GetFenceStatus(
        /* It's been signaled, return success */
        return VK_SUCCESS;
  
-   case ANV_FENCE_STATE_SUBMITTED:
-      /* It's been submitted to the GPU but we don't know if it's done yet. */
-      ret = anv_gem_wait(device, fence->bo.gem_handle, &t);
-      if (ret == 0) {
+   case ANV_FENCE_STATE_SUBMITTED: {
+      VkResult result = anv_device_wait(device, &fence->bo, 0);
+      switch (result) {
+      case VK_SUCCESS:
           fence->state = ANV_FENCE_STATE_SIGNALED;
           return VK_SUCCESS;
-      } else {
+      case VK_TIMEOUT:
           return VK_NOT_READY;
+      default:
+         return result;
        }
+   }
     default:
        unreachable("Invalid fence status");
     }
@@ -1888,20 +1938,20 @@ VkResult anv_WaitForFences(
              /* These are the fences we really care about.  Go ahead and wait
               * on it until we hit a timeout.
               */
-            ret = anv_gem_wait(device, fence->bo.gem_handle, &timeout);
-            if (ret == -1 && errno == ETIME) {
-               result = VK_TIMEOUT;
-               goto done;
-            } else if (ret == -1) {
-               /* We don't know the real error. */
-                device->lost = true;
-               return vk_errorf(VK_ERROR_DEVICE_LOST, "gem wait failed: %m");
-            } else {
+            result = anv_device_wait(device, &fence->bo, timeout);
+            switch (result) {
+            case VK_SUCCESS:
                 fence->state = ANV_FENCE_STATE_SIGNALED;
                 signaled_fences = true;
                 if (!waitAll)
-                  return VK_SUCCESS;
-               continue;
+                  goto done;
+               break;
+
+            case VK_TIMEOUT:
+               goto done;
+
+            default:
+               return result;
              }
           }
        }
diff --git a/src/intel/vulkan/anv_gem.c b/src/intel/vulkan/anv_gem.c

index 0dde6d9d6719bb35f5c1e11b9bdebbd731d9bc03..7612f493aa04d0d94e8f9684993d71a45c7b0026 100644 (file)
--- a/src/intel/vulkan/anv_gem.c
+++ b/src/intel/vulkan/anv_gem.c
@@ -301,6 +301,23 @@ anv_gem_get_aperture(int fd, uint64_t *size)
     return 0;
  }
  
+int
+anv_gem_gpu_get_reset_stats(struct anv_device *device,
+                            uint32_t *active, uint32_t *pending)
+{
+   struct drm_i915_reset_stats stats = {
+      .ctx_id = device->context_id, /* query is keyed on the device's context */
+   };
+
+   int ret = anv_ioctl(device->fd, DRM_IOCTL_I915_GET_RESET_STATS, &stats);
+   if (ret == 0) { /* out-params are written only on success */
+      *active = stats.batch_active;
+      *pending = stats.batch_pending;
+   }
+
+   return ret; /* raw anv_ioctl() result; outputs untouched when non-zero */
+}
+
  int
  anv_gem_handle_to_fd(struct anv_device *device, uint32_t gem_handle)
  {
diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h

index 97bac5077efd4852e45c2db00074d805b263020a..dc83b4ac44ff2886ff4835f12f9aa59d7da66279 100644 (file)
--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -637,6 +637,9 @@ void anv_device_finish_blorp(struct anv_device *device);
  VkResult anv_device_execbuf(struct anv_device *device,
                              struct drm_i915_gem_execbuffer2 *execbuf,
                              struct anv_bo **execbuf_bos);
+VkResult anv_device_query_status(struct anv_device *device);
+VkResult anv_device_wait(struct anv_device *device, struct anv_bo *bo,
+                         int64_t timeout);
  
  void* anv_gem_mmap(struct anv_device *device,
                     uint32_t gem_handle, uint64_t offset, uint64_t size, uint32_t flags);
@@ -654,6 +657,8 @@ int anv_gem_destroy_context(struct anv_device *device, int context);
  int anv_gem_get_param(int fd, uint32_t param);
  bool anv_gem_get_bit6_swizzle(int fd, uint32_t tiling);
  int anv_gem_get_aperture(int fd, uint64_t *size);
+int anv_gem_gpu_get_reset_stats(struct anv_device *device,
+                                uint32_t *active, uint32_t *pending);
  int anv_gem_handle_to_fd(struct anv_device *device, uint32_t gem_handle);
  uint32_t anv_gem_fd_to_handle(struct anv_device *device, int fd);
  int anv_gem_set_caching(struct anv_device *device, uint32_t gem_handle, uint32_t caching);
diff --git a/src/intel/vulkan/genX_query.c b/src/intel/vulkan/genX_query.c

index 3610665b6e8da642af6bb9b7070807acfbd33267..7ea94044b129f675ebbbe904f82e52a6132f74aa 100644 (file)
--- a/src/intel/vulkan/genX_query.c
+++ b/src/intel/vulkan/genX_query.c
@@ -143,8 +143,6 @@ VkResult genX(GetQueryPoolResults)(
  {
     ANV_FROM_HANDLE(anv_device, device, _device);
     ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
-   int64_t timeout = INT64_MAX;
-   int ret;
  
     assert(pool->type == VK_QUERY_TYPE_OCCLUSION ||
            pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
@@ -157,12 +155,9 @@ VkResult genX(GetQueryPoolResults)(
        return VK_SUCCESS;
  
     if (flags & VK_QUERY_RESULT_WAIT_BIT) {
-      ret = anv_gem_wait(device, pool->bo.gem_handle, &timeout);
-      if (ret == -1) {
-         /* We don't know the real error. */
-         return vk_errorf(VK_ERROR_OUT_OF_DEVICE_MEMORY,
-                          "gem_wait failed %m");
-      }
+      VkResult result = anv_device_wait(device, &pool->bo, INT64_MAX);
+      if (result != VK_SUCCESS)
+         return result;
     }
  
     void *data_end = pData + dataSize;
author	Jason Ekstrand <jason.ekstrand@intel.com>
	Mon, 27 Mar 2017 23:03:57 +0000 (16:03 -0700)
committer	Jason Ekstrand <jason.ekstrand@intel.com>
	Wed, 5 Apr 2017 01:33:52 +0000 (18:33 -0700)
src/intel/vulkan/anv_device.c		patch \| blob \| history
src/intel/vulkan/anv_gem.c		patch \| blob \| history
src/intel/vulkan/anv_private.h		patch \| blob \| history
src/intel/vulkan/genX_query.c		patch \| blob \| history