From b2c97bc789198427043cd902bc76e194e7e81c7d Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Tue, 4 Apr 2017 14:36:46 -0700
Subject: [PATCH] anv/query: Busy-wait for available query entries

Before, we were just looking at whether or not the user wanted us to
wait and waiting on the BO.  Some clients, such as the Serious engine,
use a single query pool for hundreds of individual query results where
the writes for those queries may be split across several command
buffers.  In this scenario, the individual query we're looking for may
become available long before the BO is idle so waiting on the query pool
BO to be finished is wasteful. This commit makes us instead busy-loop on
each query until it's available.

This significantly reduces pipeline bubbles and improves performance of
The Talos Principle on medium settings (where the GPU isn't overloaded
with drawing) by around 20% on my SkyLake gt4.

Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Tested-by: Eero Tamminen <eero.t.tamminen@intel.com>
Tested-by: Grazvydas Ignotas <notasas@gmail.com>
---
 src/intel/vulkan/genX_query.c | 62 +++++++++++++++++++++++++++++++----
 1 file changed, 56 insertions(+), 6 deletions(-)

diff --git a/src/intel/vulkan/genX_query.c b/src/intel/vulkan/genX_query.c
index 7ea94044b12..235da8b8b6e 100644
--- a/src/intel/vulkan/genX_query.c
+++ b/src/intel/vulkan/genX_query.c
@@ -131,6 +131,54 @@ cpu_write_query_result(void *dst_slot, VkQueryResultFlags flags,
    }
 }
 
+/* True once the availability qword at the start of the slot is non-zero. */
+static bool
+query_is_available(struct anv_device *device, uint64_t *slot)
+{
+   if (!device->info.has_llc)
+      __builtin_ia32_clflush(slot); /* no LLC: flush the line before reading */
+   return *(volatile uint64_t *)slot; /* volatile forces a fresh load */
+}
+
+/* Spin until slot is available, bailing out if the pool BO idles or errors. */
+static VkResult
+wait_for_available(struct anv_device *device,
+                   struct anv_query_pool *pool, uint64_t *slot)
+{
+   while (true) {
+      if (query_is_available(device, slot))
+         return VK_SUCCESS;
+
+      int ret = anv_gem_busy(device, pool->bo.gem_handle);
+      if (ret == 1) {
+         continue; /* BO still busy: poll the slot again. */
+      } else if (ret == -1) {
+         /* We don't know the real error, so flag the device as lost.
+          * NOTE(review): the message says "gem wait" but the call is gem busy. */
+         device->lost = true;
+         return vk_errorf(VK_ERROR_DEVICE_LOST, "gem wait failed: %m");
+      } else {
+         assert(ret == 0);
+         /* The BO is idle; availability may have landed since the check above. */
+         if (query_is_available(device, slot)) {
+            return VK_SUCCESS;
+         } else {
+            VkResult status = anv_device_query_status(device);
+            if (status != VK_SUCCESS)
+               return status;
+            /* If we haven't seen availability yet, then we never will.  This
+             * can only happen if we have a client error where they call
+             * GetQueryPoolResults on a query that they haven't submitted to
+             * the GPU yet.  The spec allows us to do anything in this case,
+             * but returning VK_SUCCESS doesn't seem right and we shouldn't
+             * just keep spinning.
+             */
+            return VK_NOT_READY;
+         }
+      }
+   }
+}
+
 VkResult genX(GetQueryPoolResults)(
     VkDevice                                    _device,
     VkQueryPool                                 queryPool,
@@ -154,12 +202,6 @@ VkResult genX(GetQueryPoolResults)(
    if (pData == NULL)
       return VK_SUCCESS;
 
-   if (flags & VK_QUERY_RESULT_WAIT_BIT) {
-      VkResult result = anv_device_wait(device, &pool->bo, INT64_MAX);
-      if (result != VK_SUCCESS)
-         return result;
-   }
-
    void *data_end = pData + dataSize;
 
    if (!device->info.has_llc) {
@@ -176,6 +218,14 @@ VkResult genX(GetQueryPoolResults)(
       /* Availability is always at the start of the slot */
       bool available = slot[0];
 
+      if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) {
+         status = wait_for_available(device, pool, slot);
+         if (status != VK_SUCCESS)
+            return status;
+
+         available = true;
+      }
+
       /* From the Vulkan 1.0.42 spec:
        *
        *    "If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
-- 
2.30.2