vulkan/overlay: Add a workaround semaphore for application presenting without one
[mesa.git] / src / vulkan / overlay-layer / overlay.cpp
index 7d24c64886eec2bbbe58c280168ed53ee2f9f7eb..1ed1d2b02e7882fed1d8485ee6de955772d8f069 100644 (file)
@@ -28,6 +28,8 @@
 #include <vulkan/vulkan.h>
 #include <vulkan/vk_layer.h>
 
+#include "git_sha1.h"
+
 #include "imgui.h"
 
 #include "overlay_params.h"
@@ -37,6 +39,7 @@
 #include "util/list.h"
 #include "util/ralloc.h"
 #include "util/os_time.h"
+#include "util/os_socket.h"
 #include "util/simple_mtx.h"
 
 #include "vk_enum_to_str.h"
@@ -51,6 +54,14 @@ struct instance_data {
    bool pipeline_statistics_enabled;
 
    bool first_line_printed;
+
+   int control_client;
+
+   /* Dumping of frame stats to a file has been enabled. */
+   bool capture_enabled;
+
+   /* Dumping of frame stats to a file has been enabled and started. */
+   bool capture_started;
 };
 
 struct frame_stat {
@@ -114,6 +125,8 @@ struct overlay_draw {
 
    VkCommandBuffer command_buffer;
 
+   VkSemaphore cross_engine_semaphore;
+
    VkSemaphore semaphore;
    VkFence fence;
 
@@ -211,12 +224,8 @@ static inline void ensure_vk_object_map(void)
 }
 
 #define HKEY(obj) ((uint64_t)(obj))
-#define FIND_SWAPCHAIN_DATA(obj) ((struct swapchain_data *)find_object_data(HKEY(obj)))
-#define FIND_CMD_BUFFER_DATA(obj) ((struct command_buffer_data *)find_object_data(HKEY(obj)))
-#define FIND_DEVICE_DATA(obj) ((struct device_data *)find_object_data(HKEY(obj)))
-#define FIND_QUEUE_DATA(obj) ((struct queue_data *)find_object_data(HKEY(obj)))
-#define FIND_PHYSICAL_DEVICE_DATA(obj) ((struct instance_data *)find_object_data(HKEY(obj)))
-#define FIND_INSTANCE_DATA(obj) ((struct instance_data *)find_object_data(HKEY(obj)))
+#define FIND(type, obj) ((type *)find_object_data(HKEY(obj)))
+
 static void *find_object_data(uint64_t obj)
 {
    simple_mtx_lock(&vk_object_to_data_mutex);
@@ -312,16 +321,11 @@ free_chain(struct VkBaseOutStructure *chain)
 
 /**/
 
-static void check_vk_result(VkResult err)
-{
-   if (err != VK_SUCCESS)
-      printf("ERROR!\n");
-}
-
 static struct instance_data *new_instance_data(VkInstance instance)
 {
    struct instance_data *data = rzalloc(NULL, struct instance_data);
    data->instance = instance;
+   data->control_client = -1;
    map_object(HKEY(data->instance), data);
    return data;
 }
@@ -330,6 +334,8 @@ static void destroy_instance_data(struct instance_data *data)
 {
    if (data->params.output_file)
       fclose(data->params.output_file);
+   if (data->params.control >= 0)
+      os_socket_close(data->params.control);
    unmap_object(HKEY(data->instance));
    ralloc_free(data);
 }
@@ -378,18 +384,17 @@ static struct queue_data *new_queue_data(VkQueue queue,
    data->flags = family_props->queueFlags;
    data->timestamp_mask = (1ull << family_props->timestampValidBits) - 1;
    data->family_index = family_index;
-   LIST_INITHEAD(&data->running_command_buffer);
+   list_inithead(&data->running_command_buffer);
    map_object(HKEY(data->queue), data);
 
    /* Fence synchronizing access to queries on that queue. */
    VkFenceCreateInfo fence_info = {};
    fence_info.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO;
    fence_info.flags = VK_FENCE_CREATE_SIGNALED_BIT;
-   VkResult err = device_data->vtable.CreateFence(device_data->device,
-                                                  &fence_info,
-                                                  NULL,
-                                                  &data->queries_fence);
-   check_vk_result(err);
+   VK_CHECK(device_data->vtable.CreateFence(device_data->device,
+                                            &fence_info,
+                                            NULL,
+                                            &data->queries_fence));
 
    if (data->flags & VK_QUEUE_GRAPHICS_BIT)
       device_data->graphic_queue = data;
@@ -504,7 +509,7 @@ static void destroy_swapchain_data(struct swapchain_data *data)
 struct overlay_draw *get_overlay_draw(struct swapchain_data *data)
 {
    struct device_data *device_data = data->device;
-   struct overlay_draw *draw = list_empty(&data->draws) ?
+   struct overlay_draw *draw = list_is_empty(&data->draws) ?
       NULL : list_first_entry(&data->draws, struct overlay_draw, link);
 
    VkSemaphoreCreateInfo sem_info = {};
@@ -541,6 +546,8 @@ struct overlay_draw *get_overlay_draw(struct swapchain_data *data)
 
    VK_CHECK(device_data->vtable.CreateSemaphore(device_data->device, &sem_info,
                                                 NULL, &draw->semaphore));
+   VK_CHECK(device_data->vtable.CreateSemaphore(device_data->device, &sem_info,
+                                                NULL, &draw->cross_engine_semaphore));
 
    list_addtail(&draw->link, &data->draws);
 
@@ -552,6 +559,7 @@ static const char *param_unit(enum overlay_param_enabled param)
    switch (param) {
    case OVERLAY_PARAM_ENABLED_frame_timing:
    case OVERLAY_PARAM_ENABLED_acquire_timing:
+   case OVERLAY_PARAM_ENABLED_present_timing:
       return "(us)";
    case OVERLAY_PARAM_ENABLED_gpu_timing:
       return "(ns)";
@@ -560,6 +568,204 @@ static const char *param_unit(enum overlay_param_enabled param)
    }
 }
 
+static void parse_command(struct instance_data *instance_data,
+                          const char *cmd, unsigned cmdlen,
+                          const char *param, unsigned paramlen)
+{
+   if (!strncmp(cmd, "capture", cmdlen)) {
+      int value = atoi(param);
+      bool enabled = value > 0;
+
+      if (enabled) {
+         instance_data->capture_enabled = true;
+      } else {
+         instance_data->capture_enabled = false;
+         instance_data->capture_started = false;
+      }
+   }
+}
+
+#define BUFSIZE 4096
+
+/**
+ * This function will process commands through the control file.
+ *
+ * A command starts with a colon, followed by the command, and followed by an
+ * option '=' and a parameter.  It has to end with a semi-colon. A full command
+ * + parameter looks like:
+ *
+ *    :cmd=param;
+ */
+static void process_char(struct instance_data *instance_data, char c)
+{
+   static char cmd[BUFSIZE];
+   static char param[BUFSIZE];
+
+   static unsigned cmdpos = 0;
+   static unsigned parampos = 0;
+   static bool reading_cmd = false;
+   static bool reading_param = false;
+
+   switch (c) {
+   case ':':
+      cmdpos = 0;
+      parampos = 0;
+      reading_cmd = true;
+      reading_param = false;
+      break;
+   case ';':
+      if (!reading_cmd)
+         break;
+      cmd[cmdpos++] = '\0';
+      param[parampos++] = '\0';
+      parse_command(instance_data, cmd, cmdpos, param, parampos);
+      reading_cmd = false;
+      reading_param = false;
+      break;
+   case '=':
+      if (!reading_cmd)
+         break;
+      reading_param = true;
+      break;
+   default:
+      if (!reading_cmd)
+         break;
+
+      if (reading_param) {
+         /* overflow means an invalid parameter */
+         if (parampos >= BUFSIZE - 1) {
+            reading_cmd = false;
+            reading_param = false;
+            break;
+         }
+
+         param[parampos++] = c;
+      } else {
+         /* overflow means an invalid command */
+         if (cmdpos >= BUFSIZE - 1) {
+            reading_cmd = false;
+            break;
+         }
+
+         cmd[cmdpos++] = c;
+      }
+   }
+}
+
+static void control_send(struct instance_data *instance_data,
+                         const char *cmd, unsigned cmdlen,
+                         const char *param, unsigned paramlen)
+{
+   unsigned msglen = 0;
+   char buffer[BUFSIZE];
+
+   assert(cmdlen + paramlen + 3 < BUFSIZE);
+
+   buffer[msglen++] = ':';
+
+   memcpy(&buffer[msglen], cmd, cmdlen);
+   msglen += cmdlen;
+
+   if (paramlen > 0) {
+      buffer[msglen++] = '=';
+      memcpy(&buffer[msglen], param, paramlen);
+      msglen += paramlen;
+      buffer[msglen++] = ';';
+   }
+
+   os_socket_send(instance_data->control_client, buffer, msglen, 0);
+}
+
+static void control_send_connection_string(struct device_data *device_data)
+{
+   struct instance_data *instance_data = device_data->instance;
+
+   const char *controlVersionCmd = "MesaOverlayControlVersion";
+   const char *controlVersionString = "1";
+
+   control_send(instance_data, controlVersionCmd, strlen(controlVersionCmd),
+                controlVersionString, strlen(controlVersionString));
+
+   const char *deviceCmd = "DeviceName";
+   const char *deviceName = device_data->properties.deviceName;
+
+   control_send(instance_data, deviceCmd, strlen(deviceCmd),
+                deviceName, strlen(deviceName));
+
+   const char *mesaVersionCmd = "MesaVersion";
+   const char *mesaVersionString = "Mesa " PACKAGE_VERSION MESA_GIT_SHA1;
+
+   control_send(instance_data, mesaVersionCmd, strlen(mesaVersionCmd),
+                mesaVersionString, strlen(mesaVersionString));
+}
+
+static void control_client_check(struct device_data *device_data)
+{
+   struct instance_data *instance_data = device_data->instance;
+
+   /* Already connected, just return. */
+   if (instance_data->control_client >= 0)
+      return;
+
+   int socket = os_socket_accept(instance_data->params.control);
+   if (socket == -1) {
+      if (errno != EAGAIN && errno != EWOULDBLOCK && errno != ECONNABORTED)
+         fprintf(stderr, "ERROR on socket: %s\n", strerror(errno));
+      return;
+   }
+
+   if (socket >= 0) {
+      os_socket_block(socket, false);
+      instance_data->control_client = socket;
+      control_send_connection_string(device_data);
+   }
+}
+
+static void control_client_disconnected(struct instance_data *instance_data)
+{
+   os_socket_close(instance_data->control_client);
+   instance_data->control_client = -1;
+}
+
+static void process_control_socket(struct instance_data *instance_data)
+{
+   const int client = instance_data->control_client;
+   if (client >= 0) {
+      char buf[BUFSIZE];
+
+      while (true) {
+         ssize_t n = os_socket_recv(client, buf, BUFSIZE, 0);
+
+         if (n == -1) {
+            if (errno == EAGAIN || errno == EWOULDBLOCK) {
+               /* nothing to read, try again later */
+               break;
+            }
+
+            if (errno != ECONNRESET)
+               fprintf(stderr, "ERROR on connection: %s\n", strerror(errno));
+
+            control_client_disconnected(instance_data);
+         } else if (n == 0) {
+            /* recv() returns 0 when the client disconnects */
+            control_client_disconnected(instance_data);
+         }
+
+         for (ssize_t i = 0; i < n; i++) {
+            process_char(instance_data, buf[i]);
+         }
+
+         /* If we try to read BUFSIZE and receive BUFSIZE bytes from the
+          * socket, there's a good chance that there's still more data to be
+          * read, so we will try again. Otherwise, simply be done for this
+          * iteration and try again on the next frame.
+          */
+         if (n < BUFSIZE)
+            break;
+      }
+   }
+}
+
 static void snapshot_swapchain_frame(struct swapchain_data *data)
 {
    struct device_data *device_data = data->device;
@@ -567,6 +773,11 @@ static void snapshot_swapchain_frame(struct swapchain_data *data)
    uint32_t f_idx = data->n_frames % ARRAY_SIZE(data->frames_stats);
    uint64_t now = os_time_get(); /* us */
 
+   if (instance_data->params.control >= 0) {
+      control_client_check(device_data);
+      process_control_socket(instance_data);
+   }
+
    if (data->last_present_time) {
       data->frame_stats.stats[OVERLAY_PARAM_ENABLED_frame_timing] =
          now - data->last_present_time;
@@ -578,11 +789,26 @@ static void snapshot_swapchain_frame(struct swapchain_data *data)
       data->accumulated_stats.stats[s] += device_data->frame_stats.stats[s] + data->frame_stats.stats[s];
    }
 
+   /* If capture has been enabled but it hasn't started yet, it means we are on
+    * the first snapshot after it has been enabled. At this point we want to
+    * use the stats captured so far to update the display, but we don't want
+    * this data to cause noise to the stats that we want to capture from now
+    * on.
+    *
+    * capture_begin == true will trigger an update of the fps on display, and a
+    * flush of the data, but no stats will be written to the output file. This
+    * way, we will have only stats from after the capture has been enabled
+    * written to the output_file.
+    */
+   const bool capture_begin =
+      instance_data->capture_enabled && !instance_data->capture_started;
+
    if (data->last_fps_update) {
       double elapsed = (double)(now - data->last_fps_update); /* us */
-      if (elapsed >= instance_data->params.fps_sampling_period) {
+      if (capture_begin ||
+          elapsed >= instance_data->params.fps_sampling_period) {
          data->fps = 1000000.0f * data->n_frames_since_update / elapsed;
-         if (instance_data->params.output_file) {
+         if (instance_data->capture_started) {
             if (!instance_data->first_line_printed) {
                bool first_column = true;
 
@@ -621,6 +847,9 @@ static void snapshot_swapchain_frame(struct swapchain_data *data)
          memset(&data->accumulated_stats, 0, sizeof(data->accumulated_stats));
          data->n_frames_since_update = 0;
          data->last_fps_update = now;
+
+         if (capture_begin)
+            instance_data->capture_started = true;
       }
    } else {
       data->last_fps_update = now;
@@ -740,6 +969,7 @@ static void compute_swapchain_display(struct swapchain_data *data)
 
       if (s == OVERLAY_PARAM_ENABLED_frame_timing ||
           s == OVERLAY_PARAM_ENABLED_acquire_timing ||
+          s == OVERLAY_PARAM_ENABLED_present_timing ||
           s == OVERLAY_PARAM_ENABLED_gpu_timing) {
          double min_time = data->stats_min.stats[s] / data->time_dividor;
          double max_time = data->stats_max.stats[s] / data->time_dividor;
@@ -921,6 +1151,7 @@ static void CreateOrResizeBuffer(struct device_data *data,
 }
 
 static struct overlay_draw *render_swapchain_display(struct swapchain_data *data,
+                                                     struct queue_data *present_queue,
                                                      const VkSemaphore *wait_semaphores,
                                                      unsigned n_wait_semaphores,
                                                      unsigned image_index)
@@ -964,7 +1195,7 @@ static struct overlay_draw *render_swapchain_display(struct swapchain_data *data
    imb.subresourceRange.levelCount = 1;
    imb.subresourceRange.baseArrayLayer = 0;
    imb.subresourceRange.layerCount = 1;
-   imb.srcQueueFamilyIndex = device_data->graphic_queue->family_index;
+   imb.srcQueueFamilyIndex = present_queue->family_index;
    imb.dstQueueFamilyIndex = device_data->graphic_queue->family_index;
    device_data->vtable.CmdPipelineBarrier(draw->command_buffer,
                                           VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT,
@@ -1091,20 +1322,86 @@ static struct overlay_draw *render_swapchain_display(struct swapchain_data *data
     }
 
    device_data->vtable.CmdEndRenderPass(draw->command_buffer);
+
+   if (device_data->graphic_queue->family_index != present_queue->family_index)
+   {
+      /* Transfer the image back to the present queue family
+       * image layout was already changed to present by the render pass 
+       */
+      imb.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
+      imb.pNext = nullptr;
+      imb.srcAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT;
+      imb.dstAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT;
+      imb.oldLayout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR;
+      imb.newLayout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR;
+      imb.image = data->images[image_index];
+      imb.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
+      imb.subresourceRange.baseMipLevel = 0;
+      imb.subresourceRange.levelCount = 1;
+      imb.subresourceRange.baseArrayLayer = 0;
+      imb.subresourceRange.layerCount = 1;
+      imb.srcQueueFamilyIndex = device_data->graphic_queue->family_index;
+      imb.dstQueueFamilyIndex = present_queue->family_index;
+      device_data->vtable.CmdPipelineBarrier(draw->command_buffer,
+                                             VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT,
+                                             VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT,
+                                             0,          /* dependency flags */
+                                             0, nullptr, /* memory barriers */
+                                             0, nullptr, /* buffer memory barriers */
+                                             1, &imb);   /* image memory barriers */
+   }
+
    device_data->vtable.EndCommandBuffer(draw->command_buffer);
 
-   VkSubmitInfo submit_info = {};
-   VkPipelineStageFlags stage_wait = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
-   submit_info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
-   submit_info.commandBufferCount = 1;
-   submit_info.pCommandBuffers = &draw->command_buffer;
-   submit_info.pWaitDstStageMask = &stage_wait;
-   submit_info.waitSemaphoreCount = n_wait_semaphores;
-   submit_info.pWaitSemaphores = wait_semaphores;
-   submit_info.signalSemaphoreCount = 1;
-   submit_info.pSignalSemaphores = &draw->semaphore;
+   /* When presenting on a different queue than where we're drawing the
+    * overlay *AND* when the application does not provide a semaphore to
+    * vkQueuePresent, insert our own cross engine synchronization
+    * semaphore.
+    */
+   if (n_wait_semaphores == 0 && device_data->graphic_queue->queue != present_queue->queue) {
+      VkPipelineStageFlags stages_wait = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT;
+      VkSubmitInfo submit_info = {};
+      submit_info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
+      submit_info.commandBufferCount = 0;
+      submit_info.pWaitDstStageMask = &stages_wait;
+      submit_info.waitSemaphoreCount = 0;
+      submit_info.signalSemaphoreCount = 1;
+      submit_info.pSignalSemaphores = &draw->cross_engine_semaphore;
+
+      device_data->vtable.QueueSubmit(present_queue->queue, 1, &submit_info, VK_NULL_HANDLE);
+
+      submit_info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
+      submit_info.commandBufferCount = 1;
+      submit_info.pWaitDstStageMask = &stages_wait;
+      submit_info.pCommandBuffers = &draw->command_buffer;
+      submit_info.waitSemaphoreCount = 1;
+      submit_info.pWaitSemaphores = &draw->cross_engine_semaphore;
+      submit_info.signalSemaphoreCount = 1;
+      submit_info.pSignalSemaphores = &draw->semaphore;
+
+      device_data->vtable.QueueSubmit(device_data->graphic_queue->queue, 1, &submit_info, draw->fence);
+   } else {
+      VkPipelineStageFlags *stages_wait = (VkPipelineStageFlags*) malloc(sizeof(VkPipelineStageFlags) * n_wait_semaphores);
+      for (unsigned i = 0; i < n_wait_semaphores; i++)
+      {
+         // wait in the fragment stage until the swapchain image is ready
+         stages_wait[i] = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT;
+      }
+
+      VkSubmitInfo submit_info = {};
+      submit_info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
+      submit_info.commandBufferCount = 1;
+      submit_info.pCommandBuffers = &draw->command_buffer;
+      submit_info.pWaitDstStageMask = stages_wait;
+      submit_info.waitSemaphoreCount = n_wait_semaphores;
+      submit_info.pWaitSemaphores = wait_semaphores;
+      submit_info.signalSemaphoreCount = 1;
+      submit_info.pSignalSemaphores = &draw->semaphore;
 
-   device_data->vtable.QueueSubmit(device_data->graphic_queue->queue, 1, &submit_info, draw->fence);
+      device_data->vtable.QueueSubmit(device_data->graphic_queue->queue, 1, &submit_info, draw->fence);
+
+      free(stages_wait);
+   }
 
    return draw;
 }
@@ -1486,6 +1783,7 @@ static void shutdown_swapchain_data(struct swapchain_data *data)
    struct device_data *device_data = data->device;
 
    list_for_each_entry_safe(struct overlay_draw, draw, &data->draws, link) {
+      device_data->vtable.DestroySemaphore(device_data->device, draw->cross_engine_semaphore, NULL);
       device_data->vtable.DestroySemaphore(device_data->device, draw->semaphore, NULL);
       device_data->vtable.DestroyFence(device_data->device, draw->fence, NULL);
       device_data->vtable.DestroyBuffer(device_data->device, draw->vertex_buffer, NULL);
@@ -1523,6 +1821,7 @@ static void shutdown_swapchain_data(struct swapchain_data *data)
 }
 
 static struct overlay_draw *before_present(struct swapchain_data *swapchain_data,
+                                           struct queue_data *present_queue,
                                            const VkSemaphore *wait_semaphores,
                                            unsigned n_wait_semaphores,
                                            unsigned imageIndex)
@@ -1534,7 +1833,7 @@ static struct overlay_draw *before_present(struct swapchain_data *swapchain_data
 
    if (!instance_data->params.no_display && swapchain_data->n_frames > 0) {
       compute_swapchain_display(swapchain_data);
-      draw = render_swapchain_display(swapchain_data,
+      draw = render_swapchain_display(swapchain_data, present_queue,
                                       wait_semaphores, n_wait_semaphores,
                                       imageIndex);
    }
@@ -1548,7 +1847,7 @@ static VkResult overlay_CreateSwapchainKHR(
     const VkAllocationCallbacks*                pAllocator,
     VkSwapchainKHR*                             pSwapchain)
 {
-   struct device_data *device_data = FIND_DEVICE_DATA(device);
+   struct device_data *device_data = FIND(struct device_data, device);
    VkResult result = device_data->vtable.CreateSwapchainKHR(device, pCreateInfo, pAllocator, pSwapchain);
    if (result != VK_SUCCESS) return result;
 
@@ -1562,7 +1861,8 @@ static void overlay_DestroySwapchainKHR(
     VkSwapchainKHR                              swapchain,
     const VkAllocationCallbacks*                pAllocator)
 {
-   struct swapchain_data *swapchain_data = FIND_SWAPCHAIN_DATA(swapchain);
+   struct swapchain_data *swapchain_data =
+      FIND(struct swapchain_data, swapchain);
 
    shutdown_swapchain_data(swapchain_data);
    swapchain_data->device->vtable.DestroySwapchainKHR(device, swapchain, pAllocator);
@@ -1573,7 +1873,7 @@ static VkResult overlay_QueuePresentKHR(
     VkQueue                                     queue,
     const VkPresentInfoKHR*                     pPresentInfo)
 {
-   struct queue_data *queue_data = FIND_QUEUE_DATA(queue);
+   struct queue_data *queue_data = FIND(struct queue_data, queue);
    struct device_data *device_data = queue_data->device;
    struct instance_data *instance_data = device_data->instance;
    uint32_t query_results[OVERLAY_QUERY_COUNT];
@@ -1584,15 +1884,12 @@ static VkResult overlay_QueuePresentKHR(
       /* Before getting the query results, make sure the operations have
        * completed.
        */
-      VkResult err = device_data->vtable.ResetFences(device_data->device,
-                                                     1, &queue_data->queries_fence);
-      check_vk_result(err);
-      err = device_data->vtable.QueueSubmit(queue, 0, NULL, queue_data->queries_fence);
-      check_vk_result(err);
-      err = device_data->vtable.WaitForFences(device_data->device,
-                                              1, &queue_data->queries_fence,
-                                              VK_FALSE, UINT64_MAX);
-      check_vk_result(err);
+      VK_CHECK(device_data->vtable.ResetFences(device_data->device,
+                                               1, &queue_data->queries_fence));
+      VK_CHECK(device_data->vtable.QueueSubmit(queue, 0, NULL, queue_data->queries_fence));
+      VK_CHECK(device_data->vtable.WaitForFences(device_data->device,
+                                                 1, &queue_data->queries_fence,
+                                                 VK_FALSE, UINT64_MAX));
 
       /* Now get the results. */
       list_for_each_entry_safe(struct command_buffer_data, cmd_buffer_data,
@@ -1601,13 +1898,11 @@ static VkResult overlay_QueuePresentKHR(
 
          if (cmd_buffer_data->pipeline_query_pool) {
             memset(query_results, 0, sizeof(query_results));
-            err =
-               device_data->vtable.GetQueryPoolResults(device_data->device,
-                                                       cmd_buffer_data->pipeline_query_pool,
-                                                       cmd_buffer_data->query_index, 1,
-                                                       sizeof(uint32_t) * OVERLAY_QUERY_COUNT,
-                                                       query_results, 0, VK_QUERY_RESULT_WAIT_BIT);
-            check_vk_result(err);
+            VK_CHECK(device_data->vtable.GetQueryPoolResults(device_data->device,
+                                                             cmd_buffer_data->pipeline_query_pool,
+                                                             cmd_buffer_data->query_index, 1,
+                                                             sizeof(uint32_t) * OVERLAY_QUERY_COUNT,
+                                                             query_results, 0, VK_QUERY_RESULT_WAIT_BIT));
 
             for (uint32_t i = OVERLAY_PARAM_ENABLED_vertices;
                  i <= OVERLAY_PARAM_ENABLED_compute_invocations; i++) {
@@ -1616,13 +1911,11 @@ static VkResult overlay_QueuePresentKHR(
          }
          if (cmd_buffer_data->timestamp_query_pool) {
             uint64_t gpu_timestamps[2] = { 0 };
-            err =
-               device_data->vtable.GetQueryPoolResults(device_data->device,
-                                                       cmd_buffer_data->timestamp_query_pool,
-                                                       cmd_buffer_data->query_index * 2, 2,
-                                                       2 * sizeof(uint64_t), gpu_timestamps, sizeof(uint64_t),
-                                                       VK_QUERY_RESULT_WAIT_BIT | VK_QUERY_RESULT_64_BIT);
-            check_vk_result(err);
+            VK_CHECK(device_data->vtable.GetQueryPoolResults(device_data->device,
+                                                             cmd_buffer_data->timestamp_query_pool,
+                                                             cmd_buffer_data->query_index * 2, 2,
+                                                             2 * sizeof(uint64_t), gpu_timestamps, sizeof(uint64_t),
+                                                             VK_QUERY_RESULT_WAIT_BIT | VK_QUERY_RESULT_64_BIT));
 
             gpu_timestamps[0] &= queue_data->timestamp_mask;
             gpu_timestamps[1] &= queue_data->timestamp_mask;
@@ -1641,25 +1934,42 @@ static VkResult overlay_QueuePresentKHR(
    if (instance_data->params.no_display) {
       for (uint32_t i = 0; i < pPresentInfo->swapchainCount; i++) {
          VkSwapchainKHR swapchain = pPresentInfo->pSwapchains[i];
-         struct swapchain_data *swapchain_data = FIND_SWAPCHAIN_DATA(swapchain);
+         struct swapchain_data *swapchain_data =
+            FIND(struct swapchain_data, swapchain);
+
+         uint32_t image_index = pPresentInfo->pImageIndices[i];
 
          before_present(swapchain_data,
+                        queue_data,
                         pPresentInfo->pWaitSemaphores,
                         pPresentInfo->waitSemaphoreCount,
-                        pPresentInfo->pImageIndices[i]);
+                        image_index);
+
+         VkPresentInfoKHR present_info = *pPresentInfo;
+         present_info.swapchainCount = 1;
+         present_info.pSwapchains = &swapchain;
+         present_info.pImageIndices = &image_index;
+
+         uint64_t ts0 = os_time_get();
+         result = queue_data->device->vtable.QueuePresentKHR(queue, &present_info);
+         uint64_t ts1 = os_time_get();
+         swapchain_data->frame_stats.stats[OVERLAY_PARAM_ENABLED_present_timing] += ts1 - ts0;
       }
-      result = queue_data->device->vtable.QueuePresentKHR(queue, pPresentInfo);
    } else {
       for (uint32_t i = 0; i < pPresentInfo->swapchainCount; i++) {
          VkSwapchainKHR swapchain = pPresentInfo->pSwapchains[i];
-         struct swapchain_data *swapchain_data = FIND_SWAPCHAIN_DATA(swapchain);
+         struct swapchain_data *swapchain_data =
+            FIND(struct swapchain_data, swapchain);
+
+         uint32_t image_index = pPresentInfo->pImageIndices[i];
+
          VkPresentInfoKHR present_info = *pPresentInfo;
          present_info.swapchainCount = 1;
          present_info.pSwapchains = &swapchain;
-
-         uint32_t image_index = pPresentInfo->pImageIndices[i];
+         present_info.pImageIndices = &image_index;
 
          struct overlay_draw *draw = before_present(swapchain_data,
+                                                    queue_data,
                                                     pPresentInfo->pWaitSemaphores,
                                                     pPresentInfo->waitSemaphoreCount,
                                                     image_index);
@@ -1672,7 +1982,10 @@ static VkResult overlay_QueuePresentKHR(
          present_info.pWaitSemaphores = &draw->semaphore;
          present_info.waitSemaphoreCount = 1;
 
+         uint64_t ts0 = os_time_get();
          VkResult chain_result = queue_data->device->vtable.QueuePresentKHR(queue, &present_info);
+         uint64_t ts1 = os_time_get();
+         swapchain_data->frame_stats.stats[OVERLAY_PARAM_ENABLED_present_timing] += ts1 - ts0;
          if (pPresentInfo->pResults)
             pPresentInfo->pResults[i] = chain_result;
          if (chain_result != VK_SUCCESS && result == VK_SUCCESS)
@@ -1690,7 +2003,8 @@ static VkResult overlay_AcquireNextImageKHR(
     VkFence                                     fence,
     uint32_t*                                   pImageIndex)
 {
-   struct swapchain_data *swapchain_data = FIND_SWAPCHAIN_DATA(swapchain);
+   struct swapchain_data *swapchain_data =
+      FIND(struct swapchain_data, swapchain);
    struct device_data *device_data = swapchain_data->device;
 
    uint64_t ts0 = os_time_get();
@@ -1709,7 +2023,8 @@ static VkResult overlay_AcquireNextImage2KHR(
     const VkAcquireNextImageInfoKHR*            pAcquireInfo,
     uint32_t*                                   pImageIndex)
 {
-   struct swapchain_data *swapchain_data = FIND_SWAPCHAIN_DATA(pAcquireInfo->swapchain);
+   struct swapchain_data *swapchain_data =
+      FIND(struct swapchain_data, pAcquireInfo->swapchain);
    struct device_data *device_data = swapchain_data->device;
 
    uint64_t ts0 = os_time_get();
@@ -1729,7 +2044,8 @@ static void overlay_CmdDraw(
     uint32_t                                    firstVertex,
     uint32_t                                    firstInstance)
 {
-   struct command_buffer_data *cmd_buffer_data = FIND_CMD_BUFFER_DATA(commandBuffer);
+   struct command_buffer_data *cmd_buffer_data =
+      FIND(struct command_buffer_data, commandBuffer);
    cmd_buffer_data->stats.stats[OVERLAY_PARAM_ENABLED_draw]++;
    struct device_data *device_data = cmd_buffer_data->device;
    device_data->vtable.CmdDraw(commandBuffer, vertexCount, instanceCount,
@@ -1744,7 +2060,8 @@ static void overlay_CmdDrawIndexed(
     int32_t                                     vertexOffset,
     uint32_t                                    firstInstance)
 {
-   struct command_buffer_data *cmd_buffer_data = FIND_CMD_BUFFER_DATA(commandBuffer);
+   struct command_buffer_data *cmd_buffer_data =
+      FIND(struct command_buffer_data, commandBuffer);
    cmd_buffer_data->stats.stats[OVERLAY_PARAM_ENABLED_draw_indexed]++;
    struct device_data *device_data = cmd_buffer_data->device;
    device_data->vtable.CmdDrawIndexed(commandBuffer, indexCount, instanceCount,
@@ -1758,7 +2075,8 @@ static void overlay_CmdDrawIndirect(
     uint32_t                                    drawCount,
     uint32_t                                    stride)
 {
-   struct command_buffer_data *cmd_buffer_data = FIND_CMD_BUFFER_DATA(commandBuffer);
+   struct command_buffer_data *cmd_buffer_data =
+      FIND(struct command_buffer_data, commandBuffer);
    cmd_buffer_data->stats.stats[OVERLAY_PARAM_ENABLED_draw_indirect]++;
    struct device_data *device_data = cmd_buffer_data->device;
    device_data->vtable.CmdDrawIndirect(commandBuffer, buffer, offset, drawCount, stride);
@@ -1771,13 +2089,14 @@ static void overlay_CmdDrawIndexedIndirect(
     uint32_t                                    drawCount,
     uint32_t                                    stride)
 {
-   struct command_buffer_data *cmd_buffer_data = FIND_CMD_BUFFER_DATA(commandBuffer);
+   struct command_buffer_data *cmd_buffer_data =
+      FIND(struct command_buffer_data, commandBuffer);
    cmd_buffer_data->stats.stats[OVERLAY_PARAM_ENABLED_draw_indexed_indirect]++;
    struct device_data *device_data = cmd_buffer_data->device;
    device_data->vtable.CmdDrawIndexedIndirect(commandBuffer, buffer, offset, drawCount, stride);
 }
 
-static void overlay_CmdDrawIndirectCountKHR(
+static void overlay_CmdDrawIndirectCount(
     VkCommandBuffer                             commandBuffer,
     VkBuffer                                    buffer,
     VkDeviceSize                                offset,
@@ -1786,15 +2105,16 @@ static void overlay_CmdDrawIndirectCountKHR(
     uint32_t                                    maxDrawCount,
     uint32_t                                    stride)
 {
-   struct command_buffer_data *cmd_buffer_data = FIND_CMD_BUFFER_DATA(commandBuffer);
+   struct command_buffer_data *cmd_buffer_data =
+      FIND(struct command_buffer_data, commandBuffer);
    cmd_buffer_data->stats.stats[OVERLAY_PARAM_ENABLED_draw_indirect_count]++;
    struct device_data *device_data = cmd_buffer_data->device;
-   device_data->vtable.CmdDrawIndirectCountKHR(commandBuffer, buffer, offset,
-                                               countBuffer, countBufferOffset,
-                                               maxDrawCount, stride);
+   device_data->vtable.CmdDrawIndirectCount(commandBuffer, buffer, offset,
+                                            countBuffer, countBufferOffset,
+                                            maxDrawCount, stride);
 }
 
-static void overlay_CmdDrawIndexedIndirectCountKHR(
+static void overlay_CmdDrawIndexedIndirectCount(
     VkCommandBuffer                             commandBuffer,
     VkBuffer                                    buffer,
     VkDeviceSize                                offset,
@@ -1803,12 +2123,13 @@ static void overlay_CmdDrawIndexedIndirectCountKHR(
     uint32_t                                    maxDrawCount,
     uint32_t                                    stride)
 {
-   struct command_buffer_data *cmd_buffer_data = FIND_CMD_BUFFER_DATA(commandBuffer);
+   struct command_buffer_data *cmd_buffer_data =
+      FIND(struct command_buffer_data, commandBuffer);
    cmd_buffer_data->stats.stats[OVERLAY_PARAM_ENABLED_draw_indexed_indirect_count]++;
    struct device_data *device_data = cmd_buffer_data->device;
-   device_data->vtable.CmdDrawIndexedIndirectCountKHR(commandBuffer, buffer, offset,
-                                                      countBuffer, countBufferOffset,
-                                                      maxDrawCount, stride);
+   device_data->vtable.CmdDrawIndexedIndirectCount(commandBuffer, buffer, offset,
+                                                   countBuffer, countBufferOffset,
+                                                   maxDrawCount, stride);
 }
 
 static void overlay_CmdDispatch(
@@ -1817,7 +2138,8 @@ static void overlay_CmdDispatch(
     uint32_t                                    groupCountY,
     uint32_t                                    groupCountZ)
 {
-   struct command_buffer_data *cmd_buffer_data = FIND_CMD_BUFFER_DATA(commandBuffer);
+   struct command_buffer_data *cmd_buffer_data =
+      FIND(struct command_buffer_data, commandBuffer);
    cmd_buffer_data->stats.stats[OVERLAY_PARAM_ENABLED_dispatch]++;
    struct device_data *device_data = cmd_buffer_data->device;
    device_data->vtable.CmdDispatch(commandBuffer, groupCountX, groupCountY, groupCountZ);
@@ -1828,7 +2150,8 @@ static void overlay_CmdDispatchIndirect(
     VkBuffer                                    buffer,
     VkDeviceSize                                offset)
 {
-   struct command_buffer_data *cmd_buffer_data = FIND_CMD_BUFFER_DATA(commandBuffer);
+   struct command_buffer_data *cmd_buffer_data =
+      FIND(struct command_buffer_data, commandBuffer);
    cmd_buffer_data->stats.stats[OVERLAY_PARAM_ENABLED_dispatch_indirect]++;
    struct device_data *device_data = cmd_buffer_data->device;
    device_data->vtable.CmdDispatchIndirect(commandBuffer, buffer, offset);
@@ -1839,7 +2162,8 @@ static void overlay_CmdBindPipeline(
     VkPipelineBindPoint                         pipelineBindPoint,
     VkPipeline                                  pipeline)
 {
-   struct command_buffer_data *cmd_buffer_data = FIND_CMD_BUFFER_DATA(commandBuffer);
+   struct command_buffer_data *cmd_buffer_data =
+      FIND(struct command_buffer_data, commandBuffer);
    switch (pipelineBindPoint) {
    case VK_PIPELINE_BIND_POINT_GRAPHICS: cmd_buffer_data->stats.stats[OVERLAY_PARAM_ENABLED_pipeline_graphics]++; break;
    case VK_PIPELINE_BIND_POINT_COMPUTE: cmd_buffer_data->stats.stats[OVERLAY_PARAM_ENABLED_pipeline_compute]++; break;
@@ -1854,9 +2178,12 @@ static VkResult overlay_BeginCommandBuffer(
     VkCommandBuffer                             commandBuffer,
     const VkCommandBufferBeginInfo*             pBeginInfo)
 {
-   struct command_buffer_data *cmd_buffer_data = FIND_CMD_BUFFER_DATA(commandBuffer);
+   struct command_buffer_data *cmd_buffer_data =
+      FIND(struct command_buffer_data, commandBuffer);
    struct device_data *device_data = cmd_buffer_data->device;
 
+   memset(&cmd_buffer_data->stats, 0, sizeof(cmd_buffer_data->stats));
+
    /* We don't record any query in secondary command buffers, just make sure
     * we have the right inheritance.
     */
@@ -1893,10 +2220,6 @@ static VkResult overlay_BeginCommandBuffer(
       return result;
    }
 
-   /* Primary command buffers with no queries. */
-   if (!cmd_buffer_data->pipeline_query_pool && cmd_buffer_data->timestamp_query_pool)
-      return device_data->vtable.BeginCommandBuffer(commandBuffer, pBeginInfo);
-
    /* Otherwise record a begin query as first command. */
    VkResult result = device_data->vtable.BeginCommandBuffer(commandBuffer, pBeginInfo);
 
@@ -1930,7 +2253,8 @@ static VkResult overlay_BeginCommandBuffer(
 static VkResult overlay_EndCommandBuffer(
     VkCommandBuffer                             commandBuffer)
 {
-   struct command_buffer_data *cmd_buffer_data = FIND_CMD_BUFFER_DATA(commandBuffer);
+   struct command_buffer_data *cmd_buffer_data =
+      FIND(struct command_buffer_data, commandBuffer);
    struct device_data *device_data = cmd_buffer_data->device;
 
    if (cmd_buffer_data->timestamp_query_pool) {
@@ -1952,7 +2276,8 @@ static VkResult overlay_ResetCommandBuffer(
     VkCommandBuffer                             commandBuffer,
     VkCommandBufferResetFlags                   flags)
 {
-   struct command_buffer_data *cmd_buffer_data = FIND_CMD_BUFFER_DATA(commandBuffer);
+   struct command_buffer_data *cmd_buffer_data =
+      FIND(struct command_buffer_data, commandBuffer);
    struct device_data *device_data = cmd_buffer_data->device;
 
    memset(&cmd_buffer_data->stats, 0, sizeof(cmd_buffer_data->stats));
@@ -1965,12 +2290,14 @@ static void overlay_CmdExecuteCommands(
     uint32_t                                    commandBufferCount,
     const VkCommandBuffer*                      pCommandBuffers)
 {
-   struct command_buffer_data *cmd_buffer_data = FIND_CMD_BUFFER_DATA(commandBuffer);
+   struct command_buffer_data *cmd_buffer_data =
+      FIND(struct command_buffer_data, commandBuffer);
    struct device_data *device_data = cmd_buffer_data->device;
 
    /* Add the stats of the executed command buffers to the primary one. */
    for (uint32_t c = 0; c < commandBufferCount; c++) {
-      struct command_buffer_data *sec_cmd_buffer_data = FIND_CMD_BUFFER_DATA(pCommandBuffers[c]);
+      struct command_buffer_data *sec_cmd_buffer_data =
+         FIND(struct command_buffer_data, pCommandBuffers[c]);
 
       for (uint32_t s = 0; s < OVERLAY_PARAM_ENABLED_MAX; s++)
          cmd_buffer_data->stats.stats[s] += sec_cmd_buffer_data->stats.stats[s];
@@ -1984,7 +2311,7 @@ static VkResult overlay_AllocateCommandBuffers(
    const VkCommandBufferAllocateInfo* pAllocateInfo,
    VkCommandBuffer*                   pCommandBuffers)
 {
-   struct device_data *device_data = FIND_DEVICE_DATA(device);
+   struct device_data *device_data = FIND(struct device_data, device);
    VkResult result =
       device_data->vtable.AllocateCommandBuffers(device, pAllocateInfo, pCommandBuffers);
    if (result != VK_SUCCESS)
@@ -2002,10 +2329,8 @@ static VkResult overlay_AllocateCommandBuffers(
          pAllocateInfo->commandBufferCount,
          overlay_query_flags,
       };
-      VkResult err =
-         device_data->vtable.CreateQueryPool(device_data->device, &pool_info,
-                                             NULL, &pipeline_query_pool);
-      check_vk_result(err);
+      VK_CHECK(device_data->vtable.CreateQueryPool(device_data->device, &pool_info,
+                                                   NULL, &pipeline_query_pool));
    }
    if (device_data->instance->params.enabled[OVERLAY_PARAM_ENABLED_gpu_timing]) {
       VkQueryPoolCreateInfo pool_info = {
@@ -2016,10 +2341,8 @@ static VkResult overlay_AllocateCommandBuffers(
          pAllocateInfo->commandBufferCount * 2,
          0,
       };
-      VkResult err =
-         device_data->vtable.CreateQueryPool(device_data->device, &pool_info,
-                                             NULL, &timestamp_query_pool);
-      check_vk_result(err);
+      VK_CHECK(device_data->vtable.CreateQueryPool(device_data->device, &pool_info,
+                                                   NULL, &timestamp_query_pool));
    }
 
    for (uint32_t i = 0; i < pAllocateInfo->commandBufferCount; i++) {
@@ -2042,10 +2365,15 @@ static void overlay_FreeCommandBuffers(
    uint32_t               commandBufferCount,
    const VkCommandBuffer* pCommandBuffers)
 {
-   struct device_data *device_data = FIND_DEVICE_DATA(device);
+   struct device_data *device_data = FIND(struct device_data, device);
    for (uint32_t i = 0; i < commandBufferCount; i++) {
       struct command_buffer_data *cmd_buffer_data =
-         FIND_CMD_BUFFER_DATA(pCommandBuffers[i]);
+         FIND(struct command_buffer_data, pCommandBuffers[i]);
+
+      /* It is legal to free a NULL command buffer*/
+      if (!cmd_buffer_data)
+         continue;
+
       uint64_t count = (uintptr_t)find_object_data(HKEY(cmd_buffer_data->pipeline_query_pool));
       if (count == 1) {
          unmap_object(HKEY(cmd_buffer_data->pipeline_query_pool));
@@ -2075,7 +2403,7 @@ static VkResult overlay_QueueSubmit(
     const VkSubmitInfo*                         pSubmits,
     VkFence                                     fence)
 {
-   struct queue_data *queue_data = FIND_QUEUE_DATA(queue);
+   struct queue_data *queue_data = FIND(struct queue_data, queue);
    struct device_data *device_data = queue_data->device;
 
    device_data->frame_stats.stats[OVERLAY_PARAM_ENABLED_submit]++;
@@ -2083,7 +2411,7 @@ static VkResult overlay_QueueSubmit(
    for (uint32_t s = 0; s < submitCount; s++) {
       for (uint32_t c = 0; c < pSubmits[s].commandBufferCount; c++) {
          struct command_buffer_data *cmd_buffer_data =
-            FIND_CMD_BUFFER_DATA(pSubmits[s].pCommandBuffers[c]);
+            FIND(struct command_buffer_data, pSubmits[s].pCommandBuffers[c]);
 
          /* Merge the submitted command buffer stats into the device. */
          for (uint32_t st = 0; st < OVERLAY_PARAM_ENABLED_MAX; st++)
@@ -2096,7 +2424,7 @@ static VkResult overlay_QueueSubmit(
              !cmd_buffer_data->timestamp_query_pool)
             continue;
 
-         if (list_empty(&cmd_buffer_data->link)) {
+         if (list_is_empty(&cmd_buffer_data->link)) {
             list_addtail(&cmd_buffer_data->link,
                          &queue_data->running_command_buffer);
          } else {
@@ -2115,7 +2443,8 @@ static VkResult overlay_CreateDevice(
     const VkAllocationCallbacks*                pAllocator,
     VkDevice*                                   pDevice)
 {
-   struct instance_data *instance_data = FIND_PHYSICAL_DEVICE_DATA(physicalDevice);
+   struct instance_data *instance_data =
+      FIND(struct instance_data, physicalDevice);
    VkLayerDeviceCreateInfo *chain_info =
       get_device_chain_info(pCreateInfo, VK_LAYER_LINK_INFO);
 
@@ -2165,7 +2494,7 @@ static void overlay_DestroyDevice(
     VkDevice                                    device,
     const VkAllocationCallbacks*                pAllocator)
 {
-   struct device_data *device_data = FIND_DEVICE_DATA(device);
+   struct device_data *device_data = FIND(struct device_data, device);
    device_unmap_queues(device_data);
    device_data->vtable.DestroyDevice(device, pAllocator);
    destroy_device_data(device_data);
@@ -2202,6 +2531,13 @@ static VkResult overlay_CreateInstance(
 
    parse_overlay_env(&instance_data->params, getenv("VK_LAYER_MESA_OVERLAY_CONFIG"));
 
+   /* If there's no control file, and an output_file was specified, start
+    * capturing fps data right away.
+    */
+   instance_data->capture_enabled =
+      instance_data->params.output_file && instance_data->params.control < 0;
+   instance_data->capture_started = instance_data->capture_enabled;
+
    for (int i = OVERLAY_PARAM_ENABLED_vertices;
         i <= OVERLAY_PARAM_ENABLED_compute_invocations; i++) {
       if (instance_data->params.enabled[i]) {
@@ -2217,7 +2553,7 @@ static void overlay_DestroyInstance(
     VkInstance                                  instance,
     const VkAllocationCallbacks*                pAllocator)
 {
-   struct instance_data *instance_data = FIND_INSTANCE_DATA(instance);
+   struct instance_data *instance_data = FIND(struct instance_data, instance);
    instance_data_map_physical_devices(instance_data, false);
    instance_data->vtable.DestroyInstance(instance, pAllocator);
    destroy_instance_data(instance_data);
@@ -2229,6 +2565,7 @@ static const struct {
 } name_to_funcptr_map[] = {
    { "vkGetDeviceProcAddr", (void *) vkGetDeviceProcAddr },
 #define ADD_HOOK(fn) { "vk" # fn, (void *) overlay_ ## fn }
+#define ADD_ALIAS_HOOK(alias, fn) { "vk" # alias, (void *) overlay_ ## fn }
    ADD_HOOK(AllocateCommandBuffers),
    ADD_HOOK(FreeCommandBuffers),
    ADD_HOOK(ResetCommandBuffer),
@@ -2242,8 +2579,10 @@ static const struct {
    ADD_HOOK(CmdDrawIndexedIndirect),
    ADD_HOOK(CmdDispatch),
    ADD_HOOK(CmdDispatchIndirect),
-   ADD_HOOK(CmdDrawIndirectCountKHR),
-   ADD_HOOK(CmdDrawIndexedIndirectCountKHR),
+   ADD_HOOK(CmdDrawIndirectCount),
+   ADD_ALIAS_HOOK(CmdDrawIndirectCountKHR, CmdDrawIndirectCount),
+   ADD_HOOK(CmdDrawIndexedIndirectCount),
+   ADD_ALIAS_HOOK(CmdDrawIndexedIndirectCountKHR, CmdDrawIndexedIndirectCount),
 
    ADD_HOOK(CmdBindPipeline),
 
@@ -2281,7 +2620,7 @@ VK_LAYER_EXPORT VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL vkGetDeviceProcAddr(VkD
 
    if (dev == NULL) return NULL;
 
-   struct device_data *device_data = FIND_DEVICE_DATA(dev);
+   struct device_data *device_data = FIND(struct device_data, dev);
    if (device_data->vtable.GetDeviceProcAddr == NULL) return NULL;
    return device_data->vtable.GetDeviceProcAddr(dev, funcName);
 }
@@ -2294,7 +2633,7 @@ VK_LAYER_EXPORT VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL vkGetInstanceProcAddr(V
 
    if (instance == NULL) return NULL;
 
-   struct instance_data *instance_data = FIND_INSTANCE_DATA(instance);
+   struct instance_data *instance_data = FIND(struct instance_data, instance);
    if (instance_data->vtable.GetInstanceProcAddr == NULL) return NULL;
    return instance_data->vtable.GetInstanceProcAddr(instance, funcName);
 }