+ struct instance_data *instance_data = device_data->instance;
+ uint32_t query_results[OVERLAY_QUERY_COUNT];
+
+ device_data->frame_stats.stats[OVERLAY_PARAM_ENABLED_frame]++;
+
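+   /* Nothing to read back unless command buffers carrying overlay
+    * queries were submitted on this queue.
+    */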
+ if (list_length(&queue_data->running_command_buffer) > 0) {
+      /* Before reading the query results back, make sure the submitted
+       * operations have completed: submit an empty batch whose only
+       * effect is to signal the fence, then wait on it. Submission
+       * order guarantees the fence signals only once all previously
+       * submitted work on this queue has finished.
+       */
+ VK_CHECK(device_data->vtable.ResetFences(device_data->device,
+ 1, &queue_data->queries_fence));
+ VK_CHECK(device_data->vtable.QueueSubmit(queue, 0, NULL, queue_data->queries_fence));
+ VK_CHECK(device_data->vtable.WaitForFences(device_data->device,
+ 1, &queue_data->queries_fence,
+ VK_FALSE, UINT64_MAX));
+
+ /* Now get the results. */
+ list_for_each_entry_safe(struct command_buffer_data, cmd_buffer_data,
+ &queue_data->running_command_buffer, link) {
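+         /* The fence wait above ensures this command buffer's queries
+          * have completed, so it can safely be removed from the running
+          * list.
+          */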
+ list_delinit(&cmd_buffer_data->link);
+
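+         /* A pipeline statistics query returns one 32-bit counter per
+          * enabled statistic, OVERLAY_QUERY_COUNT of them per query
+          * slot.
+          */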
+ if (cmd_buffer_data->pipeline_query_pool) {
+ memset(query_results, 0, sizeof(query_results));
+ VK_CHECK(device_data->vtable.GetQueryPoolResults(device_data->device,
+ cmd_buffer_data->pipeline_query_pool,
+ cmd_buffer_data->query_index, 1,
+ sizeof(uint32_t) * OVERLAY_QUERY_COUNT,
+ query_results, 0, VK_QUERY_RESULT_WAIT_BIT));
+
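+            /* Accumulate each counter into its overlay parameter; the
+             * enum values from _vertices through _compute_invocations
+             * match the order of the query results.
+             */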
+ for (uint32_t i = OVERLAY_PARAM_ENABLED_vertices;
+ i <= OVERLAY_PARAM_ENABLED_compute_invocations; i++) {
+ device_data->frame_stats.stats[i] += query_results[i - OVERLAY_PARAM_ENABLED_vertices];
+ }
+ }
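+         /* Each command buffer records two timestamps (start and end),
+          * stored as consecutive 64-bit query results.
+          */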
+ if (cmd_buffer_data->timestamp_query_pool) {
+ uint64_t gpu_timestamps[2] = { 0 };
+ VK_CHECK(device_data->vtable.GetQueryPoolResults(device_data->device,
+ cmd_buffer_data->timestamp_query_pool,
+ cmd_buffer_data->query_index * 2, 2,
+ 2 * sizeof(uint64_t), gpu_timestamps, sizeof(uint64_t),
+ VK_QUERY_RESULT_WAIT_BIT | VK_QUERY_RESULT_64_BIT));
+
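+            /* Mask off the bits beyond the queue's valid timestamp bits
+             * (presumably derived from timestampValidBits), then convert
+             * the tick delta to nanoseconds with timestampPeriod.
+             */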
+ gpu_timestamps[0] &= queue_data->timestamp_mask;
+ gpu_timestamps[1] &= queue_data->timestamp_mask;
+ device_data->frame_stats.stats[OVERLAY_PARAM_ENABLED_gpu_timing] +=
+ (gpu_timestamps[1] - gpu_timestamps[0]) *
+ device_data->properties.limits.timestampPeriod;
+ }