anv: Implement VK_KHR_performance_query
authorLionel Landwerlin <lionel.g.landwerlin@intel.com>
Sat, 6 Oct 2018 18:12:34 +0000 (19:12 +0100)
committerLionel Landwerlin <lionel.g.landwerlin@intel.com>
Wed, 20 May 2020 11:02:27 +0000 (14:02 +0300)
This has the same kernel requirements as VK_INTEL_performance_query

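For reviewers, the intended application flow under this extension looks
roughly like the sketch below (hypothetical placeholder names:
physical_device, device, queue, cmd_buffer, counter_count/counter_indices;
no error handling):

    uint32_t n_passes;
    VkQueryPoolPerformanceCreateInfoKHR perf_info = {
       .sType = VK_STRUCTURE_TYPE_QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR,
       .queueFamilyIndex = 0,
       .counterIndexCount = counter_count,
       .pCounterIndices = counter_indices,
    };
    /* perf_info is also chained into the VkQueryPoolCreateInfo used to
     * create a VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR pool.
     */
    vkGetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR(physical_device,
                                                            &perf_info,
                                                            &n_passes);

    VkAcquireProfilingLockInfoKHR lock_info = {
       .sType = VK_STRUCTURE_TYPE_ACQUIRE_PROFILING_LOCK_INFO_KHR,
    };
    vkAcquireProfilingLockKHR(device, &lock_info);

    /* Record the command buffer once, then submit it once per pass. */
    for (uint32_t pass = 0; pass < n_passes; pass++) {
       VkPerformanceQuerySubmitInfoKHR pass_info = {
          .sType = VK_STRUCTURE_TYPE_PERFORMANCE_QUERY_SUBMIT_INFO_KHR,
          .counterPassIndex = pass,
       };
       VkSubmitInfo submit = {
          .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
          .pNext = &pass_info,
          .commandBufferCount = 1,
          .pCommandBuffers = &cmd_buffer,
       };
       vkQueueSubmit(queue, 1, &submit, VK_NULL_HANDLE);
    }
    vkQueueWaitIdle(queue);
    /* Results are then read back with vkGetQueryPoolResults(). */
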
v2: Fix empty queue submit (Lionel)

v3: Fix autotools build issue (Piotr Byszewski)

v4: Fix Reset & Begin/End in the same command buffer; using soft-pin &
    relocation on the same buffer won't currently work. This version
    uses a somewhat dirty trick in anv_execbuf_add_bo (Piotr Byszewski)

v5: Fix enumeration with null pointers for either pCounters or
    pCounterDescriptions (Piotr)
    Fix return condition on enumeration (Lionel)
    Set counter uuid using sha1 hashes (Lionel)

v6: Fix counters scope, should be COMMAND_KHR not COMMAND_BUFFER_KHR (Lionel)

v7: Rebase (Lionel)

v8: Rework checking for loaded queries (Lionel)

v9: Use new i915-perf interface

v10: Use anv_multialloc (Jason)

v11: Implement perf query passes using self modifying batches (Lionel)
     Limit support to softpin/gen8

v12: Remove spurious changes (Jason)

v13: Drop relocs (Jason)

v14: Avoid overwriting .sType in
     VkPerformanceCounterKHR/VkPerformanceCounterDescriptionKHR (Lionel)

v15: Don't copy the entire
     VkPerformanceCounterKHR/VkPerformanceCounterDescriptionKHR (Jason)
     Reuse anv_batch rather than custom packing (Jason)

v16: Fix missing MI_BB_END in reconfiguration batch
     Only advertise the extension when the kernel has support (perf_version >= 3)

v17: Some cleanup of unused stuff

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/2775>

src/intel/vulkan/anv_batch_chain.c
src/intel/vulkan/anv_cmd_buffer.c
src/intel/vulkan/anv_device.c
src/intel/vulkan/anv_extensions.py
src/intel/vulkan/anv_perf.c
src/intel/vulkan/anv_private.h
src/intel/vulkan/anv_queue.c
src/intel/vulkan/genX_cmd_buffer.c
src/intel/vulkan/genX_query.c

index 2c3b7b3cad1f21d1d4af4446bd7d53a5141a94f4..f820a69ceec5851ecd0f0f230392946d5a3c4587 100644 (file)
@@ -31,6 +31,7 @@
 
 #include "genxml/gen8_pack.h"
 #include "genxml/genX_bits.h"
+#include "perf/gen_perf.h"
 
 #include "util/debug.h"
 
@@ -1102,6 +1103,8 @@ struct anv_execbuf {
 
    const VkAllocationCallbacks *             alloc;
    VkSystemAllocationScope                   alloc_scope;
+
+   int                                       perf_query_pass;
 };
 
 static void
@@ -1375,6 +1378,9 @@ static bool
 relocate_cmd_buffer(struct anv_cmd_buffer *cmd_buffer,
                     struct anv_execbuf *exec)
 {
+   if (cmd_buffer->perf_query_pool)
+      return false;
+
    if (!exec->has_relocs)
       return true;
 
@@ -1672,6 +1678,7 @@ anv_queue_execbuf_locked(struct anv_queue *queue,
    anv_execbuf_init(&execbuf);
    execbuf.alloc = submit->alloc;
    execbuf.alloc_scope = submit->alloc_scope;
+   execbuf.perf_query_pass = submit->perf_query_pass;
 
    VkResult result;
 
@@ -1708,10 +1715,26 @@ anv_queue_execbuf_locked(struct anv_queue *queue,
    if (result != VK_SUCCESS)
       goto error;
 
+   const bool has_perf_query =
+      submit->perf_query_pass >= 0 &&
+      submit->cmd_buffer &&
+      submit->cmd_buffer->perf_query_pool;
+
    if (unlikely(INTEL_DEBUG & DEBUG_BATCH)) {
       if (submit->cmd_buffer) {
-         struct anv_batch_bo **bo = u_vector_tail(&submit->cmd_buffer->seen_bbos);
+         if (has_perf_query) {
+            struct anv_query_pool *query_pool = submit->cmd_buffer->perf_query_pool;
+            struct anv_bo *pass_batch_bo = query_pool->bo;
+            uint64_t pass_batch_offset =
+               khr_perf_query_preamble_offset(query_pool,
+                                              submit->perf_query_pass);
+
+            gen_print_batch(&device->decoder_ctx,
+                            pass_batch_bo->map + pass_batch_offset, 64,
+                            pass_batch_bo->offset + pass_batch_offset, false);
+         }
 
+         struct anv_batch_bo **bo = u_vector_tail(&submit->cmd_buffer->seen_bbos);
          device->cmd_buffer_being_decoded = submit->cmd_buffer;
          gen_print_batch(&device->decoder_ctx, (*bo)->bo->map,
                          (*bo)->bo->size, (*bo)->bo->offset, false);
@@ -1742,6 +1765,48 @@ anv_queue_execbuf_locked(struct anv_queue *queue,
    if (submit->need_out_fence)
       execbuf.execbuf.flags |= I915_EXEC_FENCE_OUT;
 
+   if (has_perf_query) {
+      struct anv_query_pool *query_pool = submit->cmd_buffer->perf_query_pool;
+      assert(submit->perf_query_pass < query_pool->n_passes);
+      struct gen_perf_query_info *query_info =
+         query_pool->pass_query[submit->perf_query_pass];
+
+      /* Some performance queries use just the pipeline statistics HW, so no
+       * OA is needed in that case and no reconfiguration is required.
+       */
+      if (query_info->kind == GEN_PERF_QUERY_TYPE_OA ||
+          query_info->kind == GEN_PERF_QUERY_TYPE_RAW) {
+         int ret = gen_ioctl(device->perf_fd, I915_PERF_IOCTL_CONFIG,
+                             (void *)(uintptr_t) query_info->oa_metrics_set_id);
+         if (ret < 0) {
+            result = anv_device_set_lost(device,
+                                         "i915-perf config failed: %s",
+                                         strerror(errno));
+         }
+      }
+
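+      /* The pass preamble batch (written at pool creation, see
+       * genX(CreateQueryPool)) loads this pass's offset into
+       * ANV_PERF_QUERY_OFFSET_REG. Submit it on the same context right
+       * before the command buffer so that the self-modifying stores
+       * recorded there patch in the right pass slot.
+       */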
+      struct anv_bo *pass_batch_bo = query_pool->bo;
+
+      struct drm_i915_gem_exec_object2 query_pass_object = {
+         .handle = pass_batch_bo->gem_handle,
+         .offset = pass_batch_bo->offset,
+         .flags  = pass_batch_bo->flags,
+      };
+      struct drm_i915_gem_execbuffer2 query_pass_execbuf = {
+         .buffers_ptr = (uintptr_t) &query_pass_object,
+         .buffer_count = 1,
+         .batch_start_offset = khr_perf_query_preamble_offset(query_pool,
+                                                              submit->perf_query_pass),
+         .flags = I915_EXEC_HANDLE_LUT | I915_EXEC_RENDER,
+         .rsvd1 = device->context_id,
+      };
+
+      int ret = queue->device->no_hw ? 0 :
+         anv_gem_execbuffer(queue->device, &query_pass_execbuf);
+      if (ret)
+         result = anv_queue_set_lost(queue, "execbuf2 failed: %m");
+   }
+
    int ret = queue->device->no_hw ? 0 :
       anv_gem_execbuffer(queue->device, &execbuf.execbuf);
    if (ret)
index 49c7334567f1398c120e8c89af35863912a8babf..ea5ec415340952b6b866a205eb4dea687a80d9d6 100644 (file)
@@ -306,6 +306,7 @@ VkResult
 anv_cmd_buffer_reset(struct anv_cmd_buffer *cmd_buffer)
 {
    cmd_buffer->usage_flags = 0;
+   cmd_buffer->perf_query_pool = NULL;
    anv_cmd_buffer_reset_batch_bo_chain(cmd_buffer);
    anv_cmd_state_reset(cmd_buffer);
 
index 71fc427aa9247368e73ec5e1f55d614774e90b88..d9f15b46332d3c726108ad6bf3f3ba70937abac6 100644 (file)
@@ -1238,6 +1238,15 @@ void anv_GetPhysicalDeviceFeatures2(
          break;
       }
 
+      case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PERFORMANCE_QUERY_FEATURES_KHR: {
+         VkPhysicalDevicePerformanceQueryFeaturesKHR *feature =
+            (VkPhysicalDevicePerformanceQueryFeaturesKHR *)ext;
+         feature->performanceCounterQueryPools = true;
+         /* HW only supports a single configuration at a time. */
+         feature->performanceCounterMultipleQueryPools = false;
+         break;
+      }
+
       case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PIPELINE_EXECUTABLE_PROPERTIES_FEATURES_KHR: {
          VkPhysicalDevicePipelineExecutablePropertiesFeaturesKHR *features =
             (VkPhysicalDevicePipelineExecutablePropertiesFeaturesKHR *)ext;
@@ -1903,6 +1912,16 @@ void anv_GetPhysicalDeviceProperties2(
          break;
       }
 
+      case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PERFORMANCE_QUERY_PROPERTIES_KHR: {
+         VkPhysicalDevicePerformanceQueryPropertiesKHR *properties =
+            (VkPhysicalDevicePerformanceQueryPropertiesKHR *)ext;
+         /* We could support this by spawning a shader to do the equation
+          * normalization.
+          */
+         properties->allowCommandBufferQueryCopies = false;
+         break;
+      }
+
       case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_POINT_CLIPPING_PROPERTIES: {
          VkPhysicalDevicePointClippingProperties *properties =
             (VkPhysicalDevicePointClippingProperties *) ext;
index 926061bb9978b89f0b19905ae366b4696c7d4a40..dabe675971cba168acd88cef59c1fe18fbdd6748 100644 (file)
@@ -87,6 +87,7 @@ EXTENSIONS = [
     Extension('VK_KHR_maintenance2',                      1, True),
     Extension('VK_KHR_maintenance3',                      1, True),
     Extension('VK_KHR_multiview',                         1, True),
+    Extension('VK_KHR_performance_query',                 1, 'device->use_softpin && device->perf && device->perf->i915_perf_version >= 3'),
     Extension('VK_KHR_pipeline_executable_properties',    1, True),
     Extension('VK_KHR_push_descriptor',                   1, True),
     Extension('VK_KHR_relaxed_block_layout',              1, True),
index 133315b2c8d45b021cbf558a2a1d1e96ea7912c1..e8575b1bd702378c15b95b786f0f39043174bc08 100644 (file)
 #include <stdint.h>
 
 #include "anv_private.h"
+#include "vk_util.h"
 
 #include "perf/gen_perf.h"
 #include "perf/gen_perf_mdapi.h"
 
+#include "util/mesa-sha1.h"
+
 struct gen_perf_config *
 anv_get_perf(const struct gen_device_info *devinfo, int fd)
 {
+   /* We need self-modifying batches. The i915 command parser prevents that on
+    * Gen7.5 :( maybe one day.
+    */
+   if (devinfo->gen < 8)
+      return NULL;
+
    struct gen_perf_config *perf = gen_perf_new(NULL);
 
    gen_perf_init_metrics(perf, devinfo, fd, false /* pipeline statistics */);
 
+   if (!perf->n_queries) {
+      if (perf->platform_supported)
+         intel_logw("Performance support disabled, "
+                    "consider sysctl dev.i915.perf_stream_paranoid=0\n");
+      goto err;
+   }
+
    /* We need DRM_I915_PERF_PROP_HOLD_PREEMPTION support, only available in
     * perf revision 2.
     */
@@ -103,6 +119,7 @@ anv_device_perf_open(struct anv_device *device, uint64_t metric_id)
    return stream_fd;
 }
 
+/* VK_INTEL_performance_query */
 VkResult anv_InitializePerformanceApiINTEL(
     VkDevice                                    _device,
     const VkInitializePerformanceApiInfoINTEL*  pInitializeInfo)
@@ -226,3 +243,175 @@ void anv_UninitializePerformanceApiINTEL(
       device->perf_fd = -1;
    }
 }
+
+/* VK_KHR_performance_query */
+static const VkPerformanceCounterUnitKHR
+gen_perf_counter_unit_to_vk_unit[] = {
+   [GEN_PERF_COUNTER_UNITS_BYTES]                                = VK_PERFORMANCE_COUNTER_UNIT_BYTES_KHR,
+   [GEN_PERF_COUNTER_UNITS_HZ]                                   = VK_PERFORMANCE_COUNTER_UNIT_HERTZ_KHR,
+   [GEN_PERF_COUNTER_UNITS_NS]                                   = VK_PERFORMANCE_COUNTER_UNIT_NANOSECONDS_KHR,
+   [GEN_PERF_COUNTER_UNITS_US]                                   = VK_PERFORMANCE_COUNTER_UNIT_NANOSECONDS_KHR, /* todo: VK has no µs unit, converted to ns in anv_perf_write_pass_results() */
+   [GEN_PERF_COUNTER_UNITS_PIXELS]                               = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
+   [GEN_PERF_COUNTER_UNITS_TEXELS]                               = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
+   [GEN_PERF_COUNTER_UNITS_THREADS]                              = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
+   [GEN_PERF_COUNTER_UNITS_PERCENT]                              = VK_PERFORMANCE_COUNTER_UNIT_PERCENTAGE_KHR,
+   [GEN_PERF_COUNTER_UNITS_MESSAGES]                             = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
+   [GEN_PERF_COUNTER_UNITS_NUMBER]                               = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
+   [GEN_PERF_COUNTER_UNITS_CYCLES]                               = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
+   [GEN_PERF_COUNTER_UNITS_EVENTS]                               = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
+   [GEN_PERF_COUNTER_UNITS_UTILIZATION]                          = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
+   [GEN_PERF_COUNTER_UNITS_EU_SENDS_TO_L3_CACHE_LINES]           = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
+   [GEN_PERF_COUNTER_UNITS_EU_ATOMIC_REQUESTS_TO_L3_CACHE_LINES] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
+   [GEN_PERF_COUNTER_UNITS_EU_REQUESTS_TO_L3_CACHE_LINES]        = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
+   [GEN_PERF_COUNTER_UNITS_EU_BYTES_PER_L3_CACHE_LINE]           = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
+};
+
+static const VkPerformanceCounterStorageKHR
+gen_perf_counter_data_type_to_vk_storage[] = {
+   [GEN_PERF_COUNTER_DATA_TYPE_BOOL32] = VK_PERFORMANCE_COUNTER_STORAGE_UINT32_KHR,
+   [GEN_PERF_COUNTER_DATA_TYPE_UINT32] = VK_PERFORMANCE_COUNTER_STORAGE_UINT32_KHR,
+   [GEN_PERF_COUNTER_DATA_TYPE_UINT64] = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
+   [GEN_PERF_COUNTER_DATA_TYPE_FLOAT]  = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
+   [GEN_PERF_COUNTER_DATA_TYPE_DOUBLE] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT64_KHR,
+};
+
+VkResult anv_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
+    VkPhysicalDevice                            physicalDevice,
+    uint32_t                                    queueFamilyIndex,
+    uint32_t*                                   pCounterCount,
+    VkPerformanceCounterKHR*                    pCounters,
+    VkPerformanceCounterDescriptionKHR*         pCounterDescriptions)
+{
+   ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice);
+   struct gen_perf_config *perf = pdevice->perf;
+
+   uint32_t desc_count = *pCounterCount;
+
+   VK_OUTARRAY_MAKE(out, pCounters, pCounterCount);
+   VK_OUTARRAY_MAKE(out_desc, pCounterDescriptions, &desc_count);
+
+   for (int c = 0; c < (perf ? perf->n_counters : 0); c++) {
+      const struct gen_perf_query_counter *gen_counter = perf->counters[c];
+
+      vk_outarray_append(&out, counter) {
+         counter->unit = gen_perf_counter_unit_to_vk_unit[gen_counter->units];
+         counter->scope = VK_QUERY_SCOPE_COMMAND_KHR;
+         counter->storage = gen_perf_counter_data_type_to_vk_storage[gen_counter->data_type];
+
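+         /* Counters need a stable UUID; derive one from a SHA1 hash of the
+          * counter's symbol name.
+          */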
+         unsigned char sha1_result[20];
+         _mesa_sha1_compute(gen_counter->symbol_name,
+                            strlen(gen_counter->symbol_name),
+                            sha1_result);
+         memcpy(counter->uuid, sha1_result, sizeof(counter->uuid));
+      }
+
+      vk_outarray_append(&out_desc, desc) {
+         desc->flags = 0; /* None so far. */
+         snprintf(desc->name, sizeof(desc->name), "%s", gen_counter->name);
+         snprintf(desc->category, sizeof(desc->category), "%s", gen_counter->category);
+         snprintf(desc->description, sizeof(desc->description), "%s", gen_counter->desc);
+      }
+   }
+
+   return vk_outarray_status(&out);
+}
+
+void anv_GetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR(
+    VkPhysicalDevice                            physicalDevice,
+    const VkQueryPoolPerformanceCreateInfoKHR*  pPerformanceQueryCreateInfo,
+    uint32_t*                                   pNumPasses)
+{
+   ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice);
+   struct gen_perf_config *perf = pdevice->perf;
+
+   if (!perf) {
+      *pNumPasses = 0;
+      return;
+   }
+
+   *pNumPasses = gen_perf_get_n_passes(perf,
+                                       pPerformanceQueryCreateInfo->pCounterIndices,
+                                       pPerformanceQueryCreateInfo->counterIndexCount,
+                                       NULL);
+}
+
+VkResult anv_AcquireProfilingLockKHR(
+    VkDevice                                    _device,
+    const VkAcquireProfilingLockInfoKHR*        pInfo)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   struct gen_perf_config *perf = device->physical->perf;
+   struct gen_perf_query_info *first_metric_set = &perf->queries[0];
+
+   assert(device->perf_fd == -1);
+
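+   /* Open an i915-perf stream on the first metric set; each submitted pass
+    * later switches the configuration with I915_PERF_IOCTL_CONFIG (see
+    * anv_queue_execbuf_locked).
+    */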
+   int fd = anv_device_perf_open(device, first_metric_set->oa_metrics_set_id);
+   if (fd < 0)
+      return VK_TIMEOUT;
+
+   device->perf_fd = fd;
+   return VK_SUCCESS;
+}
+
+void anv_ReleaseProfilingLockKHR(
+    VkDevice                                    _device)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+
+   assert(device->perf_fd >= 0);
+   close(device->perf_fd);
+   device->perf_fd = -1;
+}
+
+void
+anv_perf_write_pass_results(struct gen_perf_config *perf,
+                            struct anv_query_pool *pool, uint32_t pass,
+                            const struct gen_perf_query_result *accumulated_results,
+                            union VkPerformanceCounterResultKHR *results)
+{
+   for (uint32_t c = 0; c < pool->n_counters; c++) {
+      const struct gen_perf_counter_pass *counter_pass = &pool->counter_pass[c];
+
+      if (counter_pass->pass != pass)
+         continue;
+
+      switch (pool->pass_query[pass]->kind) {
+      case GEN_PERF_QUERY_TYPE_PIPELINE: {
+         assert(counter_pass->counter->data_type == GEN_PERF_COUNTER_DATA_TYPE_UINT64);
+         uint32_t accu_offset = counter_pass->counter->offset / sizeof(uint64_t);
+         results[c].uint64 = accumulated_results->accumulator[accu_offset];
+         break;
+      }
+
+      case GEN_PERF_QUERY_TYPE_OA:
+      case GEN_PERF_QUERY_TYPE_RAW:
+         switch (counter_pass->counter->data_type) {
+         case GEN_PERF_COUNTER_DATA_TYPE_UINT64:
+            results[c].uint64 =
+               counter_pass->counter->oa_counter_read_uint64(perf,
+                                                             counter_pass->query,
+                                                             accumulated_results->accumulator);
+            break;
+         case GEN_PERF_COUNTER_DATA_TYPE_FLOAT:
+            results[c].float32 =
+               counter_pass->counter->oa_counter_read_float(perf,
+                                                            counter_pass->query,
+                                                            accumulated_results->accumulator);
+            break;
+         default:
+            /* So far we aren't using uint32, double or bool32... */
+            unreachable("unexpected counter data type");
+         }
+         break;
+
+      default:
+         unreachable("invalid query type");
+      }
+
+      /* The Vulkan extension has no microseconds unit, so convert to nanoseconds. */
+      if (counter_pass->counter->units == GEN_PERF_COUNTER_UNITS_US) {
+         assert(counter_pass->counter->data_type == GEN_PERF_COUNTER_DATA_TYPE_UINT64);
+         results[c].uint64 *= 1000;
+      }
+   }
+}
index 3a0563ae83cc079d9dfc63453edbf6a085325428..fa44307457b8ded615df824f2518a5c41bfc31a0 100644 (file)
@@ -79,6 +79,8 @@ struct anv_instance;
 
 struct gen_aux_map_context;
 struct gen_perf_config;
+struct gen_perf_counter_pass;
+struct gen_perf_query_result;
 
 #include <vulkan/vulkan.h>
 #include <vulkan/vulkan_intel.h>
@@ -221,6 +223,12 @@ struct gen_perf_config;
  */
 #define ANV_PREDICATE_RESULT_REG 0x2678 /* MI_ALU_REG15 */
 
+/* We reserve this MI ALU register to pass around an offset computed from
+ * VkPerformanceQuerySubmitInfoKHR::counterPassIndex (VK_KHR_performance_query).
+ * Other code which uses the MI ALU should leave it alone.
+ */
+#define ANV_PERF_QUERY_OFFSET_REG 0x2670 /* MI_ALU_REG14 */
+
 /* For gen12 we set the streamout buffers using 4 separate commands
  * (3DSTATE_SO_BUFFER_INDEX_*) instead of 3DSTATE_SO_BUFFER. However the layout
  * of the 3DSTATE_SO_BUFFER_INDEX_* commands is identical to that of
@@ -1193,6 +1201,8 @@ struct anv_queue_submit {
     */
    uintptr_t *                               fence_bos;
 
+   int                                       perf_query_pass;
+
    const VkAllocationCallbacks *             alloc;
    VkSystemAllocationScope                   alloc_scope;
 
@@ -1757,6 +1767,11 @@ _anv_combine_address(struct anv_batch *batch, void *location,
            _dst = NULL;                                                 \
          }))
 
+
 struct anv_device_memory {
    struct vk_object_base                        base;
 
@@ -2875,6 +2890,8 @@ struct anv_cmd_buffer {
    VkCommandBufferUsageFlags                    usage_flags;
    VkCommandBufferLevel                         level;
 
+   struct anv_query_pool                       *perf_query_pool;
+
    struct anv_cmd_state                         state;
 
    struct anv_address                           return_addr;
@@ -2898,7 +2915,8 @@ VkResult anv_cmd_buffer_execbuf(struct anv_queue *queue,
                                 const VkSemaphore *out_semaphores,
                                 const uint64_t *out_signal_values,
                                 uint32_t num_out_semaphores,
-                                VkFence fence);
+                                VkFence fence,
+                                int perf_query_pass);
 
 VkResult anv_cmd_buffer_reset(struct anv_cmd_buffer *cmd_buffer);
 
@@ -4227,6 +4245,9 @@ struct anv_render_pass {
 
 #define ANV_PIPELINE_STATISTICS_MASK 0x000007ff
 
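+/* Per-pass slot: 8 bytes of availability plus the pass preamble batch,
+ * padded to 64 bytes, then begin/end OA reports (2 * 256b) = 576 bytes.
+ * See the layout diagram in genX_query.c.
+ */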
+#define OA_SNAPSHOT_SIZE (256)
+#define ANV_KHR_PERF_QUERY_SIZE (ALIGN(sizeof(uint64_t), 64) + 2 * OA_SNAPSHOT_SIZE)
+
 struct anv_query_pool {
    struct vk_object_base                        base;
 
@@ -4237,8 +4258,21 @@ struct anv_query_pool {
    /** Number of slots in this query pool */
    uint32_t                                     slots;
    struct anv_bo *                              bo;
+
+   /* Perf queries: */
+   struct anv_bo                                reset_bo;
+   uint32_t                                     n_counters;
+   struct gen_perf_counter_pass                *counter_pass;
+   uint32_t                                     n_passes;
+   struct gen_perf_query_info                 **pass_query;
 };
 
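+/* The self-modifying preamble batches live in the first query's slots,
+ * right after each pass's 8-byte availability value.
+ */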
+static inline uint32_t khr_perf_query_preamble_offset(struct anv_query_pool *pool,
+                                                      uint32_t pass)
+{
+   return pass * ANV_KHR_PERF_QUERY_SIZE + 8;
+}
+
 int anv_get_instance_entrypoint_index(const char *name);
 int anv_get_device_entrypoint_index(const char *name);
 int anv_get_physical_device_entrypoint_index(const char *name);
@@ -4292,6 +4326,10 @@ anv_get_subpass_id(const struct anv_cmd_state * const cmd_state)
 
 struct gen_perf_config *anv_get_perf(const struct gen_device_info *devinfo, int fd);
 void anv_device_perf_init(struct anv_device *device);
+void anv_perf_write_pass_results(struct gen_perf_config *perf,
+                                 struct anv_query_pool *pool, uint32_t pass,
+                                 const struct gen_perf_query_result *accumulated_results,
+                                 union VkPerformanceCounterResultKHR *results);
 
 #define ANV_FROM_HANDLE(__anv_type, __name, __handle) \
    VK_FROM_HANDLE(__anv_type, __name, __handle)
index 009675e23ead7820eaa946dc1d22e0463e08eb7c..f6e3fdd6177cf960aea352a260d26fa49df5d9b1 100644 (file)
@@ -544,7 +544,7 @@ anv_queue_submit_add_timeline_signal(struct anv_queue_submit* submit,
 }
 
 static struct anv_queue_submit *
-anv_queue_submit_alloc(struct anv_device *device)
+anv_queue_submit_alloc(struct anv_device *device, int perf_query_pass)
 {
    const VkAllocationCallbacks *alloc = &device->vk.alloc;
    VkSystemAllocationScope alloc_scope = VK_SYSTEM_ALLOCATION_SCOPE_DEVICE;
@@ -557,6 +557,7 @@ anv_queue_submit_alloc(struct anv_device *device)
    submit->alloc_scope = alloc_scope;
    submit->in_fence = -1;
    submit->out_fence = -1;
+   submit->perf_query_pass = perf_query_pass;
 
    return submit;
 }
@@ -569,7 +570,7 @@ anv_queue_submit_simple_batch(struct anv_queue *queue,
       return VK_SUCCESS;
 
    struct anv_device *device = queue->device;
-   struct anv_queue_submit *submit = anv_queue_submit_alloc(device);
+   struct anv_queue_submit *submit = anv_queue_submit_alloc(device, -1);
    if (!submit)
       return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
 
@@ -720,12 +721,13 @@ anv_queue_submit(struct anv_queue *queue,
                  const uint64_t *out_values,
                  uint32_t num_out_semaphores,
                  struct anv_bo *wsi_signal_bo,
-                 VkFence _fence)
+                 VkFence _fence,
+                 int perf_query_pass)
 {
    ANV_FROM_HANDLE(anv_fence, fence, _fence);
    struct anv_device *device = queue->device;
    UNUSED struct anv_physical_device *pdevice = device->physical;
-   struct anv_queue_submit *submit = anv_queue_submit_alloc(device);
+   struct anv_queue_submit *submit = anv_queue_submit_alloc(device, perf_query_pass);
    if (!submit)
       return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
 
@@ -972,7 +974,7 @@ VkResult anv_QueueSubmit(
        * common case.
        */
       result = anv_queue_submit(queue, NULL, NULL, NULL, 0, NULL, NULL, 0,
-                                NULL, fence);
+                                NULL, fence, -1);
       goto out;
    }
 
@@ -990,6 +992,9 @@ VkResult anv_QueueSubmit(
       const VkTimelineSemaphoreSubmitInfoKHR *timeline_info =
          vk_find_struct_const(pSubmits[i].pNext,
                               TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR);
+      const VkPerformanceQuerySubmitInfoKHR *perf_info =
+         vk_find_struct_const(pSubmits[i].pNext,
+                              PERFORMANCE_QUERY_SUBMIT_INFO_KHR);
       const uint64_t *wait_values =
          timeline_info && timeline_info->waitSemaphoreValueCount ?
          timeline_info->pWaitSemaphoreValues : NULL;
@@ -1011,7 +1016,8 @@ VkResult anv_QueueSubmit(
                                    signal_values,
                                    pSubmits[i].signalSemaphoreCount,
                                    wsi_signal_bo,
-                                   submit_fence);
+                                   submit_fence,
+                                   -1);
          if (result != VK_SUCCESS)
             goto out;
 
@@ -1049,7 +1055,8 @@ VkResult anv_QueueSubmit(
          result = anv_queue_submit(queue, cmd_buffer,
                                    in_semaphores, in_values, num_in_semaphores,
                                    out_semaphores, out_values, num_out_semaphores,
-                                   wsi_signal_bo, execbuf_fence);
+                                   wsi_signal_bo, execbuf_fence,
+                                   perf_info ? perf_info->counterPassIndex : 0);
          if (result != VK_SUCCESS)
             goto out;
       }
index 50670d64a89a9e149a07bd699d42da05686c2ea0..e1389699750bbd1a1560f051196af9fb70587707 100644 (file)
 #include "genxml/gen_macros.h"
 #include "genxml/genX_pack.h"
 
-/* We reserve GPR 15 for conditional rendering */
-#define GEN_MI_BUILDER_NUM_ALLOC_GPRS 15
+/* We reserve:
+ *    - GPR 14 for secondary command buffer returns
+ *    - GPR 15 for conditional rendering
+ */
+#define GEN_MI_BUILDER_NUM_ALLOC_GPRS 14
 #define __gen_get_batch_dwords anv_batch_emit_dwords
 #define __gen_address_offset anv_address_add
 #include "common/gen_mi_builder.h"
@@ -1755,6 +1758,11 @@ genX(CmdExecuteCommands)(
       }
 
       anv_cmd_buffer_add_secondary(primary, secondary);
+
+      assert(secondary->perf_query_pool == NULL || primary->perf_query_pool == NULL ||
+             secondary->perf_query_pool == primary->perf_query_pool);
+      if (secondary->perf_query_pool)
+         primary->perf_query_pool = secondary->perf_query_pool;
    }
 
    /* The secondary isn't counted in our VF cache tracking so we need to
index 17ccfc66dc9a3216829e2b9a33fc87eda46f7fb3..3fd662cc06249d0b810a7002b30818d36cee6188 100644 (file)
 #include "genxml/gen_macros.h"
 #include "genxml/genX_pack.h"
 
-/* We reserve GPR 15 for conditional rendering */
-#define GEN_MI_BUILDER_NUM_ALLOC_GPRS 15
+/* We reserve:
+ *    - GPR 14 for perf queries
+ *    - GPR 15 for conditional rendering
+ */
+#define GEN_MI_BUILDER_NUM_ALLOC_GPRS 14
+#define GEN_MI_BUILDER_CAN_WRITE_BATCH GEN_GEN >= 8
 #define __gen_get_batch_dwords anv_batch_emit_dwords
 #define __gen_address_offset anv_address_add
+#define __gen_get_batch_address(b, a) anv_address_physical(anv_batch_address(b, a))
 #include "common/gen_mi_builder.h"
 #include "perf/gen_perf.h"
 #include "perf/gen_perf_mdapi.h"
 
 #define OA_REPORT_N_UINT64 (256 / sizeof(uint64_t))
 
+#include "vk_util.h"
+
+static struct anv_address
+anv_query_address(struct anv_query_pool *pool, uint32_t query)
+{
+   return (struct anv_address) {
+      .bo = pool->bo,
+      .offset = query * pool->stride,
+   };
+}
+
 VkResult genX(CreateQueryPool)(
     VkDevice                                    _device,
     const VkQueryPoolCreateInfo*                pCreateInfo,
@@ -50,7 +66,11 @@ VkResult genX(CreateQueryPool)(
 {
    ANV_FROM_HANDLE(anv_device, device, _device);
    const struct anv_physical_device *pdevice = device->physical;
+   const VkQueryPoolPerformanceCreateInfoKHR *perf_query_info = NULL;
    struct anv_query_pool *pool;
+   struct gen_perf_counter_pass *counter_pass;
+   struct gen_perf_query_info **pass_query;
+   ANV_MULTIALLOC(ma);
    VkResult result;
 
    assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
@@ -65,17 +85,20 @@ VkResult genX(CreateQueryPool)(
     * 64bytes so we put those first and have the "available" bit behind
     * together with some other counters.
     */
-   uint32_t uint64s_per_slot = 1;
+   uint32_t uint64s_per_slot = 0;
+   UNUSED uint32_t n_passes = 0;
+
+   anv_multialloc_add(&ma, &pool, 1);
 
    VkQueryPipelineStatisticFlags pipeline_statistics = 0;
    switch (pCreateInfo->queryType) {
    case VK_QUERY_TYPE_OCCLUSION:
       /* Occlusion queries have two values: begin and end. */
-      uint64s_per_slot += 2;
+      uint64s_per_slot = 1 + 2;
       break;
    case VK_QUERY_TYPE_TIMESTAMP:
       /* Timestamps just have the one timestamp value */
-      uint64s_per_slot += 1;
+      uint64s_per_slot = 1 + 1;
       break;
    case VK_QUERY_TYPE_PIPELINE_STATISTICS:
       pipeline_statistics = pCreateInfo->pipelineStatistics;
@@ -85,25 +108,36 @@ VkResult genX(CreateQueryPool)(
       pipeline_statistics &= ANV_PIPELINE_STATISTICS_MASK;
 
       /* Statistics queries have a min and max for every statistic */
-      uint64s_per_slot += 2 * util_bitcount(pipeline_statistics);
+      uint64s_per_slot = 1 + 2 * util_bitcount(pipeline_statistics);
       break;
    case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
       /* Transform feedback queries are 4 values, begin/end for
        * written/available.
        */
-      uint64s_per_slot += 4;
+      uint64s_per_slot = 1 + 4;
       break;
-   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
+   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL:
       uint64s_per_slot = 72; /* 576 bytes, see layout below */
       break;
-   }
+   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
+      perf_query_info = vk_find_struct_const(pCreateInfo->pNext,
+                                             QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR);
+      n_passes = gen_perf_get_n_passes(pdevice->perf,
+                                       perf_query_info->pCounterIndices,
+                                       perf_query_info->counterIndexCount,
+                                       NULL);
+      anv_multialloc_add(&ma, &counter_pass, perf_query_info->counterIndexCount);
+      anv_multialloc_add(&ma, &pass_query, n_passes);
+      STATIC_ASSERT(ANV_KHR_PERF_QUERY_SIZE % sizeof(uint64_t) == 0);
+      uint64s_per_slot = (ANV_KHR_PERF_QUERY_SIZE / sizeof(uint64_t)) * n_passes;
+      break;
    default:
       assert(!"Invalid query type");
    }
 
-   pool = vk_alloc2(&device->vk.alloc, pAllocator, sizeof(*pool), 8,
-                     VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
-   if (pool == NULL)
+   if (!anv_multialloc_alloc2(&ma, &device->vk.alloc,
+                              pAllocator,
+                              VK_SYSTEM_ALLOCATION_SCOPE_OBJECT))
       return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
 
    vk_object_base_init(&device->vk, &pool->base, VK_OBJECT_TYPE_QUERY_POOL);
@@ -112,6 +146,21 @@ VkResult genX(CreateQueryPool)(
    pool->stride = uint64s_per_slot * sizeof(uint64_t);
    pool->slots = pCreateInfo->queryCount;
 
+   if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
+      pool->n_counters = perf_query_info->counterIndexCount;
+      pool->counter_pass = counter_pass;
+      gen_perf_get_counters_passes(pdevice->perf,
+                                   perf_query_info->pCounterIndices,
+                                   perf_query_info->counterIndexCount,
+                                   pool->counter_pass);
+      pool->n_passes = n_passes;
+      pool->pass_query = pass_query;
+      gen_perf_get_n_passes(pdevice->perf,
+                            perf_query_info->pCounterIndices,
+                            perf_query_info->counterIndexCount,
+                            pool->pass_query);
+   }
+
    uint32_t bo_flags = 0;
    if (pdevice->supports_48bit_addresses)
       bo_flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
@@ -131,6 +180,23 @@ VkResult genX(CreateQueryPool)(
    if (result != VK_SUCCESS)
       goto fail;
 
+   if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
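+      /* Write one small batch per pass into the pool BO. Each batch loads
+       * that pass's byte offset into ANV_PERF_QUERY_OFFSET_REG and is
+       * executed by anv_queue_execbuf_locked() just ahead of the
+       * application's command buffer.
+       */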
+      for (uint32_t p = 0; p < pool->n_passes; p++) {
+         struct gen_mi_builder b;
+         struct anv_batch batch = {
+            .start = pool->bo->map + ANV_KHR_PERF_QUERY_SIZE * p + 8,
+            .end = pool->bo->map + ANV_KHR_PERF_QUERY_SIZE * p + 64,
+         };
+         batch.next = batch.start;
+
+         gen_mi_builder_init(&b, &batch);
+         gen_mi_store(&b, gen_mi_reg64(ANV_PERF_QUERY_OFFSET_REG),
+                      gen_mi_imm(p * ANV_KHR_PERF_QUERY_SIZE));
+         anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe);
+         assert(batch.next <= (pool->bo->map + ANV_KHR_PERF_QUERY_SIZE * p + 64));
+      }
+   }
+
    *pQueryPool = anv_query_pool_to_handle(pool);
 
    return VK_SUCCESS;
@@ -157,15 +223,73 @@ void genX(DestroyQueryPool)(
    vk_free2(&device->vk.alloc, pAllocator, pool);
 }
 
-static struct anv_address
-anv_query_address(struct anv_query_pool *pool, uint32_t query)
+/**
+ * VK_KHR_performance_query layout (576 bytes * number of passes):
+ *
+ * -----------------------------------------
+ * |       availability (8b)    | |        |
+ * |----------------------------| |        |
+ * |     Small batch loading    | |        |
+ * |  ANV_PERF_QUERY_OFFSET_REG | |        |
+ * |          (56b)             | | Pass 0 |
+ * |----------------------------| |        |
+ * |     begin MI_RPC (256b)    | |        |
+ * |----------------------------| |        |
+ * |       end MI_RPC (256b)    | |        |
+ * |----------------------------|--        | Query 0
+ * |       availability (8b)    | |        |
+ * |----------------------------| |        |
+ * |     Small batch loading    | |        |
+ * |  ANV_PERF_QUERY_OFFSET_REG | |        |
+ * |          (56b)             | | Pass 1 |
+ * |----------------------------| |        |
+ * |     begin MI_RPC (256b)    | |        |
+ * |----------------------------| |        |
+ * |       end MI_RPC (256b)    | |        |
+ * |----------------------------|-----------
+ * |       availability (8b)    | |        |
+ * |----------------------------| |        |
+ * |        Unused (56b)        | |        |
+ * |----------------------------| | Pass 0 |
+ * |     begin MI_RPC (256b)    | |        |
+ * |----------------------------| |        | Query 1
+ * |       end MI_RPC (256b)    | |        |
+ * |----------------------------|--        |
+ * |             ...            | |        |
+ * -----------------------------------------
+ */
+UNUSED static uint64_t
+khr_perf_query_availability_offset(struct anv_query_pool *pool, uint32_t query, uint32_t pass)
 {
-   return (struct anv_address) {
-      .bo = pool->bo,
-      .offset = query * pool->stride,
-   };
+   return query * (pool->n_passes * ANV_KHR_PERF_QUERY_SIZE) +
+      pass * ANV_KHR_PERF_QUERY_SIZE;
+}
+
+UNUSED static uint64_t
+khr_perf_query_oa_offset(struct anv_query_pool *pool, uint32_t query, uint32_t pass, bool end)
+{
+   return query * (pool->n_passes * ANV_KHR_PERF_QUERY_SIZE) +
+      pass * ANV_KHR_PERF_QUERY_SIZE +
+      64 + (end ? OA_SNAPSHOT_SIZE : 0);
+}
+
+UNUSED static struct anv_address
+khr_perf_query_availability_address(struct anv_query_pool *pool, uint32_t query, uint32_t pass)
+{
+   return anv_address_add(
+      (struct anv_address) { .bo = pool->bo, },
+      khr_perf_query_availability_offset(pool, query, pass));
 }
 
+UNUSED static struct anv_address
+khr_perf_query_oa_address(struct anv_query_pool *pool, uint32_t query, uint32_t pass, bool end)
+{
+   return anv_address_add(
+      (struct anv_address) { .bo = pool->bo, },
+      khr_perf_query_oa_offset(pool, query, pass, end));
+}
+
 /**
  * VK_INTEL_performance_query layout (576 bytes) :
  *
@@ -238,7 +362,17 @@ query_slot(struct anv_query_pool *pool, uint32_t query)
 static bool
 query_is_available(struct anv_query_pool *pool, uint32_t query)
 {
-   return *(volatile uint64_t *)query_slot(pool, query);
+   if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
+      for (uint32_t p = 0; p < pool->n_passes; p++) {
+         volatile uint64_t *slot =
+            pool->bo->map + khr_perf_query_availability_offset(pool, query, p);
+         if (!slot[0])
+            return false;
+      }
+      return true;
+   } else {
+      return *(volatile uint64_t *)query_slot(pool, query);
+   }
 }
 
 static VkResult
@@ -275,6 +409,7 @@ VkResult genX(GetQueryPoolResults)(
           pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
           pool->type == VK_QUERY_TYPE_TIMESTAMP ||
           pool->type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT ||
+          pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR ||
           pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL);
 
    if (anv_device_is_lost(device))
@@ -305,6 +440,12 @@ VkResult genX(GetQueryPoolResults)(
        *    and vkGetQueryPoolResults returns VK_NOT_READY. However,
        *    availability state is still written to pData for those queries if
        *    VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set."
+       *
+       * From VK_KHR_performance_query :
+       *
+       *    "VK_QUERY_RESULT_PERFORMANCE_QUERY_RECORDED_COUNTERS_BIT_KHR specifies
+       *     that the result should contain the number of counters that were recorded
+       *     into a query pool of type VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR"
        */
       bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT);
 
@@ -367,6 +508,23 @@ VkResult genX(GetQueryPoolResults)(
          break;
       }
 
+#if GEN_GEN >= 8
+      case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
+         const struct anv_physical_device *pdevice = device->physical;
+         assert((flags & (VK_QUERY_RESULT_WITH_AVAILABILITY_BIT |
+                          VK_QUERY_RESULT_PARTIAL_BIT)) == 0);
+         for (uint32_t p = 0; p < pool->n_passes; p++) {
+            const uint32_t *begin = pool->bo->map + khr_perf_query_oa_offset(pool, firstQuery + i, p, false);
+            const uint32_t *end = pool->bo->map + khr_perf_query_oa_offset(pool, firstQuery + i, p, true);
+            struct gen_perf_query_result result;
+            gen_perf_query_result_clear(&result);
+            gen_perf_query_result_accumulate(&result, pool->pass_query[p], begin, end);
+            anv_perf_write_pass_results(pdevice->perf, pool, p, &result, pData);
+         }
+         break;
+      }
+#endif
+
       case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
          if (!write_results)
             break;
@@ -503,6 +661,23 @@ emit_zero_queries(struct anv_cmd_buffer *cmd_buffer,
       }
       break;
 
+#if GEN_GEN >= 8
+   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
+      for (uint32_t i = 0; i < num_queries; i++) {
+         for (uint32_t p = 0; p < pool->n_passes; p++) {
+            gen_mi_memset(b,
+                          khr_perf_query_oa_address(pool,
+                                                    first_index + i, p, false),
+                          0, 2 * OA_SNAPSHOT_SIZE);
+            emit_query_mi_availability(b,
+                                       khr_perf_query_availability_address(pool, first_index + i, p),
+                                       true);
+         }
+      }
+      break;
+   }
+#endif
+
    case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL:
       for (uint32_t i = 0; i < num_queries; i++) {
          struct anv_address slot_addr =
@@ -546,6 +721,23 @@ void genX(CmdResetQueryPool)(
       break;
    }
 
+#if GEN_GEN >= 8
+   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
+      struct gen_mi_builder b;
+      gen_mi_builder_init(&b, &cmd_buffer->batch);
+
+      for (uint32_t i = 0; i < queryCount; i++) {
+         for (uint32_t p = 0; p < pool->n_passes; p++) {
+            emit_query_mi_availability(
+               &b,
+               khr_perf_query_availability_address(pool, firstQuery + i, p),
+               false);
+         }
+      }
+      break;
+   }
+#endif
+
    case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
       struct gen_mi_builder b;
       gen_mi_builder_init(&b, &cmd_buffer->batch);
@@ -569,8 +761,16 @@ void genX(ResetQueryPool)(
    ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
 
    for (uint32_t i = 0; i < queryCount; i++) {
-      uint64_t *slot = query_slot(pool, firstQuery + i);
-      *slot = 0;
+      if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
+         for (uint32_t p = 0; p < pool->n_passes; p++) {
+            uint64_t *pass_slot = pool->bo->map +
+               khr_perf_query_availability_offset(pool, firstQuery + i, p);
+            *pass_slot = 0;
+         }
+      } else {
+         uint64_t *slot = query_slot(pool, firstQuery + i);
+         *slot = 0;
+      }
    }
 }
 
@@ -665,6 +865,41 @@ void genX(CmdBeginQueryIndexedEXT)(
       emit_xfb_query(&b, index, anv_address_add(query_addr, 8));
       break;
 
+#if GEN_GEN >= 8
+   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
+      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
+         pc.CommandStreamerStallEnable = true;
+         pc.StallAtPixelScoreboard = true;
+      }
+      cmd_buffer->perf_query_pool = pool;
+
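+      /* The MI_RPC below is recorded with the pass-0 address; at submit
+       * time, the stores emitted through gen_mi_store_address() rewrite its
+       * MemoryAddress field to that address plus ANV_PERF_QUERY_OFFSET_REG
+       * (loaded by the pass preamble batch).
+       */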
+      /* We know the bottom bits of the address are 0s which match what we
+       * want in the MI_RPC packet.
+       */
+      struct gen_mi_value mi_rpc_write_offset =
+         gen_mi_iadd(
+            &b,
+            gen_mi_imm(
+               gen_canonical_address(
+                  pool->bo->offset +
+                  khr_perf_query_oa_offset(pool, query, 0 /* pass */, false))),
+            gen_mi_reg64(ANV_PERF_QUERY_OFFSET_REG));
+      struct gen_mi_address_token mi_rpc_addr_dest =
+         gen_mi_store_address(&b, mi_rpc_write_offset);
+      gen_mi_self_mod_barrier(&b);
+
+      void *mi_rpc_dws =
+         anv_batch_emitn(&cmd_buffer->batch,
+                         GENX(MI_REPORT_PERF_COUNT_length),
+                         GENX(MI_REPORT_PERF_COUNT),
+                         .MemoryAddress = query_addr /* Will be overwritten */ );
+      _gen_mi_resolve_address_token(&b, mi_rpc_addr_dest,
+                                    mi_rpc_dws +
+                                    GENX(MI_REPORT_PERF_COUNT_MemoryAddress_start) / 8);
+      break;
+   }
+#endif
+
    case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
          pc.CommandStreamerStallEnable = true;
@@ -757,6 +992,60 @@ void genX(CmdEndQueryIndexedEXT)(
       emit_query_mi_availability(&b, query_addr, true);
       break;
 
+#if GEN_GEN >= 8
+   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
+      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
+         pc.CommandStreamerStallEnable = true;
+         pc.StallAtPixelScoreboard = true;
+      }
+
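+      /* Same self-modifying scheme as in CmdBeginQueryIndexedEXT, with an
+       * additional patched MI_STORE_DATA_IMM to mark this pass available.
+       */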
+      /* We know the bottom bits of the address are 0s which match what we
+       * want in the MI_RPC/MI_SDI packets.
+       */
+      struct gen_mi_value mi_rpc_write_offset =
+         gen_mi_iadd(
+            &b,
+            gen_mi_imm(
+               gen_canonical_address(
+                  pool->bo->offset +
+                  khr_perf_query_oa_offset(pool, query, 0 /* pass */, true))),
+            gen_mi_reg64(ANV_PERF_QUERY_OFFSET_REG));
+      struct gen_mi_value availability_write_offset =
+         gen_mi_iadd(
+            &b,
+            gen_mi_imm(
+               gen_canonical_address(
+                  pool->bo->offset +
+                  khr_perf_query_availability_offset(pool, query, 0 /* pass */))),
+            gen_mi_reg64(ANV_PERF_QUERY_OFFSET_REG));
+
+      struct gen_mi_address_token mi_rpc_addr_dest =
+         gen_mi_store_address(&b, mi_rpc_write_offset);
+      struct gen_mi_address_token availability_addr_dest =
+         gen_mi_store_address(&b, availability_write_offset);
+      gen_mi_self_mod_barrier(&b);
+
+      void *mi_rpc_dws =
+         anv_batch_emitn(&cmd_buffer->batch,
+                         GENX(MI_REPORT_PERF_COUNT_length),
+                         GENX(MI_REPORT_PERF_COUNT),
+                         .MemoryAddress = query_addr /* Will be overwritten */ );
+      _gen_mi_resolve_address_token(&b, mi_rpc_addr_dest,
+                                    mi_rpc_dws +
+                                    GENX(MI_REPORT_PERF_COUNT_MemoryAddress_start) / 8);
+
+      void *availability_dws =
+         anv_batch_emitn(&cmd_buffer->batch,
+                         GENX(MI_STORE_DATA_IMM_length),
+                         GENX(MI_STORE_DATA_IMM),
+                         .ImmediateData = true);
+      _gen_mi_resolve_address_token(&b, availability_addr_dest,
+                                    availability_dws +
+                                    GENX(MI_STORE_DATA_IMM_Address_start) / 8);
+      break;
+   }
+#endif
+
    case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
          pc.CommandStreamerStallEnable = true;
@@ -1039,6 +1328,12 @@ void genX(CmdCopyQueryPoolResults)(
          gpu_write_query_result(&b, dest_addr, flags, 0, result);
          break;
 
+#if GEN_GEN >= 8
+      case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
+         unreachable("Copy KHR performance query results not implemented");
+         break;
+#endif
+
       default:
          unreachable("unhandled query type");
       }