anv: implement VK_INTEL_performance_query
author Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Thu, 7 Jun 2018 17:02:03 +0000 (18:02 +0100)
committer Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Wed, 23 Oct 2019 05:41:15 +0000 (05:41 +0000)
v2: Introduce the appropriate pipe controls
    Properly deal with changes in metric sets (using execbuf parameter)
    Record marker at query end

v3: Fill out PerfCntr1&2

v4: Introduce vkUninitializePerformanceApiINTEL

v5: Use new execbuf extension mechanism

v6: Fix comments in genX_query.c (Rafael)
    Use PIPE_CONTROL workarounds (Rafael)
    Refactor on the last kernel series update (Lionel)

v7: Only I915_PERF_IOCTL_CONFIG when perf stream is already opened (Lionel)

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Rafael Antognolli <rafael.antognolli@intel.com>
src/intel/Android.vulkan.mk
src/intel/Makefile.sources
src/intel/vulkan/anv_device.c
src/intel/vulkan/anv_extensions.py
src/intel/vulkan/anv_perf.c [new file with mode: 0644]
src/intel/vulkan/anv_private.h
src/intel/vulkan/genX_cmd_buffer.c
src/intel/vulkan/genX_query.c
src/intel/vulkan/meson.build

index 468ddfc65f88dc94b5faea8647eb3ca95bbc6429..134f4183eec1939899d50c5f2b1dc67fd28de991 100644 (file)
@@ -305,6 +305,7 @@ LOCAL_WHOLE_STATIC_LIBRARIES := \
        libmesa_compiler \
        libmesa_intel_common \
        libmesa_intel_dev \
+       libmesa_intel_perf \
        libmesa_vulkan_common \
        libmesa_vulkan_util \
        libmesa_anv_gen7 \
index b65cc934350cb3fec760e268b2b5d2becc17717e..4900dd56bd282af23ef5de5e9359d6194e7d4e12 100644 (file)
@@ -259,6 +259,7 @@ VULKAN_FILES := \
        vulkan/anv_nir_lower_push_constants.c \
        vulkan/anv_nir_lower_ycbcr_textures.c \
        vulkan/anv_pass.c \
+       vulkan/anv_perf.c \
        vulkan/anv_pipeline.c \
        vulkan/anv_pipeline_cache.c \
        vulkan/anv_private.h \
index 8934957e39bd79433f924bc8a127f17b6293886c..9730e027392e124f0b961b8966c06ba4fb2ab3d3 100644 (file)
@@ -604,6 +604,8 @@ anv_physical_device_init(struct anv_physical_device *device,
       goto fail;
    }
 
+   device->perf = anv_get_perf(&device->info, fd);
+
    anv_physical_device_get_supported_extensions(device,
                                                 &device->supported_extensions);
 
@@ -625,6 +627,7 @@ anv_physical_device_finish(struct anv_physical_device *device)
    anv_finish_wsi(device);
    anv_physical_device_free_disk_cache(device);
    ralloc_free(device->compiler);
+   ralloc_free(device->perf);
    close(device->local_fd);
    if (device->master_fd >= 0)
       close(device->master_fd);
@@ -2657,6 +2660,8 @@ VkResult anv_CreateDevice(
 
    anv_device_init_border_colors(device);
 
+   anv_device_perf_init(device);
+
    *pDevice = anv_device_to_handle(device);
 
    return VK_SUCCESS;
index 84284398b6a34f47aeb77cec193a0a3171cf1ae0..c72c23530f873882a1ed22898e6206ea0af23c54 100644 (file)
@@ -165,6 +165,7 @@ EXTENSIONS = [
     Extension('VK_ANDROID_native_buffer',                 7, 'ANDROID'),
     Extension('VK_GOOGLE_decorate_string',                1, True),
     Extension('VK_GOOGLE_hlsl_functionality1',            1, True),
+    Extension('VK_INTEL_performance_query',               1, 'device->perf'),
     Extension('VK_NV_compute_shader_derivatives',         1, True),
 ]
 
diff --git a/src/intel/vulkan/anv_perf.c b/src/intel/vulkan/anv_perf.c
new file mode 100644 (file)
index 0000000..6a9fb4f
--- /dev/null
@@ -0,0 +1,224 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "anv_private.h"
+
+#include "perf/gen_perf.h"
+#include "perf/gen_perf_mdapi.h"
+
+/* Build the perf configuration for a physical device.
+ *
+ * Creates a gen_perf_config, initializes its metrics for devinfo through
+ * the DRM fd, and returns it only when the kernel reports a sufficient
+ * i915-perf revision. Otherwise the config is freed and NULL is returned.
+ */
+struct gen_perf_config *
+anv_get_perf(const struct gen_device_info *devinfo, int fd)
+{
+   struct gen_perf_config *perf = gen_perf_new(NULL);
+
+   gen_perf_init_metrics(perf, devinfo, fd);
+
+   /* We need DRM_I915_PERF_PROP_HOLD_PREEMPTION support, which is only
+    * available starting with perf revision 3.
+    */
+   if (anv_gem_get_param(fd, I915_PARAM_PERF_REVISION) < 3)
+      goto err;
+
+   return perf;
+
+ err:
+   ralloc_free(perf);
+   return NULL;
+}
+
+/* Mark the device as having no i915-perf stream open. The stream is opened
+ * lazily by anv_QueueSetPerformanceConfigurationINTEL().
+ */
+void
+anv_device_perf_init(struct anv_device *device)
+{
+   device->perf_fd = -1;
+}
+
+/* Open an i915-perf stream on the device's context.
+ *
+ * The stream samples OA reports for the metric set metric_id, using the OA
+ * report format that matches the hardware generation, and asks the kernel to
+ * hold preemption.
+ *
+ * Returns the result of DRM_IOCTL_I915_PERF_OPEN unchanged; callers treat a
+ * negative value as failure and anything else as the stream fd.
+ */
+static int
+anv_device_perf_open(struct anv_device *device, uint64_t metric_id)
+{
+   const uint64_t oa_format = device->info.gen >= 8 ?
+      I915_OA_FORMAT_A32u40_A4u32_B8_C8 :
+      I915_OA_FORMAT_A45_B8_C8;
+
+   /* Flat list of (property, value) pairs handed to the kernel. */
+   const uint64_t props[][2] = {
+      { DRM_I915_PERF_PROP_SAMPLE_OA,       true },
+      { DRM_I915_PERF_PROP_OA_METRICS_SET,  metric_id },
+      { DRM_I915_PERF_PROP_OA_FORMAT,       oa_format },
+      { DRM_I915_PERF_PROP_OA_EXPONENT,     31 /* slowest sampling period */ },
+      { DRM_I915_PERF_PROP_CTX_HANDLE,      device->context_id },
+      { DRM_I915_PERF_PROP_HOLD_PREEMPTION, true },
+   };
+
+   struct drm_i915_perf_open_param param = {
+      .flags = I915_PERF_FLAG_FD_CLOEXEC | I915_PERF_FLAG_FD_NONBLOCK,
+      .num_properties = sizeof(props) / sizeof(props[0]),
+      .properties_ptr = (uintptr_t)props,
+   };
+
+   return gen_ioctl(device->fd, DRM_IOCTL_I915_PERF_OPEN, &param);
+}
+
+/* vkInitializePerformanceApiINTEL entry point.
+ *
+ * There is no state to set up; the call only reports whether the physical
+ * device has a perf configuration.
+ */
+VkResult anv_InitializePerformanceApiINTEL(
+    VkDevice                                    _device,
+    const VkInitializePerformanceApiInfoINTEL*  pInitializeInfo)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   const struct anv_physical_device *pdevice = &device->instance->physicalDevice;
+
+   /* Not much to do here */
+   return pdevice->perf ? VK_SUCCESS : VK_ERROR_EXTENSION_NOT_PRESENT;
+}
+
+/* vkGetPerformanceParameterINTEL entry point.
+ *
+ * Fills *pValue for the parameters this driver knows about and returns
+ * VK_SUCCESS. Unknown parameters yield VK_ERROR_FEATURE_NOT_PRESENT, and a
+ * physical device without a perf configuration yields
+ * VK_ERROR_EXTENSION_NOT_PRESENT; *pValue is left untouched in both cases.
+ */
+VkResult anv_GetPerformanceParameterINTEL(
+    VkDevice                                    _device,
+    VkPerformanceParameterTypeINTEL             parameter,
+    VkPerformanceValueINTEL*                    pValue)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   const struct anv_physical_device *pdevice = &device->instance->physicalDevice;
+
+   if (pdevice->perf == NULL)
+      return VK_ERROR_EXTENSION_NOT_PRESENT;
+
+   switch (parameter) {
+   case VK_PERFORMANCE_PARAMETER_TYPE_HW_COUNTERS_SUPPORTED_INTEL:
+      pValue->type = VK_PERFORMANCE_VALUE_TYPE_BOOL_INTEL;
+      pValue->data.valueBool = VK_TRUE;
+      return VK_SUCCESS;
+
+   case VK_PERFORMANCE_PARAMETER_TYPE_STREAM_MARKER_VALID_BITS_INTEL:
+      pValue->type = VK_PERFORMANCE_VALUE_TYPE_UINT32_INTEL;
+      pValue->data.value32 = 25;
+      return VK_SUCCESS;
+
+   default:
+      return VK_ERROR_FEATURE_NOT_PRESENT;
+   }
+}
+
+/* vkCmdSetPerformanceMarkerINTEL entry point.
+ *
+ * Nothing is emitted into the batch here: the marker value is only stored
+ * on the command buffer so that later query commands can pick it up.
+ * Always returns VK_SUCCESS.
+ */
+VkResult anv_CmdSetPerformanceMarkerINTEL(
+    VkCommandBuffer                             commandBuffer,
+    const VkPerformanceMarkerInfoINTEL*         pMarkerInfo)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+
+   cmd_buffer->intel_perf_marker = pMarkerInfo->marker;
+
+   return VK_SUCCESS;
+}
+
+/* vkAcquirePerformanceConfigurationINTEL entry point.
+ *
+ * Loads the MDAPI register configuration, stores it through
+ * gen_perf_store_configuration() and hands the resulting id back to the
+ * application as the configuration handle.
+ *
+ * Returns VK_SUCCESS with *pConfiguration written, or VK_INCOMPLETE with
+ * *pConfiguration untouched when loading or storing fails.
+ *
+ * NOTE(review): VK_INCOMPLETE is a positive (non-error) VkResult, so a
+ * caller that only checks for negative results will not notice the failure
+ * - confirm whether an error code was intended.
+ * NOTE(review): the loaded registers are only freed on the store-failure
+ * path; confirm who owns them after a successful store.
+ */
+VkResult anv_AcquirePerformanceConfigurationINTEL(
+    VkDevice                                    _device,
+    const VkPerformanceConfigurationAcquireInfoINTEL* pAcquireInfo,
+    VkPerformanceConfigurationINTEL*            pConfiguration)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   const struct anv_physical_device *pdevice = &device->instance->physicalDevice;
+
+   struct gen_perf_registers *registers =
+      gen_perf_load_configuration(pdevice->perf, device->fd,
+                                  GEN_PERF_QUERY_GUID_MDAPI);
+   if (registers == NULL)
+      return VK_INCOMPLETE;
+
+   const int config_id =
+      gen_perf_store_configuration(pdevice->perf, device->fd,
+                                   registers, NULL /* guid */);
+   if (config_id >= 0) {
+      *pConfiguration = (VkPerformanceConfigurationINTEL) (uint64_t) config_id;
+      return VK_SUCCESS;
+   }
+
+   ralloc_free(registers);
+   return VK_INCOMPLETE;
+}
+
+/* vkReleasePerformanceConfigurationINTEL entry point.
+ *
+ * Asks the kernel to remove the configuration whose id is carried in the
+ * handle. The ioctl result is not checked; the call always returns
+ * VK_SUCCESS.
+ */
+VkResult anv_ReleasePerformanceConfigurationINTEL(
+    VkDevice                                    _device,
+    VkPerformanceConfigurationINTEL             _configuration)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   /* The handle is the config id itself (see the Acquire entry point). */
+   uint64_t config = (uint64_t) _configuration;
+
+   gen_ioctl(device->fd, DRM_IOCTL_I915_PERF_REMOVE_CONFIG, &config);
+
+   return VK_SUCCESS;
+}
+
+/* vkQueueSetPerformanceConfigurationINTEL entry point.
+ *
+ * The first call on a device opens the i915-perf stream directly on the
+ * requested configuration; later calls switch the already-open stream to
+ * the new configuration.
+ *
+ * Returns VK_SUCCESS, VK_ERROR_INITIALIZATION_FAILED when the stream cannot
+ * be opened, or the result of anv_device_set_lost() when reconfiguring an
+ * open stream fails.
+ */
+VkResult anv_QueueSetPerformanceConfigurationINTEL(
+    VkQueue                                     _queue,
+    VkPerformanceConfigurationINTEL             _configuration)
+{
+   ANV_FROM_HANDLE(anv_queue, queue, _queue);
+   struct anv_device *device = queue->device;
+   uint64_t configuration = (uint64_t) _configuration;
+
+   if (device->perf_fd < 0) {
+      device->perf_fd = anv_device_perf_open(device, configuration);
+      if (device->perf_fd < 0)
+         return VK_ERROR_INITIALIZATION_FAILED;
+   } else {
+      int ret = gen_ioctl(device->perf_fd, I915_PERF_IOCTL_CONFIG,
+                          (void *)(uintptr_t) _configuration);
+      if (ret < 0) {
+         /* A failing ioctl reports its reason through errno; the return
+          * value is just a failure flag and is not a valid argument for
+          * strerror().
+          */
+         return anv_device_set_lost(device,
+                                    "i915-perf config failed: %s",
+                                    strerror(errno));
+      }
+   }
+
+   return VK_SUCCESS;
+}
+
+/* vkUninitializePerformanceApiINTEL entry point.
+ *
+ * Closes the i915-perf stream if one was opened and resets perf_fd to the
+ * "not opened" value; does nothing otherwise.
+ */
+void anv_UninitializePerformanceApiINTEL(
+    VkDevice                                    _device)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+
+   if (device->perf_fd < 0)
+      return;
+
+   close(device->perf_fd);
+   device->perf_fd = -1;
+}
index 3aa6d1922f955c88682900c47467aa95c8ffb20a..aa1f2cbea87e5bbdad4bc7f3c658c423a5b7ecd1 100644 (file)
@@ -74,6 +74,7 @@ struct anv_image_view;
 struct anv_instance;
 
 struct gen_l3_config;
+struct gen_perf_config;
 
 #include <vulkan/vulkan.h>
 #include <vulkan/vulkan_intel.h>
@@ -948,6 +949,7 @@ struct anv_physical_device {
     bool                                        supports_48bit_addresses;
     struct brw_compiler *                       compiler;
     struct isl_device                           isl_dev;
+    struct gen_perf_config *                    perf;
     int                                         cmd_parser_version;
     bool                                        has_exec_async;
     bool                                        has_exec_capture;
@@ -1169,6 +1171,9 @@ struct anv_device {
      * the cmd_buffer's list.
      */
     struct anv_cmd_buffer                      *cmd_buffer_being_decoded;
+
+    int                                         perf_fd; /* -1 if not opened */
+    uint64_t                                    perf_metric; /* 0 if unset */
 };
 
 static inline struct anv_state_pool *
@@ -2530,6 +2535,9 @@ struct anv_cmd_buffer {
    VkCommandBufferLevel                         level;
 
    struct anv_cmd_state                         state;
+
+   /* Set by CmdSetPerformanceMarkerINTEL, written into queries by CmdEndQuery */
+   uint64_t                                     intel_perf_marker;
 };
 
 VkResult anv_cmd_buffer_init_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer);
@@ -3750,6 +3758,9 @@ anv_get_subpass_id(const struct anv_cmd_state * const cmd_state)
    return subpass_id;
 }
 
+struct gen_perf_config *anv_get_perf(const struct gen_device_info *devinfo, int fd);
+void anv_device_perf_init(struct anv_device *device);
+
 #define ANV_DEFINE_HANDLE_CASTS(__anv_type, __VkType)                      \
                                                                            \
    static inline struct __anv_type *                                       \
index dd0f6e206813c2563dd257c2183cc95f84d21592..ff9c6c79eb9af626917fa03f7a21782b21a8191b 100644 (file)
@@ -5091,3 +5091,57 @@ void genX(CmdWaitEvents)(
                             bufferMemoryBarrierCount, pBufferMemoryBarriers,
                             imageMemoryBarrierCount, pImageMemoryBarriers);
 }
+
+/* vkCmdSetPerformanceOverrideINTEL entry point.
+ *
+ * Handles the two override types of the extension:
+ *  - NULL_HARDWARE: packs a register value whose 3D-rendering and media
+ *    "instruction disable" fields follow pOverrideInfo->enable and emits it
+ *    into the batch with emit_lri().
+ *  - FLUSH_GPU_CACHES: when enabled, queues all flush and invalidate bits
+ *    and applies them immediately.
+ *
+ * Always returns VK_SUCCESS; any other override type is treated as
+ * unreachable.
+ */
+VkResult genX(CmdSetPerformanceOverrideINTEL)(
+    VkCommandBuffer                             commandBuffer,
+    const VkPerformanceOverrideInfoINTEL*       pOverrideInfo)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+
+   switch (pOverrideInfo->type) {
+   case VK_PERFORMANCE_OVERRIDE_TYPE_NULL_HARDWARE_INTEL: {
+      uint32_t dw;
+
+      /* The same pair of disable fields is written through CS_DEBUG_MODE2
+       * on gen9+ and through INSTPM on older generations. The *Mask fields
+       * are always set alongside the value fields.
+       */
+#if GEN_GEN >= 9
+      anv_pack_struct(&dw, GENX(CS_DEBUG_MODE2),
+                      ._3DRenderingInstructionDisable = pOverrideInfo->enable,
+                      .MediaInstructionDisable = pOverrideInfo->enable,
+                      ._3DRenderingInstructionDisableMask = true,
+                      .MediaInstructionDisableMask = true);
+      emit_lri(&cmd_buffer->batch, GENX(CS_DEBUG_MODE2_num), dw);
+#else
+      anv_pack_struct(&dw, GENX(INSTPM),
+                      ._3DRenderingInstructionDisable = pOverrideInfo->enable,
+                      .MediaInstructionDisable = pOverrideInfo->enable,
+                      ._3DRenderingInstructionDisableMask = true,
+                      .MediaInstructionDisableMask = true);
+      emit_lri(&cmd_buffer->batch, GENX(INSTPM_num), dw);
+#endif
+      break;
+   }
+
+   case VK_PERFORMANCE_OVERRIDE_TYPE_FLUSH_GPU_CACHES_INTEL:
+      /* Disabling this override emits nothing. */
+      if (pOverrideInfo->enable) {
+         /* FLUSH ALL THE THINGS! As requested by the MDAPI team. */
+         cmd_buffer->state.pending_pipe_bits |=
+            ANV_PIPE_FLUSH_BITS |
+            ANV_PIPE_INVALIDATE_BITS;
+         genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
+      }
+      break;
+
+   default:
+      unreachable("Invalid override");
+   }
+
+   return VK_SUCCESS;
+}
+
+/* vkCmdSetPerformanceStreamMarkerINTEL entry point.
+ *
+ * Currently a stub: no command is emitted and pMarkerInfo is ignored. The
+ * call always returns VK_SUCCESS.
+ */
+VkResult genX(CmdSetPerformanceStreamMarkerINTEL)(
+    VkCommandBuffer                             commandBuffer,
+    const VkPerformanceStreamMarkerInfoINTEL*   pMarkerInfo)
+{
+   /* TODO: Waiting on the register to write, might depend on generation. */
+
+   return VK_SUCCESS;
+}
index aa0cf8b947118c3dbee6ad7ba0faf27959a19ae6..b3090f20545179000993e2ed353e4c98a3ec1236 100644 (file)
 #define __gen_get_batch_dwords anv_batch_emit_dwords
 #define __gen_address_offset anv_address_add
 #include "common/gen_mi_builder.h"
+#include "perf/gen_perf.h"
+#include "perf/gen_perf_mdapi.h"
+
+#define OA_REPORT_N_UINT64 (256 / sizeof(uint64_t))
 
 VkResult genX(CreateQueryPool)(
     VkDevice                                    _device,
@@ -52,9 +56,14 @@ VkResult genX(CreateQueryPool)(
    assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
 
    /* Query pool slots are made up of some number of 64-bit values packed
-    * tightly together.  The first 64-bit value is always the "available" bit
-    * which is 0 when the query is unavailable and 1 when it is available.
-    * The 64-bit values that follow are determined by the type of query.
+    * tightly together. For most query types the first 64-bit value is
+    * the "available" bit which is 0 when the query is unavailable and 1 when
+    * it is available. The 64-bit values that follow are determined by the
+    * type of query.
+    *
+    * For performance queries, we have a requirement to align OA reports at
+    * 64bytes so we put those first and have the "available" bit behind
+    * together with some other counters.
     */
    uint32_t uint64s_per_slot = 1;
 
@@ -84,6 +93,15 @@ VkResult genX(CreateQueryPool)(
        */
       uint64s_per_slot += 4;
       break;
+   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
+      uint64s_per_slot = 2 * OA_REPORT_N_UINT64; /* begin & end OA reports */
+      uint64s_per_slot += 4; /* PerfCounter 1 & 2 */
+      uint64s_per_slot++; /* 2 * 32bit RPSTAT register */
+      uint64s_per_slot++; /* 64bit marker */
+      uint64s_per_slot++; /* availability */
+      uint64s_per_slot = align_u32(uint64s_per_slot, 8); /* OA reports must be aligned to 64 bytes */
+      break;
+   }
    default:
       assert(!"Invalid query type");
    }
@@ -160,6 +178,57 @@ anv_query_address(struct anv_query_pool *pool, uint32_t query)
    };
 }
 
+/**
+ * VK_INTEL_performance_query layout:
+ *
+ * ------------------------------
+ * |       end MI_RPC (256b)    |
+ * |----------------------------|
+ * |     begin MI_RPC (256b)    |
+ * |----------------------------|
+ * | begin perfcntr 1 & 2 (16b) |
+ * |----------------------------|
+ * |  end perfcntr 1 & 2 (16b)  |
+ * |----------------------------|
+ * | begin RPSTAT register (4b) |
+ * |----------------------------|
+ * |  end RPSTAT register (4b)  |
+ * |----------------------------|
+ * |         marker (8b)        |
+ * |----------------------------|
+ * |       availability (8b)    |
+ * ------------------------------
+ */
+
+/* Byte offset, inside a performance query slot, of the MI_RPC report taken
+ * at query end (end == true) or at query begin (end == false). The end
+ * report comes first in the slot.
+ */
+static uint32_t
+intel_perf_mi_rpc_offset(bool end)
+{
+   if (end)
+      return 0;
+
+   return 256;
+}
+
+/* Byte offset, inside a performance query slot, of the perfcntr 1 & 2 pair
+ * captured at query begin (end == false) or query end (end == true). Each
+ * pair is two 64-bit values; the pairs follow the two 256-byte MI_RPC
+ * reports.
+ */
+static uint32_t
+intel_perf_counter(bool end)
+{
+   const uint32_t begin_offset = 512;
+
+   if (!end)
+      return begin_offset;
+
+   return begin_offset + 2 * sizeof(uint64_t);
+}
+
+/* Byte offset, inside a performance query slot, of the 32-bit RPSTAT
+ * register value captured at query begin (end == false) or query end
+ * (end == true). The two values sit right after the four 64-bit perfcntr
+ * values.
+ */
+static uint32_t
+intel_perf_rpstart_offset(bool end)
+{
+   const uint32_t begin_offset =
+      intel_perf_counter(false) + 4 * sizeof(uint64_t);
+
+   return end ? begin_offset + sizeof(uint32_t) : begin_offset;
+}
+
+/* Byte offset, inside a performance query slot, of the 64-bit marker: it
+ * follows the 8 bytes holding the begin and end RPSTAT values.
+ */
+static uint32_t
+intel_perf_marker_offset(void)
+{
+   return intel_perf_rpstart_offset(false) + sizeof(uint64_t);
+}
+
 static void
 cpu_write_query_result(void *dst_slot, VkQueryResultFlags flags,
                        uint32_t value_index, uint64_t result)
@@ -173,18 +242,28 @@ cpu_write_query_result(void *dst_slot, VkQueryResultFlags flags,
    }
 }
 
+/* CPU pointer to the start of the slot of query number `query` in the
+ * pool's mapped buffer; slots are pool->stride bytes apart.
+ */
+static void *
+query_slot(struct anv_query_pool *pool, uint32_t query)
+{
+   return pool->bo.map + query * pool->stride;
+}
+
 static bool
-query_is_available(uint64_t *slot)
+query_is_available(struct anv_query_pool *pool, uint32_t query)
 {
-   return *(volatile uint64_t *)slot;
+   if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL) {
+      return *(volatile uint64_t *)((uint8_t *)query_slot(pool, query) +
+                                    pool->stride - 8);
+   } else
+      return *(volatile uint64_t *)query_slot(pool, query);
 }
 
 static VkResult
 wait_for_available(struct anv_device *device,
-                   struct anv_query_pool *pool, uint64_t *slot)
+                   struct anv_query_pool *pool, uint32_t query)
 {
    while (true) {
-      if (query_is_available(slot))
+      if (query_is_available(pool, query))
          return VK_SUCCESS;
 
       int ret = anv_gem_busy(device, pool->bo.gem_handle);
@@ -197,7 +276,7 @@ wait_for_available(struct anv_device *device,
       } else {
          assert(ret == 0);
          /* The BO is no longer busy. */
-         if (query_is_available(slot)) {
+         if (query_is_available(pool, query)) {
             return VK_SUCCESS;
          } else {
             VkResult status = anv_device_query_status(device);
@@ -233,7 +312,8 @@ VkResult genX(GetQueryPoolResults)(
    assert(pool->type == VK_QUERY_TYPE_OCCLUSION ||
           pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
           pool->type == VK_QUERY_TYPE_TIMESTAMP ||
-          pool->type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT);
+          pool->type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT ||
+          pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL);
 
    if (anv_device_is_lost(device))
       return VK_ERROR_DEVICE_LOST;
@@ -245,13 +325,10 @@ VkResult genX(GetQueryPoolResults)(
 
    VkResult status = VK_SUCCESS;
    for (uint32_t i = 0; i < queryCount; i++) {
-      uint64_t *slot = pool->bo.map + (firstQuery + i) * pool->stride;
-
-      /* Availability is always at the start of the slot */
-      bool available = slot[0];
+      bool available = query_is_available(pool, firstQuery + i);
 
       if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) {
-         status = wait_for_available(device, pool, slot);
+         status = wait_for_available(device, pool, firstQuery + i);
          if (status != VK_SUCCESS)
             return status;
 
@@ -271,13 +348,16 @@ VkResult genX(GetQueryPoolResults)(
 
       uint32_t idx = 0;
       switch (pool->type) {
-      case VK_QUERY_TYPE_OCCLUSION:
+      case VK_QUERY_TYPE_OCCLUSION: {
+         uint64_t *slot = query_slot(pool, firstQuery + i);
          if (write_results)
             cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]);
          idx++;
          break;
+      }
 
       case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
+         uint64_t *slot = query_slot(pool, firstQuery + i);
          uint32_t statistics = pool->pipeline_statistics;
          while (statistics) {
             uint32_t stat = u_bit_scan(&statistics);
@@ -297,7 +377,8 @@ VkResult genX(GetQueryPoolResults)(
          break;
       }
 
-      case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
+      case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
+         uint64_t *slot = query_slot(pool, firstQuery + i);
          if (write_results)
             cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]);
          idx++;
@@ -305,12 +386,54 @@ VkResult genX(GetQueryPoolResults)(
             cpu_write_query_result(pData, flags, idx, slot[4] - slot[3]);
          idx++;
          break;
+      }
 
-      case VK_QUERY_TYPE_TIMESTAMP:
+      case VK_QUERY_TYPE_TIMESTAMP: {
+         uint64_t *slot = query_slot(pool, firstQuery + i);
          if (write_results)
             cpu_write_query_result(pData, flags, idx, slot[1]);
          idx++;
          break;
+      }
+
+      /* Performance queries: decode the raw slot (begin/end OA reports,
+       * RPSTAT values, perfcntr pairs and marker) into the MDAPI result
+       * layout in pData. Nothing is written when write_results is false.
+       */
+      case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
+         if (!write_results)
+            break;
+         const void *query_data = query_slot(pool, firstQuery + i);
+         const uint32_t *oa_begin = query_data + intel_perf_mi_rpc_offset(false);
+         const uint32_t *oa_end = query_data + intel_perf_mi_rpc_offset(true);
+         const uint32_t *rpstat_begin = query_data + intel_perf_rpstart_offset(false);
+         /* The end RPSTAT value has its own location in the slot; it must
+          * not be read from the end OA report.
+          */
+         const uint32_t *rpstat_end = query_data + intel_perf_rpstart_offset(true);
+         struct gen_perf_query_result result;
+         struct gen_perf_query_info metric = {
+            .oa_format = (GEN_GEN >= 8 ?
+                          I915_OA_FORMAT_A32u40_A4u32_B8_C8 :
+                          I915_OA_FORMAT_A45_B8_C8),
+         };
+         /* Core frequency at query begin [0] and end [1], extracted from
+          * the generation-specific bit field of the RPSTAT value.
+          */
+         uint32_t core_freq[2];
+#if GEN_GEN < 9
+         core_freq[0] = ((*rpstat_begin >> 7) & 0x7f) * 1000000ULL;
+         core_freq[1] = ((*rpstat_end >> 7) & 0x7f) * 1000000ULL;
+#else
+         core_freq[0] = ((*rpstat_begin >> 23) & 0x1ff) * 1000000ULL;
+         core_freq[1] = ((*rpstat_end >> 23) & 0x1ff) * 1000000ULL;
+#endif
+         gen_perf_query_result_clear(&result);
+         gen_perf_query_result_accumulate(&result, &metric,
+                                          oa_begin, oa_end);
+         gen_perf_query_result_read_frequencies(&result, &device->info,
+                                                oa_begin, oa_end);
+         gen_perf_query_result_write_mdapi(pData, stride,
+                                           &device->info,
+                                           &result,
+                                           core_freq[0], core_freq[1]);
+         gen_perf_query_mdapi_write_perfcntr(pData, stride, &device->info,
+                                             query_data + intel_perf_counter(false),
+                                             query_data + intel_perf_counter(true));
+         const uint64_t *marker = query_data + intel_perf_marker_offset();
+         gen_perf_query_mdapi_write_marker(pData, stride, &device->info, *marker);
+         break;
+      }
 
       default:
          unreachable("invalid pool type");
@@ -406,6 +529,16 @@ emit_zero_queries(struct anv_cmd_buffer *cmd_buffer,
       }
       break;
 
+   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL:
+      for (uint32_t i = 0; i < num_queries; i++) {
+         struct anv_address slot_addr =
+            anv_query_address(pool, first_index + i);
+         gen_mi_memset(b, slot_addr, 0, pool->stride - 8);
+         emit_query_mi_availability(b, anv_address_add(slot_addr,
+                                                       pool->stride - 8), true);
+      }
+      break;
+
    default:
       unreachable("Unsupported query type");
    }
@@ -440,6 +573,21 @@ void genX(CmdResetQueryPool)(
       break;
    }
 
+   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
+      struct gen_mi_builder b;
+      gen_mi_builder_init(&b, &cmd_buffer->batch);
+
+      for (uint32_t i = 0; i < queryCount; i++) {
+         emit_query_mi_availability(
+            &b,
+            anv_address_add(
+               anv_query_address(pool, firstQuery + i),
+               pool->stride - 8),
+            false);
+      }
+      break;
+   }
+
    default:
       unreachable("Unsupported query type");
    }
@@ -550,6 +698,37 @@ void genX(CmdBeginQueryIndexedEXT)(
       emit_xfb_query(&b, index, anv_address_add(query_addr, 8));
       break;
 
+   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
+      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
+         pc.CommandStreamerStallEnable = true;
+         pc.StallAtPixelScoreboard = true;
+      }
+      anv_batch_emit(&cmd_buffer->batch, GENX(MI_REPORT_PERF_COUNT), rpc) {
+         rpc.MemoryAddress =
+            anv_address_add(query_addr, intel_perf_mi_rpc_offset(false));
+      }
+#if GEN_GEN < 9
+      gen_mi_store(&b,
+                   gen_mi_mem32(anv_address_add(query_addr,
+                                                intel_perf_rpstart_offset(false))),
+                   gen_mi_reg32(GENX(RPSTAT1_num)));
+#else
+      gen_mi_store(&b,
+                   gen_mi_mem32(anv_address_add(query_addr,
+                                                intel_perf_rpstart_offset(false))),
+                   gen_mi_reg32(GENX(RPSTAT0_num)));
+#endif
+#if GEN_GEN >= 8 && GEN_GEN <= 11
+      gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr,
+                                                    intel_perf_counter(false))),
+                   gen_mi_reg64(GENX(PERFCNT1_num)));
+      gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr,
+                                                    intel_perf_counter(false) + 8)),
+                   gen_mi_reg64(GENX(PERFCNT2_num)));
+#endif
+      break;
+   }
+
    default:
       unreachable("");
    }
@@ -611,6 +790,45 @@ void genX(CmdEndQueryIndexedEXT)(
       emit_query_mi_availability(&b, query_addr, true);
       break;
 
+   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
+      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
+         pc.CommandStreamerStallEnable = true;
+         pc.StallAtPixelScoreboard = true;
+      }
+      uint32_t marker_offset = intel_perf_marker_offset();
+      gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, marker_offset)),
+                   gen_mi_imm(cmd_buffer->intel_perf_marker));
+#if GEN_GEN >= 8 && GEN_GEN <= 11
+      gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, intel_perf_counter(true))),
+                   gen_mi_reg64(GENX(PERFCNT1_num)));
+      gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, intel_perf_counter(true) + 8)),
+                   gen_mi_reg64(GENX(PERFCNT2_num)));
+#endif
+#if GEN_GEN < 9
+      gen_mi_store(&b,
+                   gen_mi_mem32(anv_address_add(query_addr,
+                                                intel_perf_rpstart_offset(true))),
+                   gen_mi_reg32(GENX(RPSTAT1_num)));
+#else
+      gen_mi_store(&b,
+                   gen_mi_mem32(anv_address_add(query_addr,
+                                                intel_perf_rpstart_offset(true))),
+                   gen_mi_reg32(GENX(RPSTAT0_num)));
+#endif
+      /* Position the last OA snapshot at the beginning of the query so that
+       * we can tell whether it's ready.
+       */
+      anv_batch_emit(&cmd_buffer->batch, GENX(MI_REPORT_PERF_COUNT), rpc) {
+         rpc.MemoryAddress = anv_address_add(query_addr,
+                                             intel_perf_mi_rpc_offset(true));
+         rpc.ReportID = 0xdeadbeef; /* This goes in the first dword */
+      }
+      emit_query_mi_availability(&b,
+                                 anv_address_add(query_addr, pool->stride - 8),
+                                 true);
+      break;
+   }
+
    default:
       unreachable("");
    }
index e8db8f44de0c141b29e1304c3cf72dac89e88853..69e472f719b80a286d3058030f685c0984836c09 100644 (file)
@@ -118,6 +118,7 @@ libanv_files = files(
   'anv_nir_lower_push_constants.c',
   'anv_nir_lower_ycbcr_textures.c',
   'anv_pass.c',
+  'anv_perf.c',
   'anv_pipeline.c',
   'anv_pipeline_cache.c',
   'anv_private.h',
@@ -194,6 +195,7 @@ libvulkan_intel = shared_library(
   link_whole : [libanv_common, libanv_gen_libs],
   link_with : [
     libintel_compiler, libintel_dev, libisl, libblorp, libvulkan_wsi,
+    libintel_perf,
   ],
   dependencies : [
     dep_thread, dep_dl, dep_m, anv_deps, idep_libintel_common,
@@ -227,7 +229,7 @@ if with_tests
     link_whole : libanv_common,
     link_with : [
       libanv_gen_libs, libintel_compiler, libintel_common, libintel_dev,
-      libisl, libblorp, libvulkan_wsi,
+      libisl, libblorp, libvulkan_wsi, libintel_perf,
     ],
     dependencies : [
       dep_thread, dep_dl, dep_m, anv_deps,