From 2b5f30b1d91b98ab27ba21439cd8a40a0d1ece36 Mon Sep 17 00:00:00 2001 From: Lionel Landwerlin Date: Thu, 7 Jun 2018 18:02:03 +0100 Subject: [PATCH] anv: implement VK_INTEL_performance_query v2: Introduce the appropriate pipe controls Properly deal with changes in metric sets (using execbuf parameter) Record marker at query end v3: Fill out PerfCntr1&2 v4: Introduce vkUninitializePerformanceApiINTEL v5: Use new execbuf extension mechanism v6: Fix comments in genX_query.c (Rafael) Use PIPE_CONTROL workarounds (Rafael) Refactor on the last kernel series update (Lionel) v7: Only I915_PERF_IOCTL_CONFIG when perf stream is already opened (Lionel) Signed-off-by: Lionel Landwerlin Reviewed-by: Rafael Antognolli --- src/intel/Android.vulkan.mk | 1 + src/intel/Makefile.sources | 1 + src/intel/vulkan/anv_device.c | 5 + src/intel/vulkan/anv_extensions.py | 1 + src/intel/vulkan/anv_perf.c | 224 +++++++++++++++++++++++++ src/intel/vulkan/anv_private.h | 11 ++ src/intel/vulkan/genX_cmd_buffer.c | 54 +++++++ src/intel/vulkan/genX_query.c | 252 +++++++++++++++++++++++++++-- src/intel/vulkan/meson.build | 4 +- 9 files changed, 535 insertions(+), 18 deletions(-) create mode 100644 src/intel/vulkan/anv_perf.c diff --git a/src/intel/Android.vulkan.mk b/src/intel/Android.vulkan.mk index 468ddfc65f8..134f4183eec 100644 --- a/src/intel/Android.vulkan.mk +++ b/src/intel/Android.vulkan.mk @@ -305,6 +305,7 @@ LOCAL_WHOLE_STATIC_LIBRARIES := \ libmesa_compiler \ libmesa_intel_common \ libmesa_intel_dev \ + libmesa_intel_perf \ libmesa_vulkan_common \ libmesa_vulkan_util \ libmesa_anv_gen7 \ diff --git a/src/intel/Makefile.sources b/src/intel/Makefile.sources index b65cc934350..4900dd56bd2 100644 --- a/src/intel/Makefile.sources +++ b/src/intel/Makefile.sources @@ -259,6 +259,7 @@ VULKAN_FILES := \ vulkan/anv_nir_lower_push_constants.c \ vulkan/anv_nir_lower_ycbcr_textures.c \ vulkan/anv_pass.c \ + vulkan/anv_perf.c \ vulkan/anv_pipeline.c \ vulkan/anv_pipeline_cache.c \ vulkan/anv_private.h 
\ diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c index 8934957e39b..9730e027392 100644 --- a/src/intel/vulkan/anv_device.c +++ b/src/intel/vulkan/anv_device.c @@ -604,6 +604,8 @@ anv_physical_device_init(struct anv_physical_device *device, goto fail; } + device->perf = anv_get_perf(&device->info, fd); + anv_physical_device_get_supported_extensions(device, &device->supported_extensions); @@ -625,6 +627,7 @@ anv_physical_device_finish(struct anv_physical_device *device) anv_finish_wsi(device); anv_physical_device_free_disk_cache(device); ralloc_free(device->compiler); + ralloc_free(device->perf); close(device->local_fd); if (device->master_fd >= 0) close(device->master_fd); @@ -2657,6 +2660,8 @@ VkResult anv_CreateDevice( anv_device_init_border_colors(device); + anv_device_perf_init(device); + *pDevice = anv_device_to_handle(device); return VK_SUCCESS; diff --git a/src/intel/vulkan/anv_extensions.py b/src/intel/vulkan/anv_extensions.py index 84284398b6a..c72c23530f8 100644 --- a/src/intel/vulkan/anv_extensions.py +++ b/src/intel/vulkan/anv_extensions.py @@ -165,6 +165,7 @@ EXTENSIONS = [ Extension('VK_ANDROID_native_buffer', 7, 'ANDROID'), Extension('VK_GOOGLE_decorate_string', 1, True), Extension('VK_GOOGLE_hlsl_functionality1', 1, True), + Extension('VK_INTEL_performance_query', 1, 'device->perf'), Extension('VK_NV_compute_shader_derivatives', 1, True), ] diff --git a/src/intel/vulkan/anv_perf.c b/src/intel/vulkan/anv_perf.c new file mode 100644 index 00000000000..6a9fb4f6f11 --- /dev/null +++ b/src/intel/vulkan/anv_perf.c @@ -0,0 +1,224 @@ +/* + * Copyright © 2018 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to 
permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#include <assert.h> +#include <stdbool.h> +#include <stdint.h> + +#include "anv_private.h" + +#include "perf/gen_perf.h" +#include "perf/gen_perf_mdapi.h" + +struct gen_perf_config * +anv_get_perf(const struct gen_device_info *devinfo, int fd) +{ + struct gen_perf_config *perf = gen_perf_new(NULL); + + gen_perf_init_metrics(perf, devinfo, fd); + + /* We need DRM_I915_PERF_PROP_HOLD_PREEMPTION support, only available + * starting with perf revision 3. + */ + if (anv_gem_get_param(fd, I915_PARAM_PERF_REVISION) < 3) + goto err; + + return perf; + + err: + ralloc_free(perf); + return NULL; +} + +void +anv_device_perf_init(struct anv_device *device) +{ + device->perf_fd = -1; +} + +static int +anv_device_perf_open(struct anv_device *device, uint64_t metric_id) +{ + uint64_t properties[DRM_I915_PERF_PROP_MAX * 2]; + struct drm_i915_perf_open_param param; + int p = 0, stream_fd; + + properties[p++] = DRM_I915_PERF_PROP_SAMPLE_OA; + properties[p++] = true; + + properties[p++] = DRM_I915_PERF_PROP_OA_METRICS_SET; + properties[p++] = metric_id; + + properties[p++] = DRM_I915_PERF_PROP_OA_FORMAT; + properties[p++] = device->info.gen >= 8 ? 
+ I915_OA_FORMAT_A32u40_A4u32_B8_C8 : + I915_OA_FORMAT_A45_B8_C8; + + properties[p++] = DRM_I915_PERF_PROP_OA_EXPONENT; + properties[p++] = 31; /* slowest sampling period */ + + properties[p++] = DRM_I915_PERF_PROP_CTX_HANDLE; + properties[p++] = device->context_id; + + properties[p++] = DRM_I915_PERF_PROP_HOLD_PREEMPTION; + properties[p++] = true; + + memset(&param, 0, sizeof(param)); + param.flags = 0; + param.flags |= I915_PERF_FLAG_FD_CLOEXEC | I915_PERF_FLAG_FD_NONBLOCK; + param.properties_ptr = (uintptr_t)properties; + param.num_properties = p / 2; + + stream_fd = gen_ioctl(device->fd, DRM_IOCTL_I915_PERF_OPEN, &param); + return stream_fd; +} + +VkResult anv_InitializePerformanceApiINTEL( + VkDevice _device, + const VkInitializePerformanceApiInfoINTEL* pInitializeInfo) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + const struct anv_physical_device *pdevice = &device->instance->physicalDevice; + + if (!pdevice->perf) + return VK_ERROR_EXTENSION_NOT_PRESENT; + + /* Not much to do here */ + return VK_SUCCESS; +} + +VkResult anv_GetPerformanceParameterINTEL( + VkDevice _device, + VkPerformanceParameterTypeINTEL parameter, + VkPerformanceValueINTEL* pValue) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + const struct anv_physical_device *pdevice = &device->instance->physicalDevice; + + if (!pdevice->perf) + return VK_ERROR_EXTENSION_NOT_PRESENT; + + VkResult result = VK_SUCCESS; + switch (parameter) { + case VK_PERFORMANCE_PARAMETER_TYPE_HW_COUNTERS_SUPPORTED_INTEL: + pValue->type = VK_PERFORMANCE_VALUE_TYPE_BOOL_INTEL; + pValue->data.valueBool = VK_TRUE; + break; + + case VK_PERFORMANCE_PARAMETER_TYPE_STREAM_MARKER_VALID_BITS_INTEL: + pValue->type = VK_PERFORMANCE_VALUE_TYPE_UINT32_INTEL; + pValue->data.value32 = 25; + break; + + default: + result = VK_ERROR_FEATURE_NOT_PRESENT; + break; + } + + return result; +} + +VkResult anv_CmdSetPerformanceMarkerINTEL( + VkCommandBuffer commandBuffer, + const VkPerformanceMarkerInfoINTEL* pMarkerInfo) +{ + 
ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + + cmd_buffer->intel_perf_marker = pMarkerInfo->marker; + + return VK_SUCCESS; +} + +VkResult anv_AcquirePerformanceConfigurationINTEL( + VkDevice _device, + const VkPerformanceConfigurationAcquireInfoINTEL* pAcquireInfo, + VkPerformanceConfigurationINTEL* pConfiguration) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + const struct anv_physical_device *pdevice = &device->instance->physicalDevice; + + struct gen_perf_registers *perf_config = + gen_perf_load_configuration(pdevice->perf, device->fd, + GEN_PERF_QUERY_GUID_MDAPI); + if (!perf_config) + return VK_INCOMPLETE; + + int ret = gen_perf_store_configuration(pdevice->perf, device->fd, + perf_config, NULL /* guid */); + if (ret < 0) { + ralloc_free(perf_config); + return VK_INCOMPLETE; + } + + *pConfiguration = (VkPerformanceConfigurationINTEL) (uint64_t) ret; + + return VK_SUCCESS; +} + +VkResult anv_ReleasePerformanceConfigurationINTEL( + VkDevice _device, + VkPerformanceConfigurationINTEL _configuration) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + uint64_t config = (uint64_t) _configuration; + + gen_ioctl(device->fd, DRM_IOCTL_I915_PERF_REMOVE_CONFIG, &config); + + return VK_SUCCESS; +} + +VkResult anv_QueueSetPerformanceConfigurationINTEL( + VkQueue _queue, + VkPerformanceConfigurationINTEL _configuration) +{ + ANV_FROM_HANDLE(anv_queue, queue, _queue); + struct anv_device *device = queue->device; + uint64_t configuration = (uint64_t) _configuration; + + if (device->perf_fd < 0) { + device->perf_fd = anv_device_perf_open(device, configuration); + if (device->perf_fd < 0) + return VK_ERROR_INITIALIZATION_FAILED; + } else { + int ret = gen_ioctl(device->perf_fd, I915_PERF_IOCTL_CONFIG, + (void *)(uintptr_t) _configuration); + if (ret < 0) { + /* The ioctl returns -1 and reports the cause in errno (%m) */ + return anv_device_set_lost(device, + "i915-perf config failed: %m"); + } + } + + return VK_SUCCESS; +} + +void anv_UninitializePerformanceApiINTEL( + VkDevice _device) +{ + 
ANV_FROM_HANDLE(anv_device, device, _device); + + if (device->perf_fd >= 0) { + close(device->perf_fd); + device->perf_fd = -1; + } +} diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index 3aa6d1922f9..aa1f2cbea87 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -74,6 +74,7 @@ struct anv_image_view; struct anv_instance; struct gen_l3_config; +struct gen_perf_config; #include <vulkan/vulkan.h> #include <vulkan/vulkan_intel.h> @@ -948,6 +949,7 @@ struct anv_physical_device { bool supports_48bit_addresses; struct brw_compiler * compiler; struct isl_device isl_dev; + struct gen_perf_config * perf; int cmd_parser_version; bool has_exec_async; bool has_exec_capture; @@ -1169,6 +1171,9 @@ struct anv_device { * the cmd_buffer's list. */ struct anv_cmd_buffer *cmd_buffer_being_decoded; + + int perf_fd; /* -1 if not opened */ + uint64_t perf_metric; /* 0 if unset */ }; static inline struct anv_state_pool * @@ -2530,6 +2535,9 @@ struct anv_cmd_buffer { VkCommandBufferLevel level; struct anv_cmd_state state; + + /* Set by SetPerformanceMarkerINTEL, written into queries by CmdEndQuery */ + uint64_t intel_perf_marker; }; VkResult anv_cmd_buffer_init_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer); @@ -3750,6 +3758,9 @@ anv_get_subpass_id(const struct anv_cmd_state * const cmd_state) return subpass_id; } +struct gen_perf_config *anv_get_perf(const struct gen_device_info *devinfo, int fd); +void anv_device_perf_init(struct anv_device *device); + #define ANV_DEFINE_HANDLE_CASTS(__anv_type, __VkType) \ \ static inline struct __anv_type * \ diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index dd0f6e20681..ff9c6c79eb9 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -5091,3 +5091,57 @@ void genX(CmdWaitEvents)( bufferMemoryBarrierCount, pBufferMemoryBarriers, imageMemoryBarrierCount, pImageMemoryBarriers); } + +VkResult genX(CmdSetPerformanceOverrideINTEL)( + VkCommandBuffer 
commandBuffer, + const VkPerformanceOverrideInfoINTEL* pOverrideInfo) +{ + ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); + + switch (pOverrideInfo->type) { + case VK_PERFORMANCE_OVERRIDE_TYPE_NULL_HARDWARE_INTEL: { + uint32_t dw; + +#if GEN_GEN >= 9 + anv_pack_struct(&dw, GENX(CS_DEBUG_MODE2), + ._3DRenderingInstructionDisable = pOverrideInfo->enable, + .MediaInstructionDisable = pOverrideInfo->enable, + ._3DRenderingInstructionDisableMask = true, + .MediaInstructionDisableMask = true); + emit_lri(&cmd_buffer->batch, GENX(CS_DEBUG_MODE2_num), dw); +#else + anv_pack_struct(&dw, GENX(INSTPM), + ._3DRenderingInstructionDisable = pOverrideInfo->enable, + .MediaInstructionDisable = pOverrideInfo->enable, + ._3DRenderingInstructionDisableMask = true, + .MediaInstructionDisableMask = true); + emit_lri(&cmd_buffer->batch, GENX(INSTPM_num), dw); +#endif + break; + } + + case VK_PERFORMANCE_OVERRIDE_TYPE_FLUSH_GPU_CACHES_INTEL: + if (pOverrideInfo->enable) { + /* FLUSH ALL THE THINGS! As requested by the MDAPI team. */ + cmd_buffer->state.pending_pipe_bits |= + ANV_PIPE_FLUSH_BITS | + ANV_PIPE_INVALIDATE_BITS; + genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); + } + break; + + default: + unreachable("Invalid override"); + } + + return VK_SUCCESS; +} + +VkResult genX(CmdSetPerformanceStreamMarkerINTEL)( + VkCommandBuffer commandBuffer, + const VkPerformanceStreamMarkerInfoINTEL* pMarkerInfo) +{ + /* TODO: Waiting on the register to write, might depend on generation. 
*/ + + return VK_SUCCESS; +} diff --git a/src/intel/vulkan/genX_query.c b/src/intel/vulkan/genX_query.c index aa0cf8b9471..b3090f20545 100644 --- a/src/intel/vulkan/genX_query.c +++ b/src/intel/vulkan/genX_query.c @@ -37,6 +37,10 @@ #define __gen_get_batch_dwords anv_batch_emit_dwords #define __gen_address_offset anv_address_add #include "common/gen_mi_builder.h" +#include "perf/gen_perf.h" +#include "perf/gen_perf_mdapi.h" + +#define OA_REPORT_N_UINT64 (256 / sizeof(uint64_t)) VkResult genX(CreateQueryPool)( VkDevice _device, @@ -52,9 +56,14 @@ VkResult genX(CreateQueryPool)( assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO); /* Query pool slots are made up of some number of 64-bit values packed - * tightly together. The first 64-bit value is always the "available" bit - * which is 0 when the query is unavailable and 1 when it is available. - * The 64-bit values that follow are determined by the type of query. + * tightly together. For most query types, the first 64-bit value is + * the "available" bit which is 0 when the query is unavailable and 1 when + * it is available. The 64-bit values that follow are determined by the + * type of query. + * + * For performance queries, we have a requirement to align OA reports at + * 64 bytes so we put those first and have the "available" bit behind + * together with some other counters. 
*/ uint32_t uint64s_per_slot = 1; @@ -84,6 +93,15 @@ VkResult genX(CreateQueryPool)( */ uint64s_per_slot += 4; break; + case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: { + uint64s_per_slot = 2 * OA_REPORT_N_UINT64; /* begin & end OA reports */ + uint64s_per_slot += 4; /* PerfCounter 1 & 2 */ + uint64s_per_slot++; /* 2 * 32bit RPSTAT register */ + uint64s_per_slot++; /* 64bit marker */ + uint64s_per_slot++; /* availability */ + uint64s_per_slot = align_u32(uint64s_per_slot, 8); /* OA reports must be aligned to 64 bytes */ + break; + } default: assert(!"Invalid query type"); } @@ -160,6 +178,57 @@ anv_query_address(struct anv_query_pool *pool, uint32_t query) }; } +/** + * VK_INTEL_performance_query layout: + * + * ------------------------------ + * | end MI_RPC (256b) | + * |----------------------------| + * | begin MI_RPC (256b) | + * |----------------------------| + * | begin perfcntr 1 & 2 (16b) | + * |----------------------------| + * | end perfcntr 1 & 2 (16b) | + * |----------------------------| + * | begin RPSTAT register (4b) | + * |----------------------------| + * | end RPSTAT register (4b) | + * |----------------------------| + * | marker (8b) | + * |----------------------------| + * | availability (8b) | + * ------------------------------ + */ + +static uint32_t +intel_perf_mi_rpc_offset(bool end) +{ + return end ? 0 : 256; +} + +static uint32_t +intel_perf_counter(bool end) +{ + uint32_t offset = 512; + offset += end ? 2 * sizeof(uint64_t) : 0; + return offset; +} + +static uint32_t +intel_perf_rpstart_offset(bool end) +{ + uint32_t offset = intel_perf_counter(false) + + 4 * sizeof(uint64_t); + offset += end ? 
sizeof(uint32_t) : 0; + return offset; +} + +static uint32_t +intel_perf_marker_offset(void) +{ + return intel_perf_rpstart_offset(false) + sizeof(uint64_t); +} + static void cpu_write_query_result(void *dst_slot, VkQueryResultFlags flags, uint32_t value_index, uint64_t result) @@ -173,18 +242,28 @@ cpu_write_query_result(void *dst_slot, VkQueryResultFlags flags, } } +static void * +query_slot(struct anv_query_pool *pool, uint32_t query) +{ + return pool->bo.map + query * pool->stride; +} + static bool -query_is_available(uint64_t *slot) +query_is_available(struct anv_query_pool *pool, uint32_t query) { - return *(volatile uint64_t *)slot; + if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL) { + return *(volatile uint64_t *)((uint8_t *)query_slot(pool, query) + + pool->stride - 8); + } else + return *(volatile uint64_t *)query_slot(pool, query); } static VkResult wait_for_available(struct anv_device *device, - struct anv_query_pool *pool, uint64_t *slot) + struct anv_query_pool *pool, uint32_t query) { while (true) { - if (query_is_available(slot)) + if (query_is_available(pool, query)) return VK_SUCCESS; int ret = anv_gem_busy(device, pool->bo.gem_handle); @@ -197,7 +276,7 @@ wait_for_available(struct anv_device *device, } else { assert(ret == 0); /* The BO is no longer busy. 
*/ - if (query_is_available(slot)) { + if (query_is_available(pool, query)) { return VK_SUCCESS; } else { VkResult status = anv_device_query_status(device); @@ -233,7 +312,8 @@ VkResult genX(GetQueryPoolResults)( assert(pool->type == VK_QUERY_TYPE_OCCLUSION || pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS || pool->type == VK_QUERY_TYPE_TIMESTAMP || - pool->type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT); + pool->type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT || + pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL); if (anv_device_is_lost(device)) return VK_ERROR_DEVICE_LOST; @@ -245,13 +325,10 @@ VkResult genX(GetQueryPoolResults)( VkResult status = VK_SUCCESS; for (uint32_t i = 0; i < queryCount; i++) { - uint64_t *slot = pool->bo.map + (firstQuery + i) * pool->stride; - - /* Availability is always at the start of the slot */ - bool available = slot[0]; + bool available = query_is_available(pool, firstQuery + i); if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) { - status = wait_for_available(device, pool, slot); + status = wait_for_available(device, pool, firstQuery + i); if (status != VK_SUCCESS) return status; @@ -271,13 +348,16 @@ VkResult genX(GetQueryPoolResults)( uint32_t idx = 0; switch (pool->type) { - case VK_QUERY_TYPE_OCCLUSION: + case VK_QUERY_TYPE_OCCLUSION: { + uint64_t *slot = query_slot(pool, firstQuery + i); if (write_results) cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]); idx++; break; + } case VK_QUERY_TYPE_PIPELINE_STATISTICS: { + uint64_t *slot = query_slot(pool, firstQuery + i); uint32_t statistics = pool->pipeline_statistics; while (statistics) { uint32_t stat = u_bit_scan(&statistics); @@ -297,7 +377,8 @@ VkResult genX(GetQueryPoolResults)( break; } - case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: + case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: { + uint64_t *slot = query_slot(pool, firstQuery + i); if (write_results) cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]); idx++; @@ -305,12 
+386,54 @@ VkResult genX(GetQueryPoolResults)( cpu_write_query_result(pData, flags, idx, slot[4] - slot[3]); idx++; break; + } - case VK_QUERY_TYPE_TIMESTAMP: + case VK_QUERY_TYPE_TIMESTAMP: { + uint64_t *slot = query_slot(pool, firstQuery + i); if (write_results) cpu_write_query_result(pData, flags, idx, slot[1]); idx++; break; + } + + case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: { + if (!write_results) + break; + const void *query_data = query_slot(pool, firstQuery + i); + const uint32_t *oa_begin = query_data + intel_perf_mi_rpc_offset(false); + const uint32_t *oa_end = query_data + intel_perf_mi_rpc_offset(true); + const uint32_t *rpstat_begin = query_data + intel_perf_rpstart_offset(false); + const uint32_t *rpstat_end = query_data + intel_perf_rpstart_offset(true); + struct gen_perf_query_result result; + struct gen_perf_query_info metric = { + .oa_format = (GEN_GEN >= 8 ? + I915_OA_FORMAT_A32u40_A4u32_B8_C8 : + I915_OA_FORMAT_A45_B8_C8), + }; + uint32_t core_freq[2]; +#if GEN_GEN < 9 + core_freq[0] = ((*rpstat_begin >> 7) & 0x7f) * 1000000ULL; + core_freq[1] = ((*rpstat_end >> 7) & 0x7f) * 1000000ULL; +#else + core_freq[0] = ((*rpstat_begin >> 23) & 0x1ff) * 1000000ULL; + core_freq[1] = ((*rpstat_end >> 23) & 0x1ff) * 1000000ULL; +#endif + gen_perf_query_result_clear(&result); + gen_perf_query_result_accumulate(&result, &metric, + oa_begin, oa_end); + gen_perf_query_result_read_frequencies(&result, &device->info, + oa_begin, oa_end); + gen_perf_query_result_write_mdapi(pData, stride, + &device->info, + &result, + core_freq[0], core_freq[1]); + gen_perf_query_mdapi_write_perfcntr(pData, stride, &device->info, + query_data + intel_perf_counter(false), + query_data + intel_perf_counter(true)); + const uint64_t *marker = query_data + intel_perf_marker_offset(); + gen_perf_query_mdapi_write_marker(pData, stride, &device->info, *marker); + break; + } default: unreachable("invalid pool type"); @@ -406,6 +529,16 @@ emit_zero_queries(struct anv_cmd_buffer *cmd_buffer, 
} break; + case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: + for (uint32_t i = 0; i < num_queries; i++) { + struct anv_address slot_addr = + anv_query_address(pool, first_index + i); + gen_mi_memset(b, slot_addr, 0, pool->stride - 8); + emit_query_mi_availability(b, anv_address_add(slot_addr, + pool->stride - 8), true); + } + break; + default: unreachable("Unsupported query type"); } @@ -440,6 +573,21 @@ void genX(CmdResetQueryPool)( break; } + case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: { + struct gen_mi_builder b; + gen_mi_builder_init(&b, &cmd_buffer->batch); + + for (uint32_t i = 0; i < queryCount; i++) { + emit_query_mi_availability( + &b, + anv_address_add( + anv_query_address(pool, firstQuery + i), + pool->stride - 8), + false); + } + break; + } + default: unreachable("Unsupported query type"); } @@ -550,6 +698,37 @@ void genX(CmdBeginQueryIndexedEXT)( emit_xfb_query(&b, index, anv_address_add(query_addr, 8)); break; + case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: { + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.CommandStreamerStallEnable = true; + pc.StallAtPixelScoreboard = true; + } + anv_batch_emit(&cmd_buffer->batch, GENX(MI_REPORT_PERF_COUNT), rpc) { + rpc.MemoryAddress = + anv_address_add(query_addr, intel_perf_mi_rpc_offset(false)); + } +#if GEN_GEN < 9 + gen_mi_store(&b, + gen_mi_mem32(anv_address_add(query_addr, + intel_perf_rpstart_offset(false))), + gen_mi_reg32(GENX(RPSTAT1_num))); +#else + gen_mi_store(&b, + gen_mi_mem32(anv_address_add(query_addr, + intel_perf_rpstart_offset(false))), + gen_mi_reg32(GENX(RPSTAT0_num))); +#endif +#if GEN_GEN >= 8 && GEN_GEN <= 11 + gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, + intel_perf_counter(false))), + gen_mi_reg64(GENX(PERFCNT1_num))); + gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, + intel_perf_counter(false) + 8)), + gen_mi_reg64(GENX(PERFCNT2_num))); +#endif + break; + } + default: unreachable(""); } @@ -611,6 +790,45 @@ void genX(CmdEndQueryIndexedEXT)( 
emit_query_mi_availability(&b, query_addr, true); break; + case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: { + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.CommandStreamerStallEnable = true; + pc.StallAtPixelScoreboard = true; + } + uint32_t marker_offset = intel_perf_marker_offset(); + gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, marker_offset)), + gen_mi_imm(cmd_buffer->intel_perf_marker)); +#if GEN_GEN >= 8 && GEN_GEN <= 11 + gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, intel_perf_counter(true))), + gen_mi_reg64(GENX(PERFCNT1_num))); + gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, intel_perf_counter(true) + 8)), + gen_mi_reg64(GENX(PERFCNT2_num))); +#endif +#if GEN_GEN < 9 + gen_mi_store(&b, + gen_mi_mem32(anv_address_add(query_addr, + intel_perf_rpstart_offset(true))), + gen_mi_reg32(GENX(RPSTAT1_num))); +#else + gen_mi_store(&b, + gen_mi_mem32(anv_address_add(query_addr, + intel_perf_rpstart_offset(true))), + gen_mi_reg32(GENX(RPSTAT0_num))); +#endif + /* Position the last OA snapshot at the beginning of the query so that + * we can tell whether it's ready. 
+ */ + anv_batch_emit(&cmd_buffer->batch, GENX(MI_REPORT_PERF_COUNT), rpc) { + rpc.MemoryAddress = anv_address_add(query_addr, + intel_perf_mi_rpc_offset(true)); + rpc.ReportID = 0xdeadbeef; /* This goes in the first dword */ + } + emit_query_mi_availability(&b, + anv_address_add(query_addr, pool->stride - 8), + true); + break; + } + default: unreachable(""); } diff --git a/src/intel/vulkan/meson.build b/src/intel/vulkan/meson.build index e8db8f44de0..69e472f719b 100644 --- a/src/intel/vulkan/meson.build +++ b/src/intel/vulkan/meson.build @@ -118,6 +118,7 @@ libanv_files = files( 'anv_nir_lower_push_constants.c', 'anv_nir_lower_ycbcr_textures.c', 'anv_pass.c', + 'anv_perf.c', 'anv_pipeline.c', 'anv_pipeline_cache.c', 'anv_private.h', @@ -194,6 +195,7 @@ libvulkan_intel = shared_library( link_whole : [libanv_common, libanv_gen_libs], link_with : [ libintel_compiler, libintel_dev, libisl, libblorp, libvulkan_wsi, + libintel_perf, ], dependencies : [ dep_thread, dep_dl, dep_m, anv_deps, idep_libintel_common, @@ -227,7 +229,7 @@ if with_tests link_whole : libanv_common, link_with : [ libanv_gen_libs, libintel_compiler, libintel_common, libintel_dev, - libisl, libblorp, libvulkan_wsi, + libisl, libblorp, libvulkan_wsi, libintel_perf, ], dependencies : [ dep_thread, dep_dl, dep_m, anv_deps, -- 2.30.2