--- /dev/null
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "anv_private.h"
+
+#include "perf/gen_perf.h"
+#include "perf/gen_perf_mdapi.h"
+
+/* Build the gen_perf_config describing the performance metrics available on
+ * this device, or return NULL when the kernel's i915-perf interface lacks
+ * the features we require.
+ */
+struct gen_perf_config *
+anv_get_perf(const struct gen_device_info *devinfo, int fd)
+{
+   struct gen_perf_config *perf = gen_perf_new(NULL);
+
+   gen_perf_init_metrics(perf, devinfo, fd);
+
+   /* We need DRM_I915_PERF_PROP_HOLD_PREEMPTION support, only available in
+    * perf revision 3 (hence the "< 3" check below).
+    */
+   if (anv_gem_get_param(fd, I915_PARAM_PERF_REVISION) < 3)
+      goto err;
+
+   return perf;
+
+ err:
+   ralloc_free(perf);
+   return NULL;
+}
+
+/* Called at device creation: no i915-perf stream has been opened yet. */
+void
+anv_device_perf_init(struct anv_device *device)
+{
+   device->perf_fd = -1;
+}
+
+/* Open an i915-perf stream on the device's context, selecting the given
+ * metric set. Returns the stream fd, or -1 on failure (errno set by the
+ * ioctl).
+ */
+static int
+anv_device_perf_open(struct anv_device *device, uint64_t metric_id)
+{
+   uint64_t properties[DRM_I915_PERF_PROP_MAX * 2];
+   struct drm_i915_perf_open_param param;
+   int p = 0, stream_fd;
+
+   /* Properties are (key, value) pairs. */
+   properties[p++] = DRM_I915_PERF_PROP_SAMPLE_OA;
+   properties[p++] = true;
+
+   properties[p++] = DRM_I915_PERF_PROP_OA_METRICS_SET;
+   properties[p++] = metric_id;
+
+   properties[p++] = DRM_I915_PERF_PROP_OA_FORMAT;
+   properties[p++] = device->info.gen >= 8 ?
+      I915_OA_FORMAT_A32u40_A4u32_B8_C8 :
+      I915_OA_FORMAT_A45_B8_C8;
+
+   properties[p++] = DRM_I915_PERF_PROP_OA_EXPONENT;
+   properties[p++] = 31; /* slowest sampling period */
+
+   properties[p++] = DRM_I915_PERF_PROP_CTX_HANDLE;
+   properties[p++] = device->context_id;
+
+   properties[p++] = DRM_I915_PERF_PROP_HOLD_PREEMPTION;
+   properties[p++] = true;
+
+   /* Fix: "&param" had been mangled into the "¶m" mojibake (a corrupted
+    * "&para;" entity), which does not compile.
+    */
+   memset(&param, 0, sizeof(param));
+   param.flags = I915_PERF_FLAG_FD_CLOEXEC | I915_PERF_FLAG_FD_NONBLOCK;
+   param.properties_ptr = (uintptr_t)properties;
+   param.num_properties = p / 2;
+
+   stream_fd = gen_ioctl(device->fd, DRM_IOCTL_I915_PERF_OPEN, &param);
+   return stream_fd;
+}
+
+VkResult anv_InitializePerformanceApiINTEL(
+    VkDevice                                    _device,
+    const VkInitializePerformanceApiInfoINTEL*  pInitializeInfo)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   const struct anv_physical_device *pdevice = &device->instance->physicalDevice;
+
+   /* pdevice->perf is NULL when the kernel does not support the i915-perf
+    * features we need (presumably set from anv_get_perf() — see anv_perf.c).
+    */
+   if (!pdevice->perf)
+      return VK_ERROR_EXTENSION_NOT_PRESENT;
+
+   /* Not much to do here */
+   return VK_SUCCESS;
+}
+
+VkResult anv_GetPerformanceParameterINTEL(
+    VkDevice                                    _device,
+    VkPerformanceParameterTypeINTEL             parameter,
+    VkPerformanceValueINTEL*                    pValue)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   const struct anv_physical_device *pdevice = &device->instance->physicalDevice;
+
+   if (!pdevice->perf)
+      return VK_ERROR_EXTENSION_NOT_PRESENT;
+
+   VkResult result = VK_SUCCESS;
+   switch (parameter) {
+   case VK_PERFORMANCE_PARAMETER_TYPE_HW_COUNTERS_SUPPORTED_INTEL:
+      pValue->type = VK_PERFORMANCE_VALUE_TYPE_BOOL_INTEL;
+      pValue->data.valueBool = VK_TRUE;
+      break;
+
+   case VK_PERFORMANCE_PARAMETER_TYPE_STREAM_MARKER_VALID_BITS_INTEL:
+      pValue->type = VK_PERFORMANCE_VALUE_TYPE_UINT32_INTEL;
+      /* NOTE(review): 25 is presumably the number of marker bits the HW
+       * preserves — confirm against the stream-marker documentation.
+       */
+      pValue->data.value32 = 25;
+      break;
+
+   default:
+      result = VK_ERROR_FEATURE_NOT_PRESENT;
+      break;
+   }
+
+   return result;
+}
+
+VkResult anv_CmdSetPerformanceMarkerINTEL(
+    VkCommandBuffer                             commandBuffer,
+    const VkPerformanceMarkerInfoINTEL*         pMarkerInfo)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+
+   /* Just record the value on the command buffer; it is written into the
+    * query slot when the performance query ends (see the marker store in
+    * the CmdEndQuery path).
+    */
+   cmd_buffer->intel_perf_marker = pMarkerInfo->marker;
+
+   return VK_SUCCESS;
+}
+
+VkResult anv_AcquirePerformanceConfigurationINTEL(
+    VkDevice                                    _device,
+    const VkPerformanceConfigurationAcquireInfoINTEL* pAcquireInfo,
+    VkPerformanceConfigurationINTEL*            pConfiguration)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   const struct anv_physical_device *pdevice = &device->instance->physicalDevice;
+
+   /* Load the MDAPI metric set and register it with the kernel; the i915
+    * metric config id doubles as our configuration handle.
+    */
+   struct gen_perf_registers *perf_config =
+      gen_perf_load_configuration(pdevice->perf, device->fd,
+                                  GEN_PERF_QUERY_GUID_MDAPI);
+   if (!perf_config)
+      return VK_INCOMPLETE;
+
+   int ret = gen_perf_store_configuration(pdevice->perf, device->fd,
+                                          perf_config, NULL /* guid */);
+
+   /* Fix: perf_config was only freed on the failure path and leaked on
+    * success. Ownership is ours (the original error path already freed it),
+    * so release it in both cases.
+    */
+   ralloc_free(perf_config);
+
+   if (ret < 0)
+      return VK_INCOMPLETE;
+
+   *pConfiguration = (VkPerformanceConfigurationINTEL) (uint64_t) ret;
+
+   return VK_SUCCESS;
+}
+
+VkResult anv_ReleasePerformanceConfigurationINTEL(
+    VkDevice                                    _device,
+    VkPerformanceConfigurationINTEL             _configuration)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   uint64_t config = (uint64_t) _configuration;
+
+   /* The handle is the i915 metric config id; removal is best-effort and a
+    * failing ioctl is deliberately ignored.
+    */
+   gen_ioctl(device->fd, DRM_IOCTL_I915_PERF_REMOVE_CONFIG, &config);
+
+   return VK_SUCCESS;
+}
+
+VkResult anv_QueueSetPerformanceConfigurationINTEL(
+    VkQueue                                     _queue,
+    VkPerformanceConfigurationINTEL             _configuration)
+{
+   ANV_FROM_HANDLE(anv_queue, queue, _queue);
+   struct anv_device *device = queue->device;
+   uint64_t configuration = (uint64_t) _configuration;
+
+   if (device->perf_fd < 0) {
+      /* First use: open the i915-perf stream with this metric config. */
+      device->perf_fd = anv_device_perf_open(device, configuration);
+      if (device->perf_fd < 0)
+         return VK_ERROR_INITIALIZATION_FAILED;
+   } else {
+      /* Stream already open: just switch its metric configuration. */
+      int ret = gen_ioctl(device->perf_fd, I915_PERF_IOCTL_CONFIG,
+                          (void *)(uintptr_t) _configuration);
+      if (ret < 0) {
+         /* Fix: the ioctl wrapper returns -1 with the error in errno;
+          * strerror(ret) would print "Unknown error -1".
+          */
+         return anv_device_set_lost(device,
+                                    "i915-perf config failed: %s",
+                                    strerror(errno));
+      }
+   }
+
+   return VK_SUCCESS;
+}
+
+void anv_UninitializePerformanceApiINTEL(
+    VkDevice                                    _device)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+
+   /* Close the i915-perf stream, if one was opened, and return to the
+    * initial "no stream" state set by anv_device_perf_init().
+    */
+   if (device->perf_fd >= 0) {
+      close(device->perf_fd);
+      device->perf_fd = -1;
+   }
+}
#define __gen_get_batch_dwords anv_batch_emit_dwords
#define __gen_address_offset anv_address_add
#include "common/gen_mi_builder.h"
+#include "perf/gen_perf.h"
+#include "perf/gen_perf_mdapi.h"
+
+#define OA_REPORT_N_UINT64 (256 / sizeof(uint64_t))
VkResult genX(CreateQueryPool)(
VkDevice _device,
assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
/* Query pool slots are made up of some number of 64-bit values packed
- * tightly together. The first 64-bit value is always the "available" bit
- * which is 0 when the query is unavailable and 1 when it is available.
- * The 64-bit values that follow are determined by the type of query.
+ * tightly together. For most query types, the first 64-bit value is the
+ * "available" bit, which is 0 when the query is unavailable and 1 when it
+ * is available. The 64-bit values that follow are determined by the type
+ * of query.
+ *
+ * For performance queries, we have a requirement to align OA reports at
+ * 64bytes so we put those first and have the "available" bit behind
+ * together with some other counters.
*/
uint32_t uint64s_per_slot = 1;
*/
uint64s_per_slot += 4;
break;
+   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
+      uint64s_per_slot = 2 * OA_REPORT_N_UINT64; /* begin & end OA reports */
+      uint64s_per_slot += 4; /* PerfCounter 1 & 2 */
+      uint64s_per_slot++; /* 2 * 32bit RPSTAT register */
+      uint64s_per_slot++; /* 64bit marker */
+      uint64s_per_slot++; /* availability */
+      /* Round the slot up to a multiple of 8 uint64s (64 bytes) so the OA
+       * reports, which sit at the start of every slot, stay 64-byte aligned.
+       */
+      uint64s_per_slot = align_u32(uint64s_per_slot, 8);
+      break;
+   }
default:
assert(!"Invalid query type");
}
};
}
+/**
+ * VK_INTEL_performance_query layout:
+ *
+ * ------------------------------
+ * | end MI_RPC (256b) |
+ * |----------------------------|
+ * | begin MI_RPC (256b) |
+ * |----------------------------|
+ * | begin perfcntr 1 & 2 (16b) |
+ * |----------------------------|
+ * | end perfcntr 1 & 2 (16b) |
+ * |----------------------------|
+ * | begin RPSTAT register (4b) |
+ * |----------------------------|
+ * | end RPSTAT register (4b) |
+ * |----------------------------|
+ * | marker (8b) |
+ * |----------------------------|
+ * | availability (8b) |
+ * ------------------------------
+ */
+
+/* Byte offset of an OA report within a query slot: the end-of-query report
+ * is stored first (offset 0, so its landing can signal readiness) and the
+ * begin report right after it (offset 256) — see the layout diagram above.
+ */
+static uint32_t
+intel_perf_mi_rpc_offset(bool end)
+{
+   return end ? 0 : 256;
+}
+
+/* Byte offset of the PERFCNT1/PERFCNT2 snapshot pair (2 x 64 bits): begin
+ * pair right after the two OA reports (offset 512), end pair 16 bytes later.
+ */
+static uint32_t
+intel_perf_counter(bool end)
+{
+   uint32_t offset = 512;
+   offset += end ? 2 * sizeof(uint64_t) : 0;
+   return offset;
+}
+
+/* Byte offset of the 32-bit RPSTAT register snapshot, placed after the four
+ * perf-counter qwords; begin and end values are 4 bytes apart.
+ */
+static uint32_t
+intel_perf_rpstart_offset(bool end)
+{
+   uint32_t offset = intel_perf_counter(false) +
+                     4 * sizeof(uint64_t);
+   offset += end ? sizeof(uint32_t) : 0;
+   return offset;
+}
+
+/* Byte offset of the 64-bit marker value, placed after both RPSTAT dwords. */
+static uint32_t
+intel_perf_marker_offset(void)
+{
+   return intel_perf_rpstart_offset(false) + sizeof(uint64_t);
+}
+
static void
cpu_write_query_result(void *dst_slot, VkQueryResultFlags flags,
uint32_t value_index, uint64_t result)
}
}
+/* CPU pointer to the start of the given query's slot in the pool BO. */
+static void *
+query_slot(struct anv_query_pool *pool, uint32_t query)
+{
+   return pool->bo.map + query * pool->stride;
+}
+
 static bool
-query_is_available(uint64_t *slot)
+query_is_available(struct anv_query_pool *pool, uint32_t query)
 {
-   return *(volatile uint64_t *)slot;
+   /* Performance queries keep their availability qword in the last 64 bits
+    * of the slot (the OA reports must come first, 64-byte aligned); every
+    * other query type keeps it in the first 64 bits.
+    */
+   if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL) {
+      return *(volatile uint64_t *)((uint8_t *)query_slot(pool, query) +
+                                    pool->stride - 8);
+   } else
+      return *(volatile uint64_t *)query_slot(pool, query);
 }
static VkResult
wait_for_available(struct anv_device *device,
- struct anv_query_pool *pool, uint64_t *slot)
+ struct anv_query_pool *pool, uint32_t query)
{
while (true) {
- if (query_is_available(slot))
+ if (query_is_available(pool, query))
return VK_SUCCESS;
int ret = anv_gem_busy(device, pool->bo.gem_handle);
} else {
assert(ret == 0);
/* The BO is no longer busy. */
- if (query_is_available(slot)) {
+ if (query_is_available(pool, query)) {
return VK_SUCCESS;
} else {
VkResult status = anv_device_query_status(device);
assert(pool->type == VK_QUERY_TYPE_OCCLUSION ||
pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
pool->type == VK_QUERY_TYPE_TIMESTAMP ||
- pool->type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT);
+ pool->type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT ||
+ pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL);
if (anv_device_is_lost(device))
return VK_ERROR_DEVICE_LOST;
VkResult status = VK_SUCCESS;
for (uint32_t i = 0; i < queryCount; i++) {
- uint64_t *slot = pool->bo.map + (firstQuery + i) * pool->stride;
-
- /* Availability is always at the start of the slot */
- bool available = slot[0];
+ bool available = query_is_available(pool, firstQuery + i);
if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) {
- status = wait_for_available(device, pool, slot);
+ status = wait_for_available(device, pool, firstQuery + i);
if (status != VK_SUCCESS)
return status;
uint32_t idx = 0;
switch (pool->type) {
- case VK_QUERY_TYPE_OCCLUSION:
+ case VK_QUERY_TYPE_OCCLUSION: {
+ uint64_t *slot = query_slot(pool, firstQuery + i);
if (write_results)
cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]);
idx++;
break;
+ }
case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
+ uint64_t *slot = query_slot(pool, firstQuery + i);
uint32_t statistics = pool->pipeline_statistics;
while (statistics) {
uint32_t stat = u_bit_scan(&statistics);
break;
}
- case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
+ case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
+ uint64_t *slot = query_slot(pool, firstQuery + i);
if (write_results)
cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]);
idx++;
cpu_write_query_result(pData, flags, idx, slot[4] - slot[3]);
idx++;
break;
+ }
- case VK_QUERY_TYPE_TIMESTAMP:
+ case VK_QUERY_TYPE_TIMESTAMP: {
+ uint64_t *slot = query_slot(pool, firstQuery + i);
if (write_results)
cpu_write_query_result(pData, flags, idx, slot[1]);
idx++;
break;
+ }
+
+      case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
+         if (!write_results)
+            break;
+         const void *query_data = query_slot(pool, firstQuery + i);
+         const uint32_t *oa_begin = query_data + intel_perf_mi_rpc_offset(false);
+         const uint32_t *oa_end = query_data + intel_perf_mi_rpc_offset(true);
+         const uint32_t *rpstat_begin = query_data + intel_perf_rpstart_offset(false);
+         /* Fix: the end RPSTAT snapshot lives at the RPSTAT end offset; it
+          * was previously read from intel_perf_mi_rpc_offset(true), i.e.
+          * from the first dword of the end OA report.
+          */
+         const uint32_t *rpstat_end = query_data + intel_perf_rpstart_offset(true);
+         struct gen_perf_query_result result;
+         struct gen_perf_query_info metric = {
+            .oa_format = (GEN_GEN >= 8 ?
+                          I915_OA_FORMAT_A32u40_A4u32_B8_C8 :
+                          I915_OA_FORMAT_A45_B8_C8),
+         };
+         uint32_t core_freq[2];
+         /* Decode the GPU core frequency from the RPSTAT snapshots; the
+          * field position differs before/after gen9.
+          */
+#if GEN_GEN < 9
+         core_freq[0] = ((*rpstat_begin >> 7) & 0x7f) * 1000000ULL;
+         core_freq[1] = ((*rpstat_end >> 7) & 0x7f) * 1000000ULL;
+#else
+         core_freq[0] = ((*rpstat_begin >> 23) & 0x1ff) * 1000000ULL;
+         core_freq[1] = ((*rpstat_end >> 23) & 0x1ff) * 1000000ULL;
+#endif
+         gen_perf_query_result_clear(&result);
+         gen_perf_query_result_accumulate(&result, &metric,
+                                          oa_begin, oa_end);
+         gen_perf_query_result_read_frequencies(&result, &device->info,
+                                                oa_begin, oa_end);
+         gen_perf_query_result_write_mdapi(pData, stride,
+                                           &device->info,
+                                           &result,
+                                           core_freq[0], core_freq[1]);
+         gen_perf_query_mdapi_write_perfcntr(pData, stride, &device->info,
+                                             query_data + intel_perf_counter(false),
+                                             query_data + intel_perf_counter(true));
+         const uint64_t *marker = query_data + intel_perf_marker_offset();
+         gen_perf_query_mdapi_write_marker(pData, stride, &device->info, *marker);
+         break;
+      }
default:
unreachable("invalid pool type");
}
break;
+   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL:
+      /* Zero the counter data and then set the trailing availability qword
+       * to true (NOTE(review): presumably this seeds queries that will not
+       * otherwise be written — confirm against the caller).
+       */
+      for (uint32_t i = 0; i < num_queries; i++) {
+         struct anv_address slot_addr =
+            anv_query_address(pool, first_index + i);
+         gen_mi_memset(b, slot_addr, 0, pool->stride - 8);
+         emit_query_mi_availability(b, anv_address_add(slot_addr,
+                                                       pool->stride - 8), true);
+      }
+      break;
+
default:
unreachable("Unsupported query type");
}
break;
}
+   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
+      struct gen_mi_builder b;
+      gen_mi_builder_init(&b, &cmd_buffer->batch);
+
+      /* Resetting a performance query only needs to clear the availability
+       * qword, which lives in the last 64 bits of each slot.
+       */
+      for (uint32_t i = 0; i < queryCount; i++) {
+         emit_query_mi_availability(
+            &b,
+            anv_address_add(
+               anv_query_address(pool, firstQuery + i),
+               pool->stride - 8),
+            false);
+      }
+      break;
+   }
+
default:
unreachable("Unsupported query type");
}
emit_xfb_query(&b, index, anv_address_add(query_addr, 8));
break;
+   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
+      /* Stall the command streamer so the snapshots below observe all
+       * previously submitted work.
+       */
+      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
+         pc.CommandStreamerStallEnable = true;
+         pc.StallAtPixelScoreboard = true;
+      }
+      /* Begin OA report lands in the second report slot (offset 256). */
+      anv_batch_emit(&cmd_buffer->batch, GENX(MI_REPORT_PERF_COUNT), rpc) {
+         rpc.MemoryAddress =
+            anv_address_add(query_addr, intel_perf_mi_rpc_offset(false));
+      }
+      /* Snapshot the frequency register (RPSTAT1 pre-gen9, RPSTAT0 on
+       * gen9+).
+       */
+#if GEN_GEN < 9
+      gen_mi_store(&b,
+                   gen_mi_mem32(anv_address_add(query_addr,
+                                                intel_perf_rpstart_offset(false))),
+                   gen_mi_reg32(GENX(RPSTAT1_num)));
+#else
+      gen_mi_store(&b,
+                   gen_mi_mem32(anv_address_add(query_addr,
+                                                intel_perf_rpstart_offset(false))),
+                   gen_mi_reg32(GENX(RPSTAT0_num)));
+#endif
+      /* Begin snapshots of PERFCNT1/PERFCNT2 (guarded to gens 8-11). */
+#if GEN_GEN >= 8 && GEN_GEN <= 11
+      gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr,
+                                                    intel_perf_counter(false))),
+                   gen_mi_reg64(GENX(PERFCNT1_num)));
+      gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr,
+                                                    intel_perf_counter(false) + 8)),
+                   gen_mi_reg64(GENX(PERFCNT2_num)));
+#endif
+      break;
+   }
+
default:
unreachable("");
}
emit_query_mi_availability(&b, query_addr, true);
break;
+   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
+      /* Stall so the end snapshots observe all work recorded in the query
+       * scope.
+       */
+      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
+         pc.CommandStreamerStallEnable = true;
+         pc.StallAtPixelScoreboard = true;
+      }
+      /* Store the marker recorded by vkCmdSetPerformanceMarkerINTEL. */
+      uint32_t marker_offset = intel_perf_marker_offset();
+      gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, marker_offset)),
+                   gen_mi_imm(cmd_buffer->intel_perf_marker));
+      /* End snapshots of PERFCNT1/PERFCNT2 (guarded to gens 8-11). */
+#if GEN_GEN >= 8 && GEN_GEN <= 11
+      gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, intel_perf_counter(true))),
+                   gen_mi_reg64(GENX(PERFCNT1_num)));
+      gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, intel_perf_counter(true) + 8)),
+                   gen_mi_reg64(GENX(PERFCNT2_num)));
+#endif
+      /* End snapshot of the frequency register (RPSTAT1 pre-gen9, RPSTAT0
+       * on gen9+).
+       */
+#if GEN_GEN < 9
+      gen_mi_store(&b,
+                   gen_mi_mem32(anv_address_add(query_addr,
+                                                intel_perf_rpstart_offset(true))),
+                   gen_mi_reg32(GENX(RPSTAT1_num)));
+#else
+      gen_mi_store(&b,
+                   gen_mi_mem32(anv_address_add(query_addr,
+                                                intel_perf_rpstart_offset(true))),
+                   gen_mi_reg32(GENX(RPSTAT0_num)));
+#endif
+      /* Position the last OA snapshot at the beginning of the query so that
+       * we can tell whether it's ready.
+       */
+      anv_batch_emit(&cmd_buffer->batch, GENX(MI_REPORT_PERF_COUNT), rpc) {
+         rpc.MemoryAddress = anv_address_add(query_addr,
+                                             intel_perf_mi_rpc_offset(true));
+         rpc.ReportID = 0xdeadbeef; /* This goes in the first dword */
+      }
+      /* Availability qword lives in the last 64 bits of the slot. */
+      emit_query_mi_availability(&b,
+                                 anv_address_add(query_addr, pool->stride - 8),
+                                 true);
+      break;
+   }
+
default:
unreachable("");
}