From: Lionel Landwerlin Date: Sat, 6 Oct 2018 18:12:34 +0000 (+0100) Subject: anv: Implement VK_KHR_performance_query X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=2001a80d4a81f2e8194b29cca301dd1b27be9acb;p=mesa.git anv: Implement VK_KHR_performance_query This has the same kernel requirements as VK_INTEL_performance_query. v2: Fix empty queue submit (Lionel) v3: Fix autotool build issue (Piotr Byszewski) v4: Fix Reset & Begin/End in same command buffer, using soft-pin & relocation on the same buffer won't work currently. This version uses a somewhat dirty trick in anv_execbuf_add_bo (Piotr Byszewski) v5: Fix enumeration with null pointers for either pCounters or pCounterDescriptions (Piotr) Fix return condition on enumeration (Lionel) Set counter uuid using sha1 hashes (Lionel) v6: Fix counters scope, should be COMMAND_KHR not COMMAND_BUFFER_KHR (Lionel) v7: Rebase (Lionel) v8: Rework checking for loaded queries (Lionel) v9: Use new i915-perf interface v10: Use anv_multialloc (Jason) v11: Implement perf query passes using self modifying batches (Lionel) Limit support to softpin/gen8 v12: Remove spurious changes (Jason) v13: Drop relocs (Jason) v14: Avoid overwriting .sType in VkPerformanceCounterKHR/VkPerformanceCounterDescriptionKHR (Lionel) v15: Don't copy the entire VkPerformanceCounterKHR/VkPerformanceCounterDescriptionKHR (Jason) Reuse anv_batch rather than custom packing (Jason) v16: Fix missing MI_BB_END in reconfiguration batch Only report the extension with kernel support (perf_version >= 3) v17: Some cleanup of unused stuff Signed-off-by: Lionel Landwerlin Reviewed-by: Jason Ekstrand Part-of: --- diff --git a/src/intel/vulkan/anv_batch_chain.c b/src/intel/vulkan/anv_batch_chain.c index 2c3b7b3cad1..f820a69ceec 100644 --- a/src/intel/vulkan/anv_batch_chain.c +++ b/src/intel/vulkan/anv_batch_chain.c @@ -31,6 +31,7 @@ #include "genxml/gen8_pack.h" #include "genxml/genX_bits.h" +#include "perf/gen_perf.h" #include "util/debug.h" @@ -1102,6 +1103,8 @@ struct anv_execbuf { const VkAllocationCallbacks * alloc; VkSystemAllocationScope alloc_scope; + + int perf_query_pass; }; static void @@ -1375,6 +1378,9 @@ static bool relocate_cmd_buffer(struct anv_cmd_buffer *cmd_buffer, struct anv_execbuf *exec) { + if (cmd_buffer->perf_query_pool) + return false; + if (!exec->has_relocs) return true; @@ -1672,6 +1678,7 @@ anv_queue_execbuf_locked(struct anv_queue *queue, anv_execbuf_init(&execbuf); execbuf.alloc = submit->alloc; execbuf.alloc_scope = submit->alloc_scope; + execbuf.perf_query_pass = submit->perf_query_pass; VkResult result; @@ -1708,10 +1715,26 @@ anv_queue_execbuf_locked(struct anv_queue *queue, if (result != VK_SUCCESS) goto error; + const bool has_perf_query = + submit->perf_query_pass >= 0 && + submit->cmd_buffer && + submit->cmd_buffer->perf_query_pool; + if (unlikely(INTEL_DEBUG & DEBUG_BATCH)) { if (submit->cmd_buffer) { - struct anv_batch_bo **bo = u_vector_tail(&submit->cmd_buffer->seen_bbos); + if (has_perf_query) { + struct anv_query_pool *query_pool = submit->cmd_buffer->perf_query_pool; + struct anv_bo *pass_batch_bo = query_pool->bo; + uint64_t pass_batch_offset = + khr_perf_query_preamble_offset(query_pool, + submit->perf_query_pass); + + gen_print_batch(&device->decoder_ctx, + pass_batch_bo->map + pass_batch_offset, 64, + pass_batch_bo->offset + pass_batch_offset, false); + } + struct anv_batch_bo **bo = u_vector_tail(&submit->cmd_buffer->seen_bbos); device->cmd_buffer_being_decoded = submit->cmd_buffer; gen_print_batch(&device->decoder_ctx,
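/* With batch debugging enabled, the hunk above first decodes the 64-byte
 * per-pass reconfiguration preamble, then the command buffer itself. A
 * quick way to exercise it (the application name is hypothetical):
 *
 *   INTEL_DEBUG=bat ./my_vulkan_app 2> decoded-batches.txt
 */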
(*bo)->bo->map, (*bo)->bo->size, (*bo)->bo->offset, false); @@ -1742,6 +1765,48 @@ anv_queue_execbuf_locked(struct anv_queue *queue, if (submit->need_out_fence) execbuf.execbuf.flags |= I915_EXEC_FENCE_OUT; + if (has_perf_query) { + struct anv_query_pool *query_pool = submit->cmd_buffer->perf_query_pool; + assert(submit->perf_query_pass < query_pool->n_passes); + struct gen_perf_query_info *query_info = + query_pool->pass_query[submit->perf_query_pass]; + + /* Some performance queries just use the pipeline statistics HW, no need + * for OA in that case, so no need to reconfigure. + */ + if (query_info->kind == GEN_PERF_QUERY_TYPE_OA || + query_info->kind == GEN_PERF_QUERY_TYPE_RAW) { + int ret = gen_ioctl(device->perf_fd, I915_PERF_IOCTL_CONFIG, + (void *)(uintptr_t) query_info->oa_metrics_set_id); + if (ret < 0) { + result = anv_device_set_lost(device, + "i915-perf config failed: %s", + strerror(errno)); + } + } + + struct anv_bo *pass_batch_bo = query_pool->bo; + + struct drm_i915_gem_exec_object2 query_pass_object = { + .handle = pass_batch_bo->gem_handle, + .offset = pass_batch_bo->offset, + .flags = pass_batch_bo->flags, + }; + struct drm_i915_gem_execbuffer2 query_pass_execbuf = { + .buffers_ptr = (uintptr_t) &query_pass_object, + .buffer_count = 1, + .batch_start_offset = khr_perf_query_preamble_offset(query_pool, + submit->perf_query_pass), + .flags = I915_EXEC_HANDLE_LUT | I915_EXEC_RENDER, + .rsvd1 = device->context_id, + }; + + int ret = queue->device->no_hw ? 0 : + anv_gem_execbuffer(queue->device, &query_pass_execbuf); + if (ret) + result = anv_queue_set_lost(queue, "execbuf2 failed: %m"); + } + int ret = queue->device->no_hw ? 0 : anv_gem_execbuffer(queue->device, &execbuf.execbuf); if (ret) diff --git a/src/intel/vulkan/anv_cmd_buffer.c b/src/intel/vulkan/anv_cmd_buffer.c index 49c7334567f..ea5ec415340 100644 --- a/src/intel/vulkan/anv_cmd_buffer.c +++ b/src/intel/vulkan/anv_cmd_buffer.c @@ -306,6 +306,7 @@ VkResult anv_cmd_buffer_reset(struct anv_cmd_buffer *cmd_buffer) { cmd_buffer->usage_flags = 0; + cmd_buffer->perf_query_pool = NULL; anv_cmd_buffer_reset_batch_bo_chain(cmd_buffer); anv_cmd_state_reset(cmd_buffer); diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c index 71fc427aa92..d9f15b46332 100644 --- a/src/intel/vulkan/anv_device.c +++ b/src/intel/vulkan/anv_device.c @@ -1238,6 +1238,15 @@ void anv_GetPhysicalDeviceFeatures2( break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PERFORMANCE_QUERY_FEATURES_KHR: { + VkPhysicalDevicePerformanceQueryFeaturesKHR *feature = + (VkPhysicalDevicePerformanceQueryFeaturesKHR *)ext; + feature->performanceCounterQueryPools = true; + /* HW only supports a single configuration at a time. */ + feature->performanceCounterMultipleQueryPools = false; + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PIPELINE_EXECUTABLE_PROPERTIES_FEATURES_KHR: { VkPhysicalDevicePipelineExecutablePropertiesFeaturesKHR *features = (VkPhysicalDevicePipelineExecutablePropertiesFeaturesKHR *)ext; @@ -1903,6 +1912,16 @@ void anv_GetPhysicalDeviceProperties2( break; } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PERFORMANCE_QUERY_PROPERTIES_KHR: { + VkPhysicalDevicePerformanceQueryPropertiesKHR *properties = + (VkPhysicalDevicePerformanceQueryPropertiesKHR *)ext; + /* We could support this by spawning a shader to do the equation + * normalization.
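Raw OA deltas only become user-visible values once each counter's equation has run (that is what oa_counter_read_uint64()/oa_counter_read_float() do on the CPU in anv_perf_write_pass_results()); a GPU-side copy would have to re-express those equations in a shader, something of the shape

   // illustrative only, not an actual gen_perf equation
   value = 100.0 * (eu_active_accumulator / gpu_time_accumulator);

so for now keeping the readback on the CPU is the simpler option.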
+ */ + properties->allowCommandBufferQueryCopies = false; + break; + } + case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_POINT_CLIPPING_PROPERTIES: { VkPhysicalDevicePointClippingProperties *properties = (VkPhysicalDevicePointClippingProperties *) ext; diff --git a/src/intel/vulkan/anv_extensions.py b/src/intel/vulkan/anv_extensions.py index 926061bb997..dabe675971c 100644 --- a/src/intel/vulkan/anv_extensions.py +++ b/src/intel/vulkan/anv_extensions.py @@ -87,6 +87,7 @@ EXTENSIONS = [ Extension('VK_KHR_maintenance2', 1, True), Extension('VK_KHR_maintenance3', 1, True), Extension('VK_KHR_multiview', 1, True), + Extension('VK_KHR_performance_query', 1, 'device->use_softpin && device->perf && device->perf->i915_perf_version >= 3'), Extension('VK_KHR_pipeline_executable_properties', 1, True), Extension('VK_KHR_push_descriptor', 1, True), Extension('VK_KHR_relaxed_block_layout', 1, True), diff --git a/src/intel/vulkan/anv_perf.c b/src/intel/vulkan/anv_perf.c index 133315b2c8d..e8575b1bd70 100644 --- a/src/intel/vulkan/anv_perf.c +++ b/src/intel/vulkan/anv_perf.c @@ -26,17 +26,33 @@ #include #include "anv_private.h" +#include "vk_util.h" #include "perf/gen_perf.h" #include "perf/gen_perf_mdapi.h" +#include "util/mesa-sha1.h" + struct gen_perf_config * anv_get_perf(const struct gen_device_info *devinfo, int fd) { + /* We need self modifying batches. The i915 parser prevents it on + * Gen7.5 :( maybe one day. + */ + if (devinfo->gen < 8) + return NULL; + struct gen_perf_config *perf = gen_perf_new(NULL); gen_perf_init_metrics(perf, devinfo, fd, false /* pipeline statistics */); + if (!perf->n_queries) { + if (perf->platform_supported) + intel_logw("Performance support disabled, " + "consider sysctl dev.i915.perf_stream_paranoid=0\n"); + goto err; + } + /* We need DRM_I915_PERF_PROP_HOLD_PREEMPTION support, only available in * perf revision 2. 
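Holding preemption means another context cannot be scheduled in between a query's begin/end snapshots. The stream anv_device_perf_open() creates looks roughly like the sketch below (not the exact driver code; error handling elided, the OA exponent value is illustrative since we sample via MI_RPC rather than periodic reports):

   uint64_t properties[] = {
      DRM_I915_PERF_PROP_CTX_HANDLE,      device->context_id,
      DRM_I915_PERF_PROP_SAMPLE_OA,       true,
      DRM_I915_PERF_PROP_OA_METRICS_SET,  metric_id,
      DRM_I915_PERF_PROP_OA_FORMAT,       I915_OA_FORMAT_A32u40_A4u32_B8_C8,
      DRM_I915_PERF_PROP_OA_EXPONENT,     31,
      DRM_I915_PERF_PROP_HOLD_PREEMPTION, true,
   };
   struct drm_i915_perf_open_param param = {
      .flags = I915_PERF_FLAG_FD_CLOEXEC | I915_PERF_FLAG_FD_NONBLOCK,
      .num_properties = ARRAY_SIZE(properties) / 2,
      .properties_ptr = (uintptr_t) properties,
   };
   int stream_fd = gen_ioctl(device->fd, DRM_IOCTL_I915_PERF_OPEN, &param);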
*/ @@ -103,6 +119,7 @@ anv_device_perf_open(struct anv_device *device, uint64_t metric_id) return stream_fd; } +/* VK_INTEL_performance_query */ VkResult anv_InitializePerformanceApiINTEL( VkDevice _device, const VkInitializePerformanceApiInfoINTEL* pInitializeInfo) @@ -226,3 +243,175 @@ void anv_UninitializePerformanceApiINTEL( device->perf_fd = -1; } } + +/* VK_KHR_performance_query */ +static const VkPerformanceCounterUnitKHR +gen_perf_counter_unit_to_vk_unit[] = { + [GEN_PERF_COUNTER_UNITS_BYTES] = VK_PERFORMANCE_COUNTER_UNIT_BYTES_KHR, + [GEN_PERF_COUNTER_UNITS_HZ] = VK_PERFORMANCE_COUNTER_UNIT_HERTZ_KHR, + [GEN_PERF_COUNTER_UNITS_NS] = VK_PERFORMANCE_COUNTER_UNIT_NANOSECONDS_KHR, + [GEN_PERF_COUNTER_UNITS_US] = VK_PERFORMANCE_COUNTER_UNIT_NANOSECONDS_KHR, /* todo */ + [GEN_PERF_COUNTER_UNITS_PIXELS] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR, + [GEN_PERF_COUNTER_UNITS_TEXELS] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR, + [GEN_PERF_COUNTER_UNITS_THREADS] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR, + [GEN_PERF_COUNTER_UNITS_PERCENT] = VK_PERFORMANCE_COUNTER_UNIT_PERCENTAGE_KHR, + [GEN_PERF_COUNTER_UNITS_MESSAGES] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR, + [GEN_PERF_COUNTER_UNITS_NUMBER] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR, + [GEN_PERF_COUNTER_UNITS_CYCLES] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR, + [GEN_PERF_COUNTER_UNITS_EVENTS] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR, + [GEN_PERF_COUNTER_UNITS_UTILIZATION] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR, + [GEN_PERF_COUNTER_UNITS_EU_SENDS_TO_L3_CACHE_LINES] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR, + [GEN_PERF_COUNTER_UNITS_EU_ATOMIC_REQUESTS_TO_L3_CACHE_LINES] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR, + [GEN_PERF_COUNTER_UNITS_EU_REQUESTS_TO_L3_CACHE_LINES] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR, + [GEN_PERF_COUNTER_UNITS_EU_BYTES_PER_L3_CACHE_LINE] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR, +}; + +static const VkPerformanceCounterStorageKHR +gen_perf_counter_data_type_to_vk_storage[] = { + [GEN_PERF_COUNTER_DATA_TYPE_BOOL32] = VK_PERFORMANCE_COUNTER_STORAGE_UINT32_KHR, + [GEN_PERF_COUNTER_DATA_TYPE_UINT32] = VK_PERFORMANCE_COUNTER_STORAGE_UINT32_KHR, + [GEN_PERF_COUNTER_DATA_TYPE_UINT64] = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR, + [GEN_PERF_COUNTER_DATA_TYPE_FLOAT] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR, + [GEN_PERF_COUNTER_DATA_TYPE_DOUBLE] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT64_KHR, +}; + +VkResult anv_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR( + VkPhysicalDevice physicalDevice, + uint32_t queueFamilyIndex, + uint32_t* pCounterCount, + VkPerformanceCounterKHR* pCounters, + VkPerformanceCounterDescriptionKHR* pCounterDescriptions) +{ + ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice); + struct gen_perf_config *perf = pdevice->perf; + + uint32_t desc_count = *pCounterCount; + + VK_OUTARRAY_MAKE(out, pCounters, pCounterCount); + VK_OUTARRAY_MAKE(out_desc, pCounterDescriptions, &desc_count); + + for (int c = 0; c < (perf ? 
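/* The usual Vulkan two-call idiom applies here: call once with the output
 * arrays NULL to get the count, then again with arrays of that size. A
 * minimal sketch from the application side (queue family 0 assumed):
 *
 *   uint32_t count = 0;
 *   vkEnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
 *      phys_dev, 0, &count, NULL, NULL);
 *   VkPerformanceCounterKHR *counters = calloc(count, sizeof(*counters));
 *   VkPerformanceCounterDescriptionKHR *descs = calloc(count, sizeof(*descs));
 *   for (uint32_t i = 0; i < count; i++) {
 *      counters[i].sType = VK_STRUCTURE_TYPE_PERFORMANCE_COUNTER_KHR;
 *      descs[i].sType = VK_STRUCTURE_TYPE_PERFORMANCE_COUNTER_DESCRIPTION_KHR;
 *   }
 *   vkEnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
 *      phys_dev, 0, &count, counters, descs);
 *
 * which is why the loop below must cope with either output array being NULL
 * (the v5 fix), and why only the payload fields are written, never .sType
 * (the v14 fix).
 */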
perf->n_counters : 0); c++) { + const struct gen_perf_query_counter *gen_counter = perf->counters[c]; + + vk_outarray_append(&out, counter) { + counter->unit = gen_perf_counter_unit_to_vk_unit[gen_counter->units]; + counter->scope = VK_QUERY_SCOPE_COMMAND_KHR; + counter->storage = gen_perf_counter_data_type_to_vk_storage[gen_counter->data_type]; + + unsigned char sha1_result[20]; + _mesa_sha1_compute(gen_counter->symbol_name, + strlen(gen_counter->symbol_name), + sha1_result); + memcpy(counter->uuid, sha1_result, sizeof(counter->uuid)); + } + + vk_outarray_append(&out_desc, desc) { + desc->flags = 0; /* None so far. */ + snprintf(desc->name, sizeof(desc->name), "%s", gen_counter->name); + snprintf(desc->category, sizeof(desc->category), "%s", gen_counter->category); + snprintf(desc->description, sizeof(desc->description), "%s", gen_counter->desc); + } + } + + return vk_outarray_status(&out); +} + +void anv_GetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR( + VkPhysicalDevice physicalDevice, + const VkQueryPoolPerformanceCreateInfoKHR* pPerformanceQueryCreateInfo, + uint32_t* pNumPasses) +{ + ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice); + struct gen_perf_config *perf = pdevice->perf; + + if (!perf) { + *pNumPasses = 0; + return; + } + + *pNumPasses = gen_perf_get_n_passes(perf, + pPerformanceQueryCreateInfo->pCounterIndices, + pPerformanceQueryCreateInfo->counterIndexCount, + NULL); +} + +VkResult anv_AcquireProfilingLockKHR( + VkDevice _device, + const VkAcquireProfilingLockInfoKHR* pInfo) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + struct gen_perf_config *perf = device->physical->perf; + struct gen_perf_query_info *first_metric_set = &perf->queries[0]; + + assert(device->perf_fd == -1); + + int fd = anv_device_perf_open(device, first_metric_set->oa_metrics_set_id); + if (fd < 0) + return VK_TIMEOUT; + + device->perf_fd = fd; + return VK_SUCCESS; +} + +void anv_ReleaseProfilingLockKHR( + VkDevice _device) +{ + ANV_FROM_HANDLE(anv_device, device, _device); + + assert(device->perf_fd >= 0); + close(device->perf_fd); + device->perf_fd = -1; +} + +void +anv_perf_write_pass_results(struct gen_perf_config *perf, + struct anv_query_pool *pool, uint32_t pass, + const struct gen_perf_query_result *accumulated_results, + union VkPerformanceCounterResultKHR *results) +{ + for (uint32_t c = 0; c < pool->n_counters; c++) { + const struct gen_perf_counter_pass *counter_pass = &pool->counter_pass[c]; + + if (counter_pass->pass != pass) + continue; + + switch (pool->pass_query[pass]->kind) { + case GEN_PERF_QUERY_TYPE_PIPELINE: { + assert(counter_pass->counter->data_type == GEN_PERF_COUNTER_DATA_TYPE_UINT64); + uint32_t accu_offset = counter_pass->counter->offset / sizeof(uint64_t); + results[c].uint64 = accumulated_results->accumulator[accu_offset]; + break; + } + + case GEN_PERF_QUERY_TYPE_OA: + case GEN_PERF_QUERY_TYPE_RAW: + switch (counter_pass->counter->data_type) { + case GEN_PERF_COUNTER_DATA_TYPE_UINT64: + results[c].uint64 = + counter_pass->counter->oa_counter_read_uint64(perf, + counter_pass->query, + accumulated_results->accumulator); + break; + case GEN_PERF_COUNTER_DATA_TYPE_FLOAT: + results[c].float32 = + counter_pass->counter->oa_counter_read_float(perf, + counter_pass->query, + accumulated_results->accumulator); + break; + default: + /* So far we aren't using uint32, double or bool32... 
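so the default case below just asserts. If gen_perf ever grows such a counter, the switch would gain the matching read, along the lines of this sketch (assuming a hypothetical oa_counter_read_uint32 hook):

   case GEN_PERF_COUNTER_DATA_TYPE_UINT32:
      results[c].uint32 =
         counter_pass->counter->oa_counter_read_uint32(perf,
                                                       counter_pass->query,
                                                       accumulated_results->accumulator);
      break;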
*/ + unreachable("unexpected counter data type"); + } + break; + + default: + unreachable("invalid query type"); + } + + /* The Vulkan extension only has nanoseconds as a unit */ + if (counter_pass->counter->units == GEN_PERF_COUNTER_UNITS_US) { + assert(counter_pass->counter->data_type == GEN_PERF_COUNTER_DATA_TYPE_UINT64); + results[c].uint64 *= 1000; + } + } +} diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index 3a0563ae83c..fa44307457b 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -79,6 +79,8 @@ struct anv_instance; struct gen_aux_map_context; struct gen_perf_config; +struct gen_perf_counter_pass; +struct gen_perf_query_result; #include #include @@ -221,6 +223,12 @@ struct gen_perf_config; */ #define ANV_PREDICATE_RESULT_REG 0x2678 /* MI_ALU_REG15 */ +/* We reserve this MI ALU register to pass around an offset computed from + * VkPerformanceQuerySubmitInfoKHR::counterPassIndex of VK_KHR_performance_query. + * Other code which uses the MI ALU should leave it alone. + */ +#define ANV_PERF_QUERY_OFFSET_REG 0x2670 /* MI_ALU_REG14 */ + /* For gen12 we set the streamout buffers using 4 separate commands * (3DSTATE_SO_BUFFER_INDEX_*) instead of 3DSTATE_SO_BUFFER. However the layout * of the 3DSTATE_SO_BUFFER_INDEX_* commands is identical to that of @@ -1193,6 +1201,8 @@ struct anv_queue_submit { */ uintptr_t * fence_bos; + int perf_query_pass; + const VkAllocationCallbacks * alloc; VkSystemAllocationScope alloc_scope; @@ -1757,6 +1767,11 @@ _anv_combine_address(struct anv_batch *batch, void *location, _dst = NULL; \ })) +/* #define __gen_get_batch_dwords anv_batch_emit_dwords */ +/* #define __gen_get_batch_address anv_batch_address */ +/* #define __gen_address_value anv_address_physical */ +/* #define __gen_address_offset anv_address_add */ + struct anv_device_memory { struct vk_object_base base; @@ -2875,6 +2890,8 @@ struct anv_cmd_buffer { VkCommandBufferUsageFlags usage_flags; VkCommandBufferLevel level; + struct anv_query_pool *perf_query_pool; + struct anv_cmd_state state; struct anv_address return_addr; @@ -2898,7 +2915,8 @@ VkResult anv_cmd_buffer_execbuf(struct anv_queue *queue, const VkSemaphore *out_semaphores, const uint64_t *out_signal_values, uint32_t num_out_semaphores, - VkFence fence); + VkFence fence, + int perf_query_pass); VkResult anv_cmd_buffer_reset(struct anv_cmd_buffer *cmd_buffer); @@ -4227,6 +4245,9 @@ struct anv_render_pass { #define ANV_PIPELINE_STATISTICS_MASK 0x000007ff +#define OA_SNAPSHOT_SIZE (256) +#define ANV_KHR_PERF_QUERY_SIZE (ALIGN(sizeof(uint64_t), 64) + 2 * OA_SNAPSHOT_SIZE) + struct anv_query_pool { struct vk_object_base base; @@ -4237,8 +4258,21 @@ struct anv_query_pool { /** Number of slots in this query pool */ uint32_t slots; struct anv_bo * bo; + + /* Perf queries: */ + struct anv_bo reset_bo; + uint32_t n_counters; + struct gen_perf_counter_pass *counter_pass; + uint32_t n_passes; + struct gen_perf_query_info **pass_query; }; +static inline uint32_t khr_perf_query_preamble_offset(struct anv_query_pool *pool, + uint32_t pass) +{ + return pass * ANV_KHR_PERF_QUERY_SIZE + 8; +} + int anv_get_instance_entrypoint_index(const char *name); int anv_get_device_entrypoint_index(const char *name); int anv_get_physical_device_entrypoint_index(const char *name); @@ -4292,6 +4326,10 @@ anv_get_subpass_id(const struct anv_cmd_state * const cmd_state) struct gen_perf_config *anv_get_perf(const struct gen_device_info *devinfo, int fd); void anv_device_perf_init(struct anv_device *device); +void
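/* A reminder on the MI ALU GPR numbering behind ANV_PERF_QUERY_OFFSET_REG
 * above: the command streamer GPRs start at MMIO 0x2600 and are 8 bytes
 * each, so (a sketch of the arithmetic, not a define from this patch):
 *
 *   #define CS_GPR(n) (0x2600 + (n) * 8)
 *   // CS_GPR(14) == 0x2670  ->  ANV_PERF_QUERY_OFFSET_REG
 *   // CS_GPR(15) == 0x2678  ->  ANV_PREDICATE_RESULT_REG
 *
 * which is also why the gen_mi_builder instantiations later in this patch
 * drop GEN_MI_BUILDER_NUM_ALLOC_GPRS from 15 to 14.
 */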
anv_perf_write_pass_results(struct gen_perf_config *perf, + struct anv_query_pool *pool, uint32_t pass, + const struct gen_perf_query_result *accumulated_results, + union VkPerformanceCounterResultKHR *results); #define ANV_FROM_HANDLE(__anv_type, __name, __handle) \ VK_FROM_HANDLE(__anv_type, __name, __handle) diff --git a/src/intel/vulkan/anv_queue.c b/src/intel/vulkan/anv_queue.c index 009675e23ea..f6e3fdd6177 100644 --- a/src/intel/vulkan/anv_queue.c +++ b/src/intel/vulkan/anv_queue.c @@ -544,7 +544,7 @@ anv_queue_submit_add_timeline_signal(struct anv_queue_submit* submit, } static struct anv_queue_submit * -anv_queue_submit_alloc(struct anv_device *device) +anv_queue_submit_alloc(struct anv_device *device, int perf_query_pass) { const VkAllocationCallbacks *alloc = &device->vk.alloc; VkSystemAllocationScope alloc_scope = VK_SYSTEM_ALLOCATION_SCOPE_DEVICE; @@ -557,6 +557,7 @@ anv_queue_submit_alloc(struct anv_device *device) submit->alloc_scope = alloc_scope; submit->in_fence = -1; submit->out_fence = -1; + submit->perf_query_pass = perf_query_pass; return submit; } @@ -569,7 +570,7 @@ anv_queue_submit_simple_batch(struct anv_queue *queue, return VK_SUCCESS; struct anv_device *device = queue->device; - struct anv_queue_submit *submit = anv_queue_submit_alloc(device); + struct anv_queue_submit *submit = anv_queue_submit_alloc(device, -1); if (!submit) return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); @@ -720,12 +721,13 @@ anv_queue_submit(struct anv_queue *queue, const uint64_t *out_values, uint32_t num_out_semaphores, struct anv_bo *wsi_signal_bo, - VkFence _fence) + VkFence _fence, + int perf_query_pass) { ANV_FROM_HANDLE(anv_fence, fence, _fence); struct anv_device *device = queue->device; UNUSED struct anv_physical_device *pdevice = device->physical; - struct anv_queue_submit *submit = anv_queue_submit_alloc(device); + struct anv_queue_submit *submit = anv_queue_submit_alloc(device, perf_query_pass); if (!submit) return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); @@ -972,7 +974,7 @@ VkResult anv_QueueSubmit( * common case. */ result = anv_queue_submit(queue, NULL, NULL, NULL, 0, NULL, NULL, 0, - NULL, fence); + NULL, fence, -1); goto out; } @@ -990,6 +992,9 @@ VkResult anv_QueueSubmit( const VkTimelineSemaphoreSubmitInfoKHR *timeline_info = vk_find_struct_const(pSubmits[i].pNext, TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR); + const VkPerformanceQuerySubmitInfoKHR *perf_info = + vk_find_struct_const(pSubmits[i].pNext, + PERFORMANCE_QUERY_SUBMIT_INFO_KHR); const uint64_t *wait_values = timeline_info && timeline_info->waitSemaphoreValueCount ? timeline_info->pWaitSemaphoreValues : NULL; @@ -1011,7 +1016,8 @@ VkResult anv_QueueSubmit( signal_values, pSubmits[i].signalSemaphoreCount, wsi_signal_bo, - submit_fence); + submit_fence, + -1); if (result != VK_SUCCESS) goto out; @@ -1049,7 +1055,8 @@ VkResult anv_QueueSubmit( result = anv_queue_submit(queue, cmd_buffer, in_semaphores, in_values, num_in_semaphores, out_semaphores, out_values, num_out_semaphores, - wsi_signal_bo, execbuf_fence); + wsi_signal_bo, execbuf_fence, + perf_info ? 
perf_info->counterPassIndex : 0); if (result != VK_SUCCESS) goto out; } diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index 50670d64a89..e1389699750 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -34,8 +34,11 @@ #include "genxml/gen_macros.h" #include "genxml/genX_pack.h" -/* We reserve GPR 15 for conditional rendering */ -#define GEN_MI_BUILDER_NUM_ALLOC_GPRS 15 +/* We reserve : + * - GPR 14 for secondary command buffer returns + * - GPR 15 for conditional rendering + */ +#define GEN_MI_BUILDER_NUM_ALLOC_GPRS 14 #define __gen_get_batch_dwords anv_batch_emit_dwords #define __gen_address_offset anv_address_add #include "common/gen_mi_builder.h" @@ -1755,6 +1758,11 @@ genX(CmdExecuteCommands)( } anv_cmd_buffer_add_secondary(primary, secondary); + + assert(secondary->perf_query_pool == NULL || primary->perf_query_pool == NULL || + secondary->perf_query_pool == primary->perf_query_pool); + if (secondary->perf_query_pool) + primary->perf_query_pool = secondary->perf_query_pool; } /* The secondary isn't counted in our VF cache tracking so we need to diff --git a/src/intel/vulkan/genX_query.c b/src/intel/vulkan/genX_query.c index 17ccfc66dc9..3fd662cc062 100644 --- a/src/intel/vulkan/genX_query.c +++ b/src/intel/vulkan/genX_query.c @@ -32,16 +32,32 @@ #include "genxml/gen_macros.h" #include "genxml/genX_pack.h" -/* We reserve GPR 15 for conditional rendering */ -#define GEN_MI_BUILDER_NUM_ALLOC_GPRS 15 +/* We reserve : + * - GPR 14 for perf queries + * - GPR 15 for conditional rendering + */ +#define GEN_MI_BUILDER_NUM_ALLOC_GPRS 14 +#define GEN_MI_BUILDER_CAN_WRITE_BATCH GEN_GEN >= 8 #define __gen_get_batch_dwords anv_batch_emit_dwords #define __gen_address_offset anv_address_add +#define __gen_get_batch_address(b, a) anv_address_physical(anv_batch_address(b, a)) #include "common/gen_mi_builder.h" #include "perf/gen_perf.h" #include "perf/gen_perf_mdapi.h" #define OA_REPORT_N_UINT64 (256 / sizeof(uint64_t)) +#include "vk_util.h" + +static struct anv_address +anv_query_address(struct anv_query_pool *pool, uint32_t query) +{ + return (struct anv_address) { + .bo = pool->bo, + .offset = query * pool->stride, + }; +} + VkResult genX(CreateQueryPool)( VkDevice _device, const VkQueryPoolCreateInfo* pCreateInfo, @@ -50,7 +66,11 @@ VkResult genX(CreateQueryPool)( { ANV_FROM_HANDLE(anv_device, device, _device); const struct anv_physical_device *pdevice = device->physical; + const VkQueryPoolPerformanceCreateInfoKHR *perf_query_info = NULL; struct anv_query_pool *pool; + struct gen_perf_counter_pass *counter_pass; + struct gen_perf_query_info **pass_query; + ANV_MULTIALLOC(ma); VkResult result; assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO); @@ -65,17 +85,20 @@ VkResult genX(CreateQueryPool)( * 64bytes so we put those first and have the "available" bit behind * together with some other counters. */ - uint32_t uint64s_per_slot = 1; + uint32_t uint64s_per_slot = 0; + UNUSED uint32_t n_passes = 0; + + anv_multialloc_add(&ma, &pool, 1); VkQueryPipelineStatisticFlags pipeline_statistics = 0; switch (pCreateInfo->queryType) { case VK_QUERY_TYPE_OCCLUSION: /* Occlusion queries have two values: begin and end. 
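Each non-perf slot type below now spells out its single leading availability qword explicitly (the "1 +"); for example an occlusion slot works out to (1 + 2) * sizeof(uint64_t) = 24 bytes of stride, while the KHR perf type computes its slot size from ANV_KHR_PERF_QUERY_SIZE and the pass count instead.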
*/ - uint64s_per_slot += 2; + uint64s_per_slot = 1 + 2; break; case VK_QUERY_TYPE_TIMESTAMP: /* Timestamps just have the one timestamp value */ - uint64s_per_slot += 1; + uint64s_per_slot = 1 + 1; break; case VK_QUERY_TYPE_PIPELINE_STATISTICS: pipeline_statistics = pCreateInfo->pipelineStatistics; @@ -85,25 +108,36 @@ VkResult genX(CreateQueryPool)( pipeline_statistics &= ANV_PIPELINE_STATISTICS_MASK; /* Statistics queries have a min and max for every statistic */ - uint64s_per_slot += 2 * util_bitcount(pipeline_statistics); + uint64s_per_slot = 1 + 2 * util_bitcount(pipeline_statistics); break; case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: /* Transform feedback queries are 4 values, begin/end for * written/available. */ - uint64s_per_slot += 4; + uint64s_per_slot = 1 + 4; break; - case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: { + case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: uint64s_per_slot = 72; /* 576 bytes, see layout below */ break; - } + case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: + perf_query_info = vk_find_struct_const(pCreateInfo->pNext, + QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR); + n_passes = gen_perf_get_n_passes(pdevice->perf, + perf_query_info->pCounterIndices, + perf_query_info->counterIndexCount, + NULL); + anv_multialloc_add(&ma, &counter_pass, perf_query_info->counterIndexCount); + anv_multialloc_add(&ma, &pass_query, n_passes); + STATIC_ASSERT(ANV_KHR_PERF_QUERY_SIZE % sizeof(uint64_t) == 0); + uint64s_per_slot = (ANV_KHR_PERF_QUERY_SIZE / sizeof(uint64_t)) * n_passes; + break; default: assert(!"Invalid query type"); } - pool = vk_alloc2(&device->vk.alloc, pAllocator, sizeof(*pool), 8, - VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); - if (pool == NULL) + if (!anv_multialloc_alloc2(&ma, &device->vk.alloc, + pAllocator, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT)) return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); vk_object_base_init(&device->vk, &pool->base, VK_OBJECT_TYPE_QUERY_POOL); @@ -112,6 +146,21 @@ VkResult genX(CreateQueryPool)( pool->stride = uint64s_per_slot * sizeof(uint64_t); pool->slots = pCreateInfo->queryCount; + if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) { + pool->n_counters = perf_query_info->counterIndexCount; + pool->counter_pass = counter_pass; + gen_perf_get_counters_passes(pdevice->perf, + perf_query_info->pCounterIndices, + perf_query_info->counterIndexCount, + pool->counter_pass); + pool->n_passes = n_passes; + pool->pass_query = pass_query; + gen_perf_get_n_passes(pdevice->perf, + perf_query_info->pCounterIndices, + perf_query_info->counterIndexCount, + pool->pass_query); + } + uint32_t bo_flags = 0; if (pdevice->supports_48bit_addresses) bo_flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS; @@ -131,6 +180,23 @@ VkResult genX(CreateQueryPool)( if (result != VK_SUCCESS) goto fail; + if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) { + for (uint32_t p = 0; p < pool->n_passes; p++) { + struct gen_mi_builder b; + struct anv_batch batch = { + .start = pool->bo->map + ANV_KHR_PERF_QUERY_SIZE * p + 8, + .end = pool->bo->map + ANV_KHR_PERF_QUERY_SIZE * p + 64, + }; + batch.next = batch.start; + + gen_mi_builder_init(&b, &batch); + gen_mi_store(&b, gen_mi_reg64(ANV_PERF_QUERY_OFFSET_REG), + gen_mi_imm(p * ANV_KHR_PERF_QUERY_SIZE)); + anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe); + assert(batch.next <= (pool->bo->map + ANV_KHR_PERF_QUERY_SIZE * p + 64)); + } + } + *pQueryPool = anv_query_pool_to_handle(pool); return VK_SUCCESS; @@ -157,15 +223,73 @@ void genX(DestroyQueryPool)( vk_free2(&device->vk.alloc, pAllocator, pool); } -static struct anv_address 
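/* (anv_query_address itself just moved earlier in the file, unchanged, so
 * the new KHR helpers can sit next to the layout description that follows.)
 * For a KHR pool, the per-pass slice size works out as follows, a sketch of
 * the ANV_KHR_PERF_QUERY_SIZE arithmetic:
 *
 *   ALIGN(sizeof(uint64_t), 64)  ->  64 bytes (availability + preamble batch)
 *   2 * OA_SNAPSHOT_SIZE         -> 512 bytes (begin + end MI_RPC reports)
 *   total                        -> 576 bytes per pass
 *
 * so pool->stride == 576 * n_passes, and khr_perf_query_preamble_offset()
 * lands 8 bytes into each slice, right behind the availability qword.
 */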
-anv_query_address(struct anv_query_pool *pool, uint32_t query) +/** + * VK_KHR_performance_query layout (576 bytes * number of passes) : + * + * ----------------------------------------- + * | availability (8b) | | | + * |----------------------------| | | + * | Small batch loading | | | + * | ANV_PERF_QUERY_OFFSET_REG | | | + * | (56b) | | Pass 0 | + * |----------------------------| | | + * | begin MI_RPC (256b) | | | + * |----------------------------| | | + * | end MI_RPC (256b) | | | + * |----------------------------|-- | Query 0 + * | availability (8b) | | | + * |----------------------------| | | + * | Small batch loading | | | + * | ANV_PERF_QUERY_OFFSET_REG | | | + * | (56b) | | Pass 1 | + * |----------------------------| | | + * | begin MI_RPC (256b) | | | + * |----------------------------| | | + * | end MI_RPC (256b) | | | + * |----------------------------|----------- + * | availability (8b) | | | + * |----------------------------| | | + * | Unused (48b) | | | + * |----------------------------| | Pass 0 | + * | begin MI_RPC (256b) | | | + * |----------------------------| | | Query 1 + * | end MI_RPC (256b) | | | + * |----------------------------|-- | + * | ... | | | + * ----------------------------------------- + */ +UNUSED static uint64_t +khr_perf_query_availability_offset(struct anv_query_pool *pool, uint32_t query, uint32_t pass) { - return (struct anv_address) { - .bo = pool->bo, - .offset = query * pool->stride, - }; + return query * (pool->n_passes * ANV_KHR_PERF_QUERY_SIZE) + + pass * ANV_KHR_PERF_QUERY_SIZE; +} + +UNUSED static uint64_t +khr_perf_query_oa_offset(struct anv_query_pool *pool, uint32_t query, uint32_t pass, bool end) +{ + return query * (pool->n_passes * ANV_KHR_PERF_QUERY_SIZE) + + pass * ANV_KHR_PERF_QUERY_SIZE + + 64 + (end ? OA_SNAPSHOT_SIZE : 0); +} + +UNUSED static struct anv_address +khr_perf_query_availability_address(struct anv_query_pool *pool, uint32_t query, uint32_t pass) +{ + return anv_address_add( + (struct anv_address) { .bo = pool->bo, }, + khr_perf_query_availability_offset(pool, query, pass)); } +UNUSED static struct anv_address +khr_perf_query_oa_address(struct anv_query_pool *pool, uint32_t query, uint32_t pass, bool end) +{ + return anv_address_add( + (struct anv_address) { .bo = pool->bo, }, + khr_perf_query_oa_offset(pool, query, pass, end)); +} + + /** * VK_INTEL_performance_query layout (576 bytes) : * @@ -238,7 +362,17 @@ query_slot(struct anv_query_pool *pool, uint32_t query) static bool query_is_available(struct anv_query_pool *pool, uint32_t query) { - return *(volatile uint64_t *)query_slot(pool, query); + if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) { + for (uint32_t p = 0; p < pool->n_passes; p++) { + volatile uint64_t *slot = + pool->bo->map + khr_perf_query_availability_offset(pool, query, p); + if (!slot[0]) + return false; + } + return true; + } else { + return *(volatile uint64_t *)query_slot(pool, query); + } } static VkResult @@ -275,6 +409,7 @@ VkResult genX(GetQueryPoolResults)( pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS || pool->type == VK_QUERY_TYPE_TIMESTAMP || pool->type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT || + pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR || pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL); if (anv_device_is_lost(device)) @@ -305,6 +440,12 @@ VkResult genX(GetQueryPoolResults)( * and vkGetQueryPoolResults returns VK_NOT_READY. However, * availability state is still written to pData for those queries if * VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set." 
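(For VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR the availability/partial flags are asserted away below; the caller simply sizes pData for one VkPerformanceCounterResultKHR per selected counter, e.g. a sketch of the application side, with n_counters standing in for the pool's counter count:

   VkPerformanceCounterResultKHR results[n_counters];
   vkGetQueryPoolResults(device, pool, 0, 1, sizeof(results), results,
                         sizeof(results), VK_QUERY_RESULT_WAIT_BIT);
)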
+ * + * From VK_KHR_performance_query : + * + * "VK_QUERY_RESULT_PERFORMANCE_QUERY_RECORDED_COUNTERS_BIT_KHR specifies + * that the result should contain the number of counters that were recorded + * into a query pool of type ename:VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR" */ bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT); @@ -367,6 +508,23 @@ VkResult genX(GetQueryPoolResults)( break; } +#if GEN_GEN >= 8 + case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: { + const struct anv_physical_device *pdevice = device->physical; + assert((flags & (VK_QUERY_RESULT_WITH_AVAILABILITY_BIT | + VK_QUERY_RESULT_PARTIAL_BIT)) == 0); + for (uint32_t p = 0; p < pool->n_passes; p++) { + const uint32_t *begin = pool->bo->map + khr_perf_query_oa_offset(pool, firstQuery + i, p, false); + const uint32_t *end = pool->bo->map + khr_perf_query_oa_offset(pool, firstQuery + i, p, true); + struct gen_perf_query_result result; + gen_perf_query_result_clear(&result); + gen_perf_query_result_accumulate(&result, pool->pass_query[p], begin, end); + anv_perf_write_pass_results(pdevice->perf, pool, p, &result, pData); + } + break; + } +#endif + case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: { if (!write_results) break; @@ -503,6 +661,23 @@ emit_zero_queries(struct anv_cmd_buffer *cmd_buffer, } break; +#if GEN_GEN >= 8 + case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: { + for (uint32_t i = 0; i < num_queries; i++) { + for (uint32_t p = 0; p < pool->n_passes; p++) { + gen_mi_memset(b, + khr_perf_query_oa_address(pool, + first_index + i, p, false), + 0, 2 * OA_SNAPSHOT_SIZE); + emit_query_mi_availability(b, + khr_perf_query_availability_address(pool, first_index + i, p), + true); + } + } + break; + } +#endif + case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: for (uint32_t i = 0; i < num_queries; i++) { struct anv_address slot_addr = @@ -546,6 +721,23 @@ void genX(CmdResetQueryPool)( break; } +#if GEN_GEN >= 8 + case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: { + struct gen_mi_builder b; + gen_mi_builder_init(&b, &cmd_buffer->batch); + + for (uint32_t i = 0; i < queryCount; i++) { + for (uint32_t p = 0; p < pool->n_passes; p++) { + emit_query_mi_availability( + &b, + khr_perf_query_availability_address(pool, firstQuery + i, p), + false); + } + } + break; + } +#endif + case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: { struct gen_mi_builder b; gen_mi_builder_init(&b, &cmd_buffer->batch); @@ -569,8 +761,16 @@ void genX(ResetQueryPool)( ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); for (uint32_t i = 0; i < queryCount; i++) { - uint64_t *slot = query_slot(pool, firstQuery + i); - *slot = 0; + if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) { + for (uint32_t p = 0; p < pool->n_passes; p++) { + uint64_t *pass_slot = pool->bo->map + + khr_perf_query_availability_offset(pool, firstQuery + i, p); + *pass_slot = 0; + } + } else { + uint64_t *slot = query_slot(pool, firstQuery + i); + *slot = 0; + } } } @@ -665,6 +865,41 @@ void genX(CmdBeginQueryIndexedEXT)( emit_xfb_query(&b, index, anv_address_add(query_addr, 8)); break; +#if GEN_GEN >= 8 + case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: { + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.CommandStreamerStallEnable = true; + pc.StallAtPixelScoreboard = true; + } + cmd_buffer->perf_query_pool = pool; + + /* We know the bottom bits of the address are 0s which match what we + * want in the MI_RPC packet. 
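(Both the per-query and per-pass strides are multiples of 64 bytes and MI_REPORT_PERF_COUNT only takes a 64-byte aligned address, so adding the pass offset held in ANV_PERF_QUERY_OFFSET_REG can never carry into the packet's low control bits.) What follows is the self-modifying sequence: compute the destination address in the MI ALU, store it over the not-yet-executed MI_RPC's MemoryAddress dwords, then fence with gen_mi_self_mod_barrier() so the command streamer re-fetches the patched packet. Schematically:

   write_addr = canonical(bo.offset + oa_offset(query, pass 0, begin))
              + GPR14                 // GPR14 = pass * ANV_KHR_PERF_QUERY_SIZE
   MI_RPC.MemoryAddress <- write_addr // patched in place, then barrier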
+ */ + struct gen_mi_value mi_rpc_write_offset = + gen_mi_iadd( + &b, + gen_mi_imm( + gen_canonical_address( + pool->bo->offset + + khr_perf_query_oa_offset(pool, query, 0 /* pass */, false))), + gen_mi_reg64(ANV_PERF_QUERY_OFFSET_REG)); + struct gen_mi_address_token mi_rpc_addr_dest = + gen_mi_store_address(&b, mi_rpc_write_offset); + gen_mi_self_mod_barrier(&b); + + void *mi_rpc_dws = + anv_batch_emitn(&cmd_buffer->batch, + GENX(MI_REPORT_PERF_COUNT_length), + GENX(MI_REPORT_PERF_COUNT), + .MemoryAddress = query_addr /* Will be overwritten */ ); + _gen_mi_resolve_address_token(&b, mi_rpc_addr_dest, + mi_rpc_dws + + GENX(MI_REPORT_PERF_COUNT_MemoryAddress_start) / 8); + break; + } +#endif + case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: { anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { pc.CommandStreamerStallEnable = true; @@ -757,6 +992,60 @@ void genX(CmdEndQueryIndexedEXT)( emit_query_mi_availability(&b, query_addr, true); break; +#if GEN_GEN >= 8 + case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: { + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.CommandStreamerStallEnable = true; + pc.StallAtPixelScoreboard = true; + } + + /* We know the bottom bits of the address are 0s which match what we + * want in the MI_RPC/MI_SDI packets. + */ + struct gen_mi_value mi_rpc_write_offset = + gen_mi_iadd( + &b, + gen_mi_imm( + gen_canonical_address( + pool->bo->offset + + khr_perf_query_oa_offset(pool, query, 0 /* pass*/, true))), + gen_mi_reg64(ANV_PERF_QUERY_OFFSET_REG)); + struct gen_mi_value availability_write_offset = + gen_mi_iadd( + &b, + gen_mi_imm( + gen_canonical_address( + pool->bo->offset + + khr_perf_query_availability_offset(pool, query, 0 /* pass */))), + gen_mi_reg64(ANV_PERF_QUERY_OFFSET_REG)); + + struct gen_mi_address_token mi_rpc_addr_dest = + gen_mi_store_address(&b, mi_rpc_write_offset); + struct gen_mi_address_token availability_addr_dest = + gen_mi_store_address(&b, availability_write_offset); + gen_mi_self_mod_barrier(&b); + + void *mi_rpc_dws = + anv_batch_emitn(&cmd_buffer->batch, + GENX(MI_REPORT_PERF_COUNT_length), + GENX(MI_REPORT_PERF_COUNT), + .MemoryAddress = query_addr /* Will be overwritten */ ); + _gen_mi_resolve_address_token(&b, mi_rpc_addr_dest, + mi_rpc_dws + + GENX(MI_REPORT_PERF_COUNT_MemoryAddress_start) / 8); + + void *availability_dws = + anv_batch_emitn(&cmd_buffer->batch, + GENX(MI_STORE_DATA_IMM_length), + GENX(MI_STORE_DATA_IMM), + .ImmediateData = true); + _gen_mi_resolve_address_token(&b, availability_addr_dest, + availability_dws + + GENX(MI_STORE_DATA_IMM_Address_start) / 8); + break; + } +#endif + case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: { anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { pc.CommandStreamerStallEnable = true; @@ -1039,6 +1328,12 @@ void genX(CmdCopyQueryPoolResults)( gpu_write_query_result(&b, dest_addr, flags, 0, result); break; +#if GEN_GEN >= 8 + case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: + unreachable("Copy KHR performance query results not implemented"); + break; +#endif + default: unreachable("unhandled query type"); }
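For reference, the application-side flow this patch enables looks roughly like the sketch below (assumes a device created with the performanceCounterQueryPools and hostQueryReset features enabled and counter indices already chosen from enumeration; phys_dev, device, queue, cmd, selected and n_selected are placeholder names; error handling elided). Since allowCommandBufferQueryCopies is false, results must be read back with vkGetQueryPoolResults() rather than vkCmdCopyQueryPoolResults():

   VkQueryPoolPerformanceCreateInfoKHR perf_info = {
      .sType = VK_STRUCTURE_TYPE_QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR,
      .queueFamilyIndex = 0,
      .counterIndexCount = n_selected,
      .pCounterIndices = selected,
   };
   uint32_t n_passes;
   vkGetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR(phys_dev,
                                                           &perf_info,
                                                           &n_passes);

   VkQueryPoolCreateInfo pool_info = {
      .sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO,
      .pNext = &perf_info,
      .queryType = VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR,
      .queryCount = 1,
   };
   VkQueryPool pool;
   vkCreateQueryPool(device, &pool_info, NULL, &pool);
   vkResetQueryPool(device, pool, 0, 1);   /* host reset, wired up above */

   VkAcquireProfilingLockInfoKHR lock_info = {
      .sType = VK_STRUCTURE_TYPE_ACQUIRE_PROFILING_LOCK_INFO_KHR,
      .timeout = UINT64_MAX,
   };
   vkAcquireProfilingLockKHR(device, &lock_info);

   /* Record begin/end around the workload once... */
   vkCmdBeginQuery(cmd, pool, 0, 0);
   /* ... workload ... */
   vkCmdEndQuery(cmd, pool, 0);

   /* ... then submit the same command buffer once per pass; the driver
    * reconfigures the OA unit between passes via the preamble batch.
    */
   for (uint32_t pass = 0; pass < n_passes; pass++) {
      VkPerformanceQuerySubmitInfoKHR pass_info = {
         .sType = VK_STRUCTURE_TYPE_PERFORMANCE_QUERY_SUBMIT_INFO_KHR,
         .counterPassIndex = pass,
      };
      VkSubmitInfo submit = {
         .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
         .pNext = &pass_info,
         .commandBufferCount = 1,
         .pCommandBuffers = &cmd,
      };
      vkQueueSubmit(queue, 1, &submit, VK_NULL_HANDLE);
      vkQueueWaitIdle(queue);
   }

   VkPerformanceCounterResultKHR *results =
      calloc(n_selected, sizeof(*results));
   vkGetQueryPoolResults(device, pool, 0, 1,
                         n_selected * sizeof(*results), results,
                         n_selected * sizeof(*results),
                         VK_QUERY_RESULT_WAIT_BIT);
   vkReleaseProfilingLockKHR(device);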