anv: Properly handle host query reset of performance queries
[mesa.git] / src / intel / vulkan / genX_query.c
1 /*
2 * Copyright © 2015 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include <assert.h>
25 #include <stdbool.h>
26 #include <string.h>
27 #include <unistd.h>
28 #include <fcntl.h>
29
30 #include "anv_private.h"
31
32 #include "genxml/gen_macros.h"
33 #include "genxml/genX_pack.h"
34
35 /* We reserve GPR 14 and 15 for conditional rendering */
36 #define GEN_MI_BUILDER_NUM_ALLOC_GPRS 14
37 #define __gen_get_batch_dwords anv_batch_emit_dwords
38 #define __gen_address_offset anv_address_add
39 #include "common/gen_mi_builder.h"
40 #include "perf/gen_perf.h"
41 #include "perf/gen_perf_mdapi.h"
42
43 #define OA_REPORT_N_UINT64 (256 / sizeof(uint64_t))
44
45 VkResult genX(CreateQueryPool)(
46 VkDevice _device,
47 const VkQueryPoolCreateInfo* pCreateInfo,
48 const VkAllocationCallbacks* pAllocator,
49 VkQueryPool* pQueryPool)
50 {
51 ANV_FROM_HANDLE(anv_device, device, _device);
52 const struct anv_physical_device *pdevice = &device->instance->physicalDevice;
53 struct anv_query_pool *pool;
54 VkResult result;
55
56 assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
57
58 /* Query pool slots are made up of some number of 64-bit values packed
59 * tightly together. For most query types, the first 64-bit value is
60 * the "available" bit which is 0 when the query is unavailable and 1 when
61 * it is available. The 64-bit values that follow are determined by the
62 * type of query.
63 *
64 * For performance queries, the OA reports must be aligned to 64 bytes, so
65 * the "available" bit comes first together with some other small values,
66 * and the MI_RPC reports are placed at 64-byte aligned offsets in the slot.
67 */
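/* As an illustration (derived from the switch below, not normative), the
 * resulting per-slot layouts in 64-bit words are:
 *
 *    OCCLUSION           : availability | begin | end                      (3 qwords)
 *    TIMESTAMP           : availability | timestamp                        (2 qwords)
 *    PIPELINE_STATISTICS : availability | begin/end pair per enabled stat  (1 + 2 * N qwords)
 *    XFB_STREAM          : availability | prims-written begin/end |
 *                          storage-needed begin/end                        (5 qwords)
 */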
68 uint32_t uint64s_per_slot = 1;
69
70 VkQueryPipelineStatisticFlags pipeline_statistics = 0;
71 switch (pCreateInfo->queryType) {
72 case VK_QUERY_TYPE_OCCLUSION:
73 /* Occlusion queries have two values: begin and end. */
74 uint64s_per_slot += 2;
75 break;
76 case VK_QUERY_TYPE_TIMESTAMP:
77 /* Timestamps just have the one timestamp value */
78 uint64s_per_slot += 1;
79 break;
80 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
81 pipeline_statistics = pCreateInfo->pipelineStatistics;
82 /* We're going to trust this field implicitly so we need to ensure that
83 * no unhandled extension bits leak in.
84 */
85 pipeline_statistics &= ANV_PIPELINE_STATISTICS_MASK;
86
87 /* Statistics queries have a min and max for every statistic */
88 uint64s_per_slot += 2 * util_bitcount(pipeline_statistics);
89 break;
90 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
91 /* Transform feedback queries are 4 values, begin/end for
92 * written/available.
93 */
94 uint64s_per_slot += 4;
95 break;
96 case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
97 uint64s_per_slot = 72; /* 576 bytes, see layout below */
98 break;
99 }
100 default:
101 assert(!"Invalid query type");
102 }
103
104 pool = vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
105 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
106 if (pool == NULL)
107 return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
108
109 pool->type = pCreateInfo->queryType;
110 pool->pipeline_statistics = pipeline_statistics;
111 pool->stride = uint64s_per_slot * sizeof(uint64_t);
112 pool->slots = pCreateInfo->queryCount;
113
114 uint32_t bo_flags = 0;
115 if (pdevice->supports_48bit_addresses)
116 bo_flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
117
118 if (pdevice->use_softpin)
119 bo_flags |= EXEC_OBJECT_PINNED;
120
121 if (pdevice->has_exec_async)
122 bo_flags |= EXEC_OBJECT_ASYNC;
123
124 uint64_t size = pool->slots * pool->stride;
125 result = anv_device_alloc_bo(device, size,
126 ANV_BO_ALLOC_MAPPED |
127 ANV_BO_ALLOC_SNOOPED,
128 &pool->bo);
129 if (result != VK_SUCCESS)
130 goto fail;
131
132 *pQueryPool = anv_query_pool_to_handle(pool);
133
134 return VK_SUCCESS;
135
136 fail:
137 vk_free2(&device->alloc, pAllocator, pool);
138
139 return result;
140 }
141
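/* For reference, a hypothetical application-side creation of a pool that this
 * entry point would service (illustrative sketch, not driver code):
 *
 *    VkQueryPoolCreateInfo info = {
 *       .sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO,
 *       .queryType = VK_QUERY_TYPE_PIPELINE_STATISTICS,
 *       .queryCount = 16,
 *       .pipelineStatistics =
 *          VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT |
 *          VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT,
 *    };
 *    VkQueryPool pool;
 *    vkCreateQueryPool(device, &info, NULL, &pool);
 *
 * With two statistics enabled, uint64s_per_slot = 1 + 2 * 2 = 5, so each slot
 * occupies 40 bytes of the pool BO.
 */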
142 void genX(DestroyQueryPool)(
143 VkDevice _device,
144 VkQueryPool _pool,
145 const VkAllocationCallbacks* pAllocator)
146 {
147 ANV_FROM_HANDLE(anv_device, device, _device);
148 ANV_FROM_HANDLE(anv_query_pool, pool, _pool);
149
150 if (!pool)
151 return;
152
153 anv_device_release_bo(device, pool->bo);
154 vk_free2(&device->alloc, pAllocator, pool);
155 }
156
157 static struct anv_address
158 anv_query_address(struct anv_query_pool *pool, uint32_t query)
159 {
160 return (struct anv_address) {
161 .bo = pool->bo,
162 .offset = query * pool->stride,
163 };
164 }
165
166 /**
167 * VK_INTEL_performance_query layout (576 bytes) :
168 *
169 * ------------------------------
170 * | availability (8b) |
171 * |----------------------------|
172 * | marker (8b) |
173 * |----------------------------|
174 * | begin RPSTAT register (4b) |
175 * |----------------------------|
176 * | end RPSTAT register (4b) |
177 * |----------------------------|
178 * | begin perfcntr 1 & 2 (16b) |
179 * |----------------------------|
180 * | end perfcntr 1 & 2 (16b) |
181 * |----------------------------|
182 * | Unused (8b) |
183 * |----------------------------|
184 * | begin MI_RPC (256b) |
185 * |----------------------------|
186 * | end MI_RPC (256b) |
187 * ------------------------------
188 */
189
190 static uint32_t
191 intel_perf_marker_offset(void)
192 {
193 return 8;
194 }
195
196 static uint32_t
197 intel_perf_rpstart_offset(bool end)
198 {
199 return 16 + (end ? sizeof(uint32_t) : 0);
200 }
201
202 static uint32_t
203 intel_perf_counter(bool end)
204 {
205 return 24 + (end ? (2 * sizeof(uint64_t)) : 0);
206 }
207
208 static uint32_t
209 intel_perf_mi_rpc_offset(bool end)
210 {
211 return 64 + (end ? 256 : 0);
212 }
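/* Putting the helpers above together, the absolute byte offsets within a
 * 576-byte slot work out to (illustrative summary of the layout diagram):
 *
 *    availability  : 0
 *    marker        : 8
 *    RPSTAT        : begin 16, end 20
 *    perfcntr 1&2  : begin 24, end 40
 *    unused        : 56
 *    MI_RPC report : begin 64, end 320 (each report is 256 bytes, ending at 576)
 */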
213
214 static void
215 cpu_write_query_result(void *dst_slot, VkQueryResultFlags flags,
216 uint32_t value_index, uint64_t result)
217 {
218 if (flags & VK_QUERY_RESULT_64_BIT) {
219 uint64_t *dst64 = dst_slot;
220 dst64[value_index] = result;
221 } else {
222 uint32_t *dst32 = dst_slot;
223 dst32[value_index] = result;
224 }
225 }
226
227 static void *
228 query_slot(struct anv_query_pool *pool, uint32_t query)
229 {
230 return pool->bo->map + query * pool->stride;
231 }
232
233 static bool
234 query_is_available(struct anv_query_pool *pool, uint32_t query)
235 {
236 return *(volatile uint64_t *)query_slot(pool, query);
237 }
238
239 static VkResult
240 wait_for_available(struct anv_device *device,
241 struct anv_query_pool *pool, uint32_t query)
242 {
243 while (true) {
244 if (query_is_available(pool, query))
245 return VK_SUCCESS;
246
247 int ret = anv_gem_busy(device, pool->bo->gem_handle);
248 if (ret == 1) {
249 /* The BO is still busy, keep waiting. */
250 continue;
251 } else if (ret == -1) {
252 /* We don't know the real error. */
253 return anv_device_set_lost(device, "gem wait failed: %m");
254 } else {
255 assert(ret == 0);
256 /* The BO is no longer busy. */
257 if (query_is_available(pool, query)) {
258 return VK_SUCCESS;
259 } else {
260 VkResult status = anv_device_query_status(device);
261 if (status != VK_SUCCESS)
262 return status;
263
264 /* If we haven't seen availability yet, then we never will. This
265 * can only happen if we have a client error where they call
266 * GetQueryPoolResults on a query that they haven't submitted to
267 * the GPU yet. The spec allows us to do anything in this case,
268 * but returning VK_SUCCESS doesn't seem right and we shouldn't
269 * just keep spinning.
270 */
271 return VK_NOT_READY;
272 }
273 }
274 }
275 }
276
277 VkResult genX(GetQueryPoolResults)(
278 VkDevice _device,
279 VkQueryPool queryPool,
280 uint32_t firstQuery,
281 uint32_t queryCount,
282 size_t dataSize,
283 void* pData,
284 VkDeviceSize stride,
285 VkQueryResultFlags flags)
286 {
287 ANV_FROM_HANDLE(anv_device, device, _device);
288 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
289
290 assert(pool->type == VK_QUERY_TYPE_OCCLUSION ||
291 pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
292 pool->type == VK_QUERY_TYPE_TIMESTAMP ||
293 pool->type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT ||
294 pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL);
295
296 if (anv_device_is_lost(device))
297 return VK_ERROR_DEVICE_LOST;
298
299 if (pData == NULL)
300 return VK_SUCCESS;
301
302 void *data_end = pData + dataSize;
303
304 VkResult status = VK_SUCCESS;
305 for (uint32_t i = 0; i < queryCount; i++) {
306 bool available = query_is_available(pool, firstQuery + i);
307
308 if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) {
309 status = wait_for_available(device, pool, firstQuery + i);
310 if (status != VK_SUCCESS)
311 return status;
312
313 available = true;
314 }
315
316 /* From the Vulkan 1.0.42 spec:
317 *
318 * "If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
319 * both not set then no result values are written to pData for
320 * queries that are in the unavailable state at the time of the call,
321 * and vkGetQueryPoolResults returns VK_NOT_READY. However,
322 * availability state is still written to pData for those queries if
323 * VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set."
324 */
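/* Summarizing how the code below applies those rules (derived from this
 * function, not normative):
 *
 *    available                      -> full results written
 *    unavailable, PARTIAL_BIT set   -> partial results written
 *    unavailable, PARTIAL_BIT clear -> nothing written, VK_NOT_READY returned
 *
 * WAIT_BIT (handled above) turns an unavailable query into an available one
 * by waiting; WITH_AVAILABILITY_BIT appends the availability value in all
 * cases.
 */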
325 bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT);
326
327 uint32_t idx = 0;
328 switch (pool->type) {
329 case VK_QUERY_TYPE_OCCLUSION: {
330 uint64_t *slot = query_slot(pool, firstQuery + i);
331 if (write_results)
332 cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]);
333 idx++;
334 break;
335 }
336
337 case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
338 uint64_t *slot = query_slot(pool, firstQuery + i);
339 uint32_t statistics = pool->pipeline_statistics;
340 while (statistics) {
341 uint32_t stat = u_bit_scan(&statistics);
342 if (write_results) {
343 uint64_t result = slot[idx * 2 + 2] - slot[idx * 2 + 1];
344
345 /* WaDividePSInvocationCountBy4:HSW,BDW */
346 if ((device->info.gen == 8 || device->info.is_haswell) &&
347 (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT)
348 result >>= 2;
349
350 cpu_write_query_result(pData, flags, idx, result);
351 }
352 idx++;
353 }
354 assert(idx == util_bitcount(pool->pipeline_statistics));
355 break;
356 }
357
358 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
359 uint64_t *slot = query_slot(pool, firstQuery + i);
360 if (write_results)
361 cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]);
362 idx++;
363 if (write_results)
364 cpu_write_query_result(pData, flags, idx, slot[4] - slot[3]);
365 idx++;
366 break;
367 }
368
369 case VK_QUERY_TYPE_TIMESTAMP: {
370 uint64_t *slot = query_slot(pool, firstQuery + i);
371 if (write_results)
372 cpu_write_query_result(pData, flags, idx, slot[1]);
373 idx++;
374 break;
375 }
376
377 case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
378 if (!write_results)
379 break;
380 const void *query_data = query_slot(pool, firstQuery + i);
381 const uint32_t *oa_begin = query_data + intel_perf_mi_rpc_offset(false);
382 const uint32_t *oa_end = query_data + intel_perf_mi_rpc_offset(true);
383 const uint32_t *rpstat_begin = query_data + intel_perf_rpstart_offset(false);
384 const uint32_t *rpstat_end = query_data + intel_perf_rpstart_offset(true);
385 struct gen_perf_query_result result;
386 struct gen_perf_query_info metric = {
387 .oa_format = (GEN_GEN >= 8 ?
388 I915_OA_FORMAT_A32u40_A4u32_B8_C8 :
389 I915_OA_FORMAT_A45_B8_C8),
390 };
391 uint32_t core_freq[2];
392 #if GEN_GEN < 9
393 core_freq[0] = ((*rpstat_begin >> 7) & 0x7f) * 1000000ULL;
394 core_freq[1] = ((*rpstat_end >> 7) & 0x7f) * 1000000ULL;
395 #else
396 core_freq[0] = ((*rpstat_begin >> 23) & 0x1ff) * 1000000ULL;
397 core_freq[1] = ((*rpstat_end >> 23) & 0x1ff) * 1000000ULL;
398 #endif
399 gen_perf_query_result_clear(&result);
400 gen_perf_query_result_accumulate(&result, &metric,
401 oa_begin, oa_end);
402 gen_perf_query_result_read_frequencies(&result, &device->info,
403 oa_begin, oa_end);
404 gen_perf_query_result_write_mdapi(pData, stride,
405 &device->info,
406 &result,
407 core_freq[0], core_freq[1]);
408 gen_perf_query_mdapi_write_perfcntr(pData, stride, &device->info,
409 query_data + intel_perf_counter(false),
410 query_data + intel_perf_counter(true));
411 const uint64_t *marker = query_data + intel_perf_marker_offset();
412 gen_perf_query_mdapi_write_marker(pData, stride, &device->info, *marker);
413 break;
414 }
415
416 default:
417 unreachable("invalid pool type");
418 }
419
420 if (!write_results)
421 status = VK_NOT_READY;
422
423 if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
424 cpu_write_query_result(pData, flags, idx, available);
425
426 pData += stride;
427 if (pData >= data_end)
428 break;
429 }
430
431 return status;
432 }
433
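/* For reference, a hypothetical application-side read of an occlusion pool
 * through this entry point (illustrative sketch, not driver code):
 *
 *    uint64_t results[4][2];
 *    vkGetQueryPoolResults(device, pool, 0, 4,
 *                          sizeof(results), results, sizeof(results[0]),
 *                          VK_QUERY_RESULT_64_BIT |
 *                          VK_QUERY_RESULT_WITH_AVAILABILITY_BIT);
 *
 * Each results[i] then holds the occlusion count in [0] and the availability
 * value in [1], matching the idx/availability writes above.
 */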
434 static void
435 emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer,
436 struct anv_address addr)
437 {
438 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
439 pc.DestinationAddressType = DAT_PPGTT;
440 pc.PostSyncOperation = WritePSDepthCount;
441 pc.DepthStallEnable = true;
442 pc.Address = addr;
443
444 if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
445 pc.CommandStreamerStallEnable = true;
446 }
447 }
448
449 static void
450 emit_query_mi_availability(struct gen_mi_builder *b,
451 struct anv_address addr,
452 bool available)
453 {
454 gen_mi_store(b, gen_mi_mem64(addr), gen_mi_imm(available));
455 }
456
457 static void
458 emit_query_pc_availability(struct anv_cmd_buffer *cmd_buffer,
459 struct anv_address addr,
460 bool available)
461 {
462 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
463 pc.DestinationAddressType = DAT_PPGTT;
464 pc.PostSyncOperation = WriteImmediateData;
465 pc.Address = addr;
466 pc.ImmediateData = available;
467 }
468 }
469
470 /**
471 * Goes through a series of consecutive query indices in the given pool
472 * setting all element values to 0 and emitting them as available.
473 */
474 static void
475 emit_zero_queries(struct anv_cmd_buffer *cmd_buffer,
476 struct gen_mi_builder *b, struct anv_query_pool *pool,
477 uint32_t first_index, uint32_t num_queries)
478 {
479 switch (pool->type) {
480 case VK_QUERY_TYPE_OCCLUSION:
481 case VK_QUERY_TYPE_TIMESTAMP:
482 /* These queries are written with a PIPE_CONTROL so clear them using the
483 * PIPE_CONTROL as well so we don't have to synchronize between 2 types
484 * of operations.
485 */
486 assert((pool->stride % 8) == 0);
487 for (uint32_t i = 0; i < num_queries; i++) {
488 struct anv_address slot_addr =
489 anv_query_address(pool, first_index + i);
490
491 for (uint32_t qword = 1; qword < (pool->stride / 8); qword++) {
492 emit_query_pc_availability(cmd_buffer,
493 anv_address_add(slot_addr, qword * 8),
494 false);
495 }
496 emit_query_pc_availability(cmd_buffer, slot_addr, true);
497 }
498 break;
499
500 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
501 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
502 for (uint32_t i = 0; i < num_queries; i++) {
503 struct anv_address slot_addr =
504 anv_query_address(pool, first_index + i);
505 gen_mi_memset(b, anv_address_add(slot_addr, 8), 0, pool->stride - 8);
506 emit_query_mi_availability(b, slot_addr, true);
507 }
508 break;
509
510 case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL:
511 for (uint32_t i = 0; i < num_queries; i++) {
512 struct anv_address slot_addr =
513 anv_query_address(pool, first_index + i);
514 gen_mi_memset(b, anv_address_add(slot_addr, 8), 0,
515 pool->stride - 8);
516 emit_query_mi_availability(b, slot_addr, true);
517 }
518 break;
519
520 default:
521 unreachable("Unsupported query type");
522 }
523 }
524
525 void genX(CmdResetQueryPool)(
526 VkCommandBuffer commandBuffer,
527 VkQueryPool queryPool,
528 uint32_t firstQuery,
529 uint32_t queryCount)
530 {
531 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
532 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
533
534 switch (pool->type) {
535 case VK_QUERY_TYPE_OCCLUSION:
536 case VK_QUERY_TYPE_TIMESTAMP:
537 for (uint32_t i = 0; i < queryCount; i++) {
538 emit_query_pc_availability(cmd_buffer,
539 anv_query_address(pool, firstQuery + i),
540 false);
541 }
542 break;
543
544 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
545 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
546 struct gen_mi_builder b;
547 gen_mi_builder_init(&b, &cmd_buffer->batch);
548
549 for (uint32_t i = 0; i < queryCount; i++)
550 emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), false);
551 break;
552 }
553
554 case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
555 struct gen_mi_builder b;
556 gen_mi_builder_init(&b, &cmd_buffer->batch);
557
558 for (uint32_t i = 0; i < queryCount; i++) {
559 emit_query_mi_availability(
560 &b,
561 anv_query_address(pool, firstQuery + i),
564 false);
565 }
566 break;
567 }
568
569 default:
570 unreachable("Unsupported query type");
571 }
572 }
573
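/* Host-side reset entry point (VK_EXT_host_query_reset). Since every query
 * type in this file, including VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL, keeps
 * its availability qword at offset 0 of the slot, clearing that single qword
 * from the CPU is enough to mark a query unavailable again. An application
 * would call it as, for example (illustrative only):
 *
 *    vkResetQueryPoolEXT(device, pool, 0, pool_query_count);
 */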
574 void genX(ResetQueryPoolEXT)(
575 VkDevice _device,
576 VkQueryPool queryPool,
577 uint32_t firstQuery,
578 uint32_t queryCount)
579 {
580 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
581
582 for (uint32_t i = 0; i < queryCount; i++) {
583 uint64_t *slot = query_slot(pool, firstQuery + i);
584 *slot = 0;
585 }
586 }
587
588 static const uint32_t vk_pipeline_stat_to_reg[] = {
589 GENX(IA_VERTICES_COUNT_num),
590 GENX(IA_PRIMITIVES_COUNT_num),
591 GENX(VS_INVOCATION_COUNT_num),
592 GENX(GS_INVOCATION_COUNT_num),
593 GENX(GS_PRIMITIVES_COUNT_num),
594 GENX(CL_INVOCATION_COUNT_num),
595 GENX(CL_PRIMITIVES_COUNT_num),
596 GENX(PS_INVOCATION_COUNT_num),
597 GENX(HS_INVOCATION_COUNT_num),
598 GENX(DS_INVOCATION_COUNT_num),
599 GENX(CS_INVOCATION_COUNT_num),
600 };
601
602 static void
603 emit_pipeline_stat(struct gen_mi_builder *b, uint32_t stat,
604 struct anv_address addr)
605 {
606 STATIC_ASSERT(ANV_PIPELINE_STATISTICS_MASK ==
607 (1 << ARRAY_SIZE(vk_pipeline_stat_to_reg)) - 1);
608
609 assert(stat < ARRAY_SIZE(vk_pipeline_stat_to_reg));
610 gen_mi_store(b, gen_mi_mem64(addr),
611 gen_mi_reg64(vk_pipeline_stat_to_reg[stat]));
612 }
613
614 static void
615 emit_xfb_query(struct gen_mi_builder *b, uint32_t stream,
616 struct anv_address addr)
617 {
618 assert(stream < MAX_XFB_STREAMS);
619
620 gen_mi_store(b, gen_mi_mem64(anv_address_add(addr, 0)),
621 gen_mi_reg64(GENX(SO_NUM_PRIMS_WRITTEN0_num) + stream * 8));
622 gen_mi_store(b, gen_mi_mem64(anv_address_add(addr, 16)),
623 gen_mi_reg64(GENX(SO_PRIM_STORAGE_NEEDED0_num) + stream * 8));
624 }
625
626 void genX(CmdBeginQuery)(
627 VkCommandBuffer commandBuffer,
628 VkQueryPool queryPool,
629 uint32_t query,
630 VkQueryControlFlags flags)
631 {
632 genX(CmdBeginQueryIndexedEXT)(commandBuffer, queryPool, query, flags, 0);
633 }
634
635 void genX(CmdBeginQueryIndexedEXT)(
636 VkCommandBuffer commandBuffer,
637 VkQueryPool queryPool,
638 uint32_t query,
639 VkQueryControlFlags flags,
640 uint32_t index)
641 {
642 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
643 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
644 struct anv_address query_addr = anv_query_address(pool, query);
645
646 struct gen_mi_builder b;
647 gen_mi_builder_init(&b, &cmd_buffer->batch);
648
649 switch (pool->type) {
650 case VK_QUERY_TYPE_OCCLUSION:
651 emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 8));
652 break;
653
654 case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
655 /* TODO: This might only be necessary for certain stats */
656 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
657 pc.CommandStreamerStallEnable = true;
658 pc.StallAtPixelScoreboard = true;
659 }
660
661 uint32_t statistics = pool->pipeline_statistics;
662 uint32_t offset = 8;
663 while (statistics) {
664 uint32_t stat = u_bit_scan(&statistics);
665 emit_pipeline_stat(&b, stat, anv_address_add(query_addr, offset));
666 offset += 16;
667 }
668 break;
669 }
670
671 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
672 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
673 pc.CommandStreamerStallEnable = true;
674 pc.StallAtPixelScoreboard = true;
675 }
676 emit_xfb_query(&b, index, anv_address_add(query_addr, 8));
677 break;
678
679 case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
680 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
681 pc.CommandStreamerStallEnable = true;
682 pc.StallAtPixelScoreboard = true;
683 }
684 anv_batch_emit(&cmd_buffer->batch, GENX(MI_REPORT_PERF_COUNT), rpc) {
685 rpc.MemoryAddress =
686 anv_address_add(query_addr, intel_perf_mi_rpc_offset(false));
687 }
688 #if GEN_GEN < 9
689 gen_mi_store(&b,
690 gen_mi_mem32(anv_address_add(query_addr,
691 intel_perf_rpstart_offset(false))),
692 gen_mi_reg32(GENX(RPSTAT1_num)));
693 #else
694 gen_mi_store(&b,
695 gen_mi_mem32(anv_address_add(query_addr,
696 intel_perf_rpstart_offset(false))),
697 gen_mi_reg32(GENX(RPSTAT0_num)));
698 #endif
699 #if GEN_GEN >= 8 && GEN_GEN <= 11
700 gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr,
701 intel_perf_counter(false))),
702 gen_mi_reg64(GENX(PERFCNT1_num)));
703 gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr,
704 intel_perf_counter(false) + 8)),
705 gen_mi_reg64(GENX(PERFCNT2_num)));
706 #endif
707 break;
708 }
709
710 default:
711 unreachable("");
712 }
713 }
714
715 void genX(CmdEndQuery)(
716 VkCommandBuffer commandBuffer,
717 VkQueryPool queryPool,
718 uint32_t query)
719 {
720 genX(CmdEndQueryIndexedEXT)(commandBuffer, queryPool, query, 0);
721 }
722
723 void genX(CmdEndQueryIndexedEXT)(
724 VkCommandBuffer commandBuffer,
725 VkQueryPool queryPool,
726 uint32_t query,
727 uint32_t index)
728 {
729 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
730 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
731 struct anv_address query_addr = anv_query_address(pool, query);
732
733 struct gen_mi_builder b;
734 gen_mi_builder_init(&b, &cmd_buffer->batch);
735
736 switch (pool->type) {
737 case VK_QUERY_TYPE_OCCLUSION:
738 emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 16));
739 emit_query_pc_availability(cmd_buffer, query_addr, true);
740 break;
741
742 case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
743 /* TODO: This might only be necessary for certain stats */
744 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
745 pc.CommandStreamerStallEnable = true;
746 pc.StallAtPixelScoreboard = true;
747 }
748
749 uint32_t statistics = pool->pipeline_statistics;
750 uint32_t offset = 16;
751 while (statistics) {
752 uint32_t stat = u_bit_scan(&statistics);
753 emit_pipeline_stat(&b, stat, anv_address_add(query_addr, offset));
754 offset += 16;
755 }
756
757 emit_query_mi_availability(&b, query_addr, true);
758 break;
759 }
760
761 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
762 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
763 pc.CommandStreamerStallEnable = true;
764 pc.StallAtPixelScoreboard = true;
765 }
766
767 emit_xfb_query(&b, index, anv_address_add(query_addr, 16));
768 emit_query_mi_availability(&b, query_addr, true);
769 break;
770
771 case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
772 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
773 pc.CommandStreamerStallEnable = true;
774 pc.StallAtPixelScoreboard = true;
775 }
776 uint32_t marker_offset = intel_perf_marker_offset();
777 gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, marker_offset)),
778 gen_mi_imm(cmd_buffer->intel_perf_marker));
779 #if GEN_GEN >= 8 && GEN_GEN <= 11
780 gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, intel_perf_counter(true))),
781 gen_mi_reg64(GENX(PERFCNT1_num)));
782 gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, intel_perf_counter(true) + 8)),
783 gen_mi_reg64(GENX(PERFCNT2_num)));
784 #endif
785 #if GEN_GEN < 9
786 gen_mi_store(&b,
787 gen_mi_mem32(anv_address_add(query_addr,
788 intel_perf_rpstart_offset(true))),
789 gen_mi_reg32(GENX(RPSTAT1_num)));
790 #else
791 gen_mi_store(&b,
792 gen_mi_mem32(anv_address_add(query_addr,
793 intel_perf_rpstart_offset(true))),
794 gen_mi_reg32(GENX(RPSTAT0_num)));
795 #endif
796 /* Write the last OA snapshot into the end MI_RPC slot. The availability
797 * write below follows it, so the snapshot is complete once the query reads as available.
798 */
799 anv_batch_emit(&cmd_buffer->batch, GENX(MI_REPORT_PERF_COUNT), rpc) {
800 rpc.MemoryAddress = anv_address_add(query_addr,
801 intel_perf_mi_rpc_offset(true));
802 rpc.ReportID = 0xdeadbeef; /* This goes in the first dword */
803 }
804 emit_query_mi_availability(&b, query_addr, true);
807 break;
808 }
809
810 default:
811 unreachable("");
812 }
813
814 /* When multiview is active the spec requires that N consecutive query
815 * indices are used, where N is the number of active views in the subpass.
816 * The spec allows us to write the results to only one of the queries,
817 * but we still need to manage result availability for all the query indices.
818 * Since we only emit a single query for all active views in the
819 * first index, mark the other query indices as being already available
820 * with result 0.
821 */
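/* Illustrative example: with view_mask = 0x6 (two active views), the query at
 * index "query + 1" is zeroed and marked available by the code below.
 */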
822 if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
823 const uint32_t num_queries =
824 util_bitcount(cmd_buffer->state.subpass->view_mask);
825 if (num_queries > 1)
826 emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1);
827 }
828 }
829
830 #define TIMESTAMP 0x2358
831
832 void genX(CmdWriteTimestamp)(
833 VkCommandBuffer commandBuffer,
834 VkPipelineStageFlagBits pipelineStage,
835 VkQueryPool queryPool,
836 uint32_t query)
837 {
838 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
839 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
840 struct anv_address query_addr = anv_query_address(pool, query);
841
842 assert(pool->type == VK_QUERY_TYPE_TIMESTAMP);
843
844 struct gen_mi_builder b;
845 gen_mi_builder_init(&b, &cmd_buffer->batch);
846
847 switch (pipelineStage) {
848 case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
849 gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, 8)),
850 gen_mi_reg64(TIMESTAMP));
851 break;
852
853 default:
854 /* Everything else is bottom-of-pipe */
855 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
856 pc.DestinationAddressType = DAT_PPGTT;
857 pc.PostSyncOperation = WriteTimestamp;
858 pc.Address = anv_address_add(query_addr, 8);
859
860 if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
861 pc.CommandStreamerStallEnable = true;
862 }
863 break;
864 }
865
866 emit_query_pc_availability(cmd_buffer, query_addr, true);
867
868 /* When multiview is active the spec requires that N consecutive query
869 * indices are used, where N is the number of active views in the subpass.
870 * The spec allows us to write the results to only one of the queries,
871 * but we still need to manage result availability for all the query indices.
872 * Since we only emit a single query for all active views in the
873 * first index, mark the other query indices as being already available
874 * with result 0.
875 */
876 if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
877 const uint32_t num_queries =
878 util_bitcount(cmd_buffer->state.subpass->view_mask);
879 if (num_queries > 1)
880 emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1);
881 }
882 }
883
884 #if GEN_GEN > 7 || GEN_IS_HASWELL
885
886 static void
887 gpu_write_query_result(struct gen_mi_builder *b,
888 struct anv_address dst_addr,
889 VkQueryResultFlags flags,
890 uint32_t value_index,
891 struct gen_mi_value query_result)
892 {
893 if (flags & VK_QUERY_RESULT_64_BIT) {
894 struct anv_address res_addr = anv_address_add(dst_addr, value_index * 8);
895 gen_mi_store(b, gen_mi_mem64(res_addr), query_result);
896 } else {
897 struct anv_address res_addr = anv_address_add(dst_addr, value_index * 4);
898 gen_mi_store(b, gen_mi_mem32(res_addr), query_result);
899 }
900 }
901
902 static struct gen_mi_value
903 compute_query_result(struct gen_mi_builder *b, struct anv_address addr)
904 {
905 return gen_mi_isub(b, gen_mi_mem64(anv_address_add(addr, 8)),
906 gen_mi_mem64(anv_address_add(addr, 0)));
907 }
908
909 void genX(CmdCopyQueryPoolResults)(
910 VkCommandBuffer commandBuffer,
911 VkQueryPool queryPool,
912 uint32_t firstQuery,
913 uint32_t queryCount,
914 VkBuffer destBuffer,
915 VkDeviceSize destOffset,
916 VkDeviceSize destStride,
917 VkQueryResultFlags flags)
918 {
919 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
920 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
921 ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);
922
923 struct gen_mi_builder b;
924 gen_mi_builder_init(&b, &cmd_buffer->batch);
925 struct gen_mi_value result;
926
927 /* If render target writes are ongoing, request a render target cache flush
928 * to ensure proper ordering of the commands from the 3d pipe and the
929 * command streamer.
930 */
931 if (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_RENDER_TARGET_BUFFER_WRITES) {
932 cmd_buffer->state.pending_pipe_bits |=
933 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
934 }
935
936 if ((flags & VK_QUERY_RESULT_WAIT_BIT) ||
937 (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_FLUSH_BITS) ||
938 /* Occlusion & timestamp queries are written using a PIPE_CONTROL and
939 * because we're about to copy values from MI commands, we need to
940 * stall the command streamer to make sure the PIPE_CONTROL values have
941 * landed, otherwise we could see inconsistent values & availability.
942 *
943 * From the Vulkan spec:
944 *
945 * "vkCmdCopyQueryPoolResults is guaranteed to see the effect of
946 * previous uses of vkCmdResetQueryPool in the same queue, without
947 * any additional synchronization."
948 */
949 pool->type == VK_QUERY_TYPE_OCCLUSION ||
950 pool->type == VK_QUERY_TYPE_TIMESTAMP) {
951 cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT;
952 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
953 }
954
955 struct anv_address dest_addr = anv_address_add(buffer->address, destOffset);
956 for (uint32_t i = 0; i < queryCount; i++) {
957 struct anv_address query_addr = anv_query_address(pool, firstQuery + i);
958 uint32_t idx = 0;
959 switch (pool->type) {
960 case VK_QUERY_TYPE_OCCLUSION:
961 result = compute_query_result(&b, anv_address_add(query_addr, 8));
962 gpu_write_query_result(&b, dest_addr, flags, idx++, result);
963 break;
964
965 case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
966 uint32_t statistics = pool->pipeline_statistics;
967 while (statistics) {
968 uint32_t stat = u_bit_scan(&statistics);
969
970 result = compute_query_result(&b, anv_address_add(query_addr,
971 idx * 16 + 8));
972
973 /* WaDividePSInvocationCountBy4:HSW,BDW */
974 if ((cmd_buffer->device->info.gen == 8 ||
975 cmd_buffer->device->info.is_haswell) &&
976 (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) {
977 result = gen_mi_ushr32_imm(&b, result, 2);
978 }
979
980 gpu_write_query_result(&b, dest_addr, flags, idx++, result);
981 }
982 assert(idx == util_bitcount(pool->pipeline_statistics));
983 break;
984 }
985
986 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
987 result = compute_query_result(&b, anv_address_add(query_addr, 8));
988 gpu_write_query_result(&b, dest_addr, flags, idx++, result);
989 result = compute_query_result(&b, anv_address_add(query_addr, 24));
990 gpu_write_query_result(&b, dest_addr, flags, idx++, result);
991 break;
992
993 case VK_QUERY_TYPE_TIMESTAMP:
994 result = gen_mi_mem64(anv_address_add(query_addr, 8));
995 gpu_write_query_result(&b, dest_addr, flags, 0, result);
996 break;
997
998 default:
999 unreachable("unhandled query type");
1000 }
1001
1002 if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
1003 gpu_write_query_result(&b, dest_addr, flags, idx,
1004 gen_mi_mem64(query_addr));
1005 }
1006
1007 dest_addr = anv_address_add(dest_addr, destStride);
1008 }
1009 }
1010
1011 #else
1012 void genX(CmdCopyQueryPoolResults)(
1013 VkCommandBuffer commandBuffer,
1014 VkQueryPool queryPool,
1015 uint32_t firstQuery,
1016 uint32_t queryCount,
1017 VkBuffer destBuffer,
1018 VkDeviceSize destOffset,
1019 VkDeviceSize destStride,
1020 VkQueryResultFlags flags)
1021 {
1022 anv_finishme("Queries not yet supported on Ivy Bridge");
1023 }
1024 #endif