anv: Use the query_slot helper in vkResetQueryPoolEXT
[mesa.git] / src / intel / vulkan / genX_query.c
1 /*
2 * Copyright © 2015 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include <assert.h>
25 #include <stdbool.h>
26 #include <string.h>
27 #include <unistd.h>
28 #include <fcntl.h>
29
30 #include "anv_private.h"
31
32 #include "genxml/gen_macros.h"
33 #include "genxml/genX_pack.h"
34
35 /* We reserve GPR 14 and 15 for conditional rendering */
36 #define GEN_MI_BUILDER_NUM_ALLOC_GPRS 14
37 #define __gen_get_batch_dwords anv_batch_emit_dwords
38 #define __gen_address_offset anv_address_add
39 #include "common/gen_mi_builder.h"
40 #include "perf/gen_perf.h"
41 #include "perf/gen_perf_mdapi.h"
42
43 #define OA_REPORT_N_UINT64 (256 / sizeof(uint64_t))
44
45 VkResult genX(CreateQueryPool)(
46 VkDevice _device,
47 const VkQueryPoolCreateInfo* pCreateInfo,
48 const VkAllocationCallbacks* pAllocator,
49 VkQueryPool* pQueryPool)
50 {
51 ANV_FROM_HANDLE(anv_device, device, _device);
52 const struct anv_physical_device *pdevice = &device->instance->physicalDevice;
53 struct anv_query_pool *pool;
54 VkResult result;
55
56 assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
57
   58    /* Query pool slots are made up of some number of 64-bit values packed
   59     * tightly together. For most query types, the first 64-bit value is
   60     * the "available" bit, which is 0 when the query is unavailable and 1
   61     * when it is available. The 64-bit values that follow are determined by
   62     * the type of query. (A worked size example follows the switch below.)
   63     *
   64     * For performance queries, the OA reports must be aligned to 64 bytes,
   65     * so we put those first and move the "available" bit to the end of the
   66     * slot, together with some other counters.
   67     */
68 uint32_t uint64s_per_slot = 1;
69
70 VkQueryPipelineStatisticFlags pipeline_statistics = 0;
71 switch (pCreateInfo->queryType) {
72 case VK_QUERY_TYPE_OCCLUSION:
73 /* Occlusion queries have two values: begin and end. */
74 uint64s_per_slot += 2;
75 break;
76 case VK_QUERY_TYPE_TIMESTAMP:
77 /* Timestamps just have the one timestamp value */
78 uint64s_per_slot += 1;
79 break;
80 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
81 pipeline_statistics = pCreateInfo->pipelineStatistics;
82 /* We're going to trust this field implicitly so we need to ensure that
83 * no unhandled extension bits leak in.
84 */
85 pipeline_statistics &= ANV_PIPELINE_STATISTICS_MASK;
86
87 /* Statistics queries have a min and max for every statistic */
88 uint64s_per_slot += 2 * util_bitcount(pipeline_statistics);
89 break;
90 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
   91       /* Transform feedback queries are 4 values: begin/end for primitives
   92        * written and begin/end for primitives storage needed.
   93        */
94 uint64s_per_slot += 4;
95 break;
96 case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
97 uint64s_per_slot = 2 * OA_REPORT_N_UINT64; /* begin & end OA reports */
   98       uint64s_per_slot += 4; /* begin/end of PERFCNT1 & PERFCNT2 */
   99       uint64s_per_slot++; /* begin/end 32bit RPSTAT register reads */
100 uint64s_per_slot++; /* 64bit marker */
101 uint64s_per_slot++; /* availability */
102 uint64s_per_slot = align_u32(uint64s_per_slot, 8); /* OA reports must be aligned to 64 bytes */
103 break;
104 }
105 default:
106 assert(!"Invalid query type");
107 }
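   /* For example (sizes follow from the switch above): an occlusion query
    * slot is 1 + 2 = 3 uint64s (24 bytes); a timestamp slot is 2 uint64s
    * (16 bytes); a pipeline statistics slot with three statistics enabled is
    * 1 + 2 * 3 = 7 uint64s (56 bytes); and an Intel performance query slot is
    * 2 * 32 + 4 + 1 + 1 + 1 = 71 uint64s, aligned up to 72 (576 bytes).
    */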
108
109 pool = vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
110 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
111 if (pool == NULL)
112 return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
113
114 pool->type = pCreateInfo->queryType;
115 pool->pipeline_statistics = pipeline_statistics;
116 pool->stride = uint64s_per_slot * sizeof(uint64_t);
117 pool->slots = pCreateInfo->queryCount;
118
119 uint64_t size = pool->slots * pool->stride;
120 result = anv_bo_init_new(&pool->bo, device, size);
121 if (result != VK_SUCCESS)
122 goto fail;
123
124 if (pdevice->supports_48bit_addresses)
125 pool->bo.flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
126
127 if (pdevice->use_softpin)
128 pool->bo.flags |= EXEC_OBJECT_PINNED;
129
130 if (pdevice->has_exec_async)
131 pool->bo.flags |= EXEC_OBJECT_ASYNC;
132
133 anv_vma_alloc(device, &pool->bo);
134
135 /* For query pools, we set the caching mode to I915_CACHING_CACHED. On LLC
136 * platforms, this does nothing. On non-LLC platforms, this means snooping
137 * which comes at a slight cost. However, the buffers aren't big, won't be
138 * written frequently, and trying to handle the flushing manually without
139 * doing too much flushing is extremely painful.
140 */
141 anv_gem_set_caching(device, pool->bo.gem_handle, I915_CACHING_CACHED);
142
143 pool->bo.map = anv_gem_mmap(device, pool->bo.gem_handle, 0, size, 0);
144
145 *pQueryPool = anv_query_pool_to_handle(pool);
146
147 return VK_SUCCESS;
148
149 fail:
150 vk_free2(&device->alloc, pAllocator, pool);
151
152 return result;
153 }
154
155 void genX(DestroyQueryPool)(
156 VkDevice _device,
157 VkQueryPool _pool,
158 const VkAllocationCallbacks* pAllocator)
159 {
160 ANV_FROM_HANDLE(anv_device, device, _device);
161 ANV_FROM_HANDLE(anv_query_pool, pool, _pool);
162
163 if (!pool)
164 return;
165
166 anv_gem_munmap(pool->bo.map, pool->bo.size);
167 anv_vma_free(device, &pool->bo);
168 anv_gem_close(device, pool->bo.gem_handle);
169 vk_free2(&device->alloc, pAllocator, pool);
170 }
171
172 static struct anv_address
173 anv_query_address(struct anv_query_pool *pool, uint32_t query)
174 {
175 return (struct anv_address) {
176 .bo = &pool->bo,
177 .offset = query * pool->stride,
178 };
179 }
180
181 /**
182 * VK_INTEL_performance_query layout:
183 *
184 * ------------------------------
185 * | end MI_RPC (256b) |
186 * |----------------------------|
187 * | begin MI_RPC (256b) |
188 * |----------------------------|
189 * | begin perfcntr 1 & 2 (16b) |
190 * |----------------------------|
191 * | end perfcntr 1 & 2 (16b) |
192 * |----------------------------|
193 * | begin RPSTAT register (4b) |
194 * |----------------------------|
195 * | end RPSTAT register (4b) |
196 * |----------------------------|
197 * | marker (8b) |
198 * |----------------------------|
199 * | availability (8b) |
200 * ------------------------------
201 */
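/*
 * For reference, the helpers below resolve this layout to the following byte
 * offsets within a slot (values derived from the code in this file; the 568
 * figure assumes the 576-byte stride computed in genX(CreateQueryPool)):
 *
 *   end MI_RPC               0
 *   begin MI_RPC           256
 *   begin PERFCNT1 & 2     512
 *   end PERFCNT1 & 2       528
 *   begin RPSTAT           544
 *   end RPSTAT             548
 *   marker                 552
 *   availability           pool->stride - 8  (568)
 */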
202
203 static uint32_t
204 intel_perf_mi_rpc_offset(bool end)
205 {
206 return end ? 0 : 256;
207 }
208
209 static uint32_t
210 intel_perf_counter(bool end)
211 {
212 uint32_t offset = 512;
213 offset += end ? 2 * sizeof(uint64_t) : 0;
214 return offset;
215 }
216
217 static uint32_t
218 intel_perf_rpstart_offset(bool end)
219 {
220 uint32_t offset = intel_perf_counter(false) +
221 4 * sizeof(uint64_t);
222 offset += end ? sizeof(uint32_t) : 0;
223 return offset;
224 }
225
226 static uint32_t
227 intel_perf_marker_offset(void)
228 {
229 return intel_perf_rpstart_offset(false) + sizeof(uint64_t);
230 }
231
232 static void
233 cpu_write_query_result(void *dst_slot, VkQueryResultFlags flags,
234 uint32_t value_index, uint64_t result)
235 {
236 if (flags & VK_QUERY_RESULT_64_BIT) {
237 uint64_t *dst64 = dst_slot;
238 dst64[value_index] = result;
239 } else {
240 uint32_t *dst32 = dst_slot;
241 dst32[value_index] = result;
242 }
243 }
244
245 static void *
246 query_slot(struct anv_query_pool *pool, uint32_t query)
247 {
248 return pool->bo.map + query * pool->stride;
249 }
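/* query_slot() returns the CPU-visible (mapped) location of a slot, while
 * anv_query_address() above returns the same location as a GPU address for
 * use in batch commands; both use the same query * stride arithmetic.
 */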
250
251 static bool
252 query_is_available(struct anv_query_pool *pool, uint32_t query)
253 {
254 if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL) {
255 return *(volatile uint64_t *)((uint8_t *)query_slot(pool, query) +
256 pool->stride - 8);
257 } else
258 return *(volatile uint64_t *)query_slot(pool, query);
259 }
260
261 static VkResult
262 wait_for_available(struct anv_device *device,
263 struct anv_query_pool *pool, uint32_t query)
264 {
265 while (true) {
266 if (query_is_available(pool, query))
267 return VK_SUCCESS;
268
269 int ret = anv_gem_busy(device, pool->bo.gem_handle);
270 if (ret == 1) {
271 /* The BO is still busy, keep waiting. */
272 continue;
273 } else if (ret == -1) {
274 /* We don't know the real error. */
275 return anv_device_set_lost(device, "gem wait failed: %m");
276 } else {
277 assert(ret == 0);
278 /* The BO is no longer busy. */
279 if (query_is_available(pool, query)) {
280 return VK_SUCCESS;
281 } else {
282 VkResult status = anv_device_query_status(device);
283 if (status != VK_SUCCESS)
284 return status;
285
286 /* If we haven't seen availability yet, then we never will. This
287 * can only happen if we have a client error where they call
288 * GetQueryPoolResults on a query that they haven't submitted to
289 * the GPU yet. The spec allows us to do anything in this case,
290 * but returning VK_SUCCESS doesn't seem right and we shouldn't
291 * just keep spinning.
292 */
293 return VK_NOT_READY;
294 }
295 }
296 }
297 }
298
299 VkResult genX(GetQueryPoolResults)(
300 VkDevice _device,
301 VkQueryPool queryPool,
302 uint32_t firstQuery,
303 uint32_t queryCount,
304 size_t dataSize,
305 void* pData,
306 VkDeviceSize stride,
307 VkQueryResultFlags flags)
308 {
309 ANV_FROM_HANDLE(anv_device, device, _device);
310 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
311
312 assert(pool->type == VK_QUERY_TYPE_OCCLUSION ||
313 pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
314 pool->type == VK_QUERY_TYPE_TIMESTAMP ||
315 pool->type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT ||
316 pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL);
317
318 if (anv_device_is_lost(device))
319 return VK_ERROR_DEVICE_LOST;
320
321 if (pData == NULL)
322 return VK_SUCCESS;
323
324 void *data_end = pData + dataSize;
325
326 VkResult status = VK_SUCCESS;
327 for (uint32_t i = 0; i < queryCount; i++) {
328 bool available = query_is_available(pool, firstQuery + i);
329
330 if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) {
331 status = wait_for_available(device, pool, firstQuery + i);
332 if (status != VK_SUCCESS)
333 return status;
334
335 available = true;
336 }
337
338 /* From the Vulkan 1.0.42 spec:
339 *
340 * "If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
341 * both not set then no result values are written to pData for
342 * queries that are in the unavailable state at the time of the call,
343 * and vkGetQueryPoolResults returns VK_NOT_READY. However,
344 * availability state is still written to pData for those queries if
345 * VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set."
346 */
347 bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT);
348
349 uint32_t idx = 0;
350 switch (pool->type) {
351 case VK_QUERY_TYPE_OCCLUSION: {
352 uint64_t *slot = query_slot(pool, firstQuery + i);
353 if (write_results)
354 cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]);
355 idx++;
356 break;
357 }
358
359 case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
360 uint64_t *slot = query_slot(pool, firstQuery + i);
361 uint32_t statistics = pool->pipeline_statistics;
362 while (statistics) {
363 uint32_t stat = u_bit_scan(&statistics);
364 if (write_results) {
365 uint64_t result = slot[idx * 2 + 2] - slot[idx * 2 + 1];
366
367 /* WaDividePSInvocationCountBy4:HSW,BDW */
368 if ((device->info.gen == 8 || device->info.is_haswell) &&
369 (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT)
370 result >>= 2;
371
372 cpu_write_query_result(pData, flags, idx, result);
373 }
374 idx++;
375 }
376 assert(idx == util_bitcount(pool->pipeline_statistics));
377 break;
378 }
379
380 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
381 uint64_t *slot = query_slot(pool, firstQuery + i);
382 if (write_results)
383 cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]);
384 idx++;
385 if (write_results)
386 cpu_write_query_result(pData, flags, idx, slot[4] - slot[3]);
387 idx++;
388 break;
389 }
390
391 case VK_QUERY_TYPE_TIMESTAMP: {
392 uint64_t *slot = query_slot(pool, firstQuery + i);
393 if (write_results)
394 cpu_write_query_result(pData, flags, idx, slot[1]);
395 idx++;
396 break;
397 }
398
399 case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
400 if (!write_results)
401 break;
402 const void *query_data = query_slot(pool, firstQuery + i);
403 const uint32_t *oa_begin = query_data + intel_perf_mi_rpc_offset(false);
404 const uint32_t *oa_end = query_data + intel_perf_mi_rpc_offset(true);
405 const uint32_t *rpstat_begin = query_data + intel_perf_rpstart_offset(false);
 406       const uint32_t *rpstat_end = query_data + intel_perf_rpstart_offset(true);
407 struct gen_perf_query_result result;
408 struct gen_perf_query_info metric = {
409 .oa_format = (GEN_GEN >= 8 ?
410 I915_OA_FORMAT_A32u40_A4u32_B8_C8 :
411 I915_OA_FORMAT_A45_B8_C8),
412 };
413 uint32_t core_freq[2];
414 #if GEN_GEN < 9
415 core_freq[0] = ((*rpstat_begin >> 7) & 0x7f) * 1000000ULL;
416 core_freq[1] = ((*rpstat_end >> 7) & 0x7f) * 1000000ULL;
417 #else
418 core_freq[0] = ((*rpstat_begin >> 23) & 0x1ff) * 1000000ULL;
419 core_freq[1] = ((*rpstat_end >> 23) & 0x1ff) * 1000000ULL;
420 #endif
421 gen_perf_query_result_clear(&result);
422 gen_perf_query_result_accumulate(&result, &metric,
423 oa_begin, oa_end);
424 gen_perf_query_result_read_frequencies(&result, &device->info,
425 oa_begin, oa_end);
426 gen_perf_query_result_write_mdapi(pData, stride,
427 &device->info,
428 &result,
429 core_freq[0], core_freq[1]);
430 gen_perf_query_mdapi_write_perfcntr(pData, stride, &device->info,
431 query_data + intel_perf_counter(false),
432 query_data + intel_perf_counter(true));
433 const uint64_t *marker = query_data + intel_perf_marker_offset();
434 gen_perf_query_mdapi_write_marker(pData, stride, &device->info, *marker);
435 break;
436 }
437
438 default:
439 unreachable("invalid pool type");
440 }
441
442 if (!write_results)
443 status = VK_NOT_READY;
444
445 if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
446 cpu_write_query_result(pData, flags, idx, available);
447
448 pData += stride;
449 if (pData >= data_end)
450 break;
451 }
452
453 return status;
454 }
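
/* Illustrative sketch, not part of the driver: how an application might read
 * back one occlusion query result written by the path above, together with
 * its availability word. The helper name is hypothetical; the Vulkan entry
 * point and flags are standard. Guarded out so it does not affect the build.
 */
#if 0
#include <vulkan/vulkan.h>

static VkResult
example_read_occlusion_result(VkDevice device, VkQueryPool pool, uint32_t query,
                              uint64_t *samples, uint64_t *available)
{
   /* One 64-bit result value plus the trailing availability word, matching
    * cpu_write_query_result() and the WITH_AVAILABILITY handling above.
    */
   uint64_t data[2] = { 0, 0 };
   VkResult result =
      vkGetQueryPoolResults(device, pool, query, 1,
                            sizeof(data), data, sizeof(data),
                            VK_QUERY_RESULT_64_BIT |
                            VK_QUERY_RESULT_WITH_AVAILABILITY_BIT);

   *samples = data[0];   /* slot[2] - slot[1] computed by the driver */
   *available = data[1]; /* non-zero once the query has landed */
   return result;        /* VK_NOT_READY if unavailable and WAIT not set */
}
#endif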
455
456 static void
457 emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer,
458 struct anv_address addr)
459 {
460 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
461 pc.DestinationAddressType = DAT_PPGTT;
462 pc.PostSyncOperation = WritePSDepthCount;
463 pc.DepthStallEnable = true;
464 pc.Address = addr;
465
466 if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
467 pc.CommandStreamerStallEnable = true;
468 }
469 }
470
471 static void
472 emit_query_mi_availability(struct gen_mi_builder *b,
473 struct anv_address addr,
474 bool available)
475 {
476 gen_mi_store(b, gen_mi_mem64(addr), gen_mi_imm(available));
477 }
478
479 static void
480 emit_query_pc_availability(struct anv_cmd_buffer *cmd_buffer,
481 struct anv_address addr,
482 bool available)
483 {
484 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
485 pc.DestinationAddressType = DAT_PPGTT;
486 pc.PostSyncOperation = WriteImmediateData;
487 pc.Address = addr;
488 pc.ImmediateData = available;
489 }
490 }
491
492 /**
 493  * Goes through a series of consecutive query indices in the given pool,
 494  * setting all element values to 0 and marking the queries as available.
495 */
496 static void
497 emit_zero_queries(struct anv_cmd_buffer *cmd_buffer,
498 struct gen_mi_builder *b, struct anv_query_pool *pool,
499 uint32_t first_index, uint32_t num_queries)
500 {
501 switch (pool->type) {
502 case VK_QUERY_TYPE_OCCLUSION:
503 case VK_QUERY_TYPE_TIMESTAMP:
504 /* These queries are written with a PIPE_CONTROL so clear them using the
505 * PIPE_CONTROL as well so we don't have to synchronize between 2 types
506 * of operations.
507 */
508 assert((pool->stride % 8) == 0);
509 for (uint32_t i = 0; i < num_queries; i++) {
510 struct anv_address slot_addr =
511 anv_query_address(pool, first_index + i);
512
513 for (uint32_t qword = 1; qword < (pool->stride / 8); qword++) {
514 emit_query_pc_availability(cmd_buffer,
515 anv_address_add(slot_addr, qword * 8),
516 false);
517 }
518 emit_query_pc_availability(cmd_buffer, slot_addr, true);
519 }
520 break;
521
522 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
523 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
524 for (uint32_t i = 0; i < num_queries; i++) {
525 struct anv_address slot_addr =
526 anv_query_address(pool, first_index + i);
527 gen_mi_memset(b, anv_address_add(slot_addr, 8), 0, pool->stride - 8);
528 emit_query_mi_availability(b, slot_addr, true);
529 }
530 break;
531
532 case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL:
533 for (uint32_t i = 0; i < num_queries; i++) {
534 struct anv_address slot_addr =
535 anv_query_address(pool, first_index + i);
536 gen_mi_memset(b, slot_addr, 0, pool->stride - 8);
537 emit_query_mi_availability(b, anv_address_add(slot_addr,
538 pool->stride - 8), true);
539 }
540 break;
541
542 default:
543 unreachable("Unsupported query type");
544 }
545 }
546
547 void genX(CmdResetQueryPool)(
548 VkCommandBuffer commandBuffer,
549 VkQueryPool queryPool,
550 uint32_t firstQuery,
551 uint32_t queryCount)
552 {
553 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
554 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
555
556 switch (pool->type) {
557 case VK_QUERY_TYPE_OCCLUSION:
558 case VK_QUERY_TYPE_TIMESTAMP:
559 for (uint32_t i = 0; i < queryCount; i++) {
560 emit_query_pc_availability(cmd_buffer,
561 anv_query_address(pool, firstQuery + i),
562 false);
563 }
564 break;
565
566 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
567 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
568 struct gen_mi_builder b;
569 gen_mi_builder_init(&b, &cmd_buffer->batch);
570
571 for (uint32_t i = 0; i < queryCount; i++)
572 emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), false);
573 break;
574 }
575
576 case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
577 struct gen_mi_builder b;
578 gen_mi_builder_init(&b, &cmd_buffer->batch);
579
580 for (uint32_t i = 0; i < queryCount; i++) {
581 emit_query_mi_availability(
582 &b,
583 anv_address_add(
584 anv_query_address(pool, firstQuery + i),
585 pool->stride - 8),
586 false);
587 }
588 break;
589 }
590
591 default:
592 unreachable("Unsupported query type");
593 }
594 }
595
596 void genX(ResetQueryPoolEXT)(
597 VkDevice _device,
598 VkQueryPool queryPool,
599 uint32_t firstQuery,
600 uint32_t queryCount)
601 {
602 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
603
604 for (uint32_t i = 0; i < queryCount; i++) {
605 uint64_t *slot = query_slot(pool, firstQuery + i);
606 *slot = 0;
607 }
608 }
609
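/* Maps the bit position of each VkQueryPipelineStatisticFlagBits flag (in
 * spec enum order, from INPUT_ASSEMBLY_VERTICES at bit 0 through
 * COMPUTE_SHADER_INVOCATIONS at bit 10) to the corresponding hardware
 * counter register; emit_pipeline_stat() indexes it with the bit number
 * returned by u_bit_scan().
 */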
610 static const uint32_t vk_pipeline_stat_to_reg[] = {
611 GENX(IA_VERTICES_COUNT_num),
612 GENX(IA_PRIMITIVES_COUNT_num),
613 GENX(VS_INVOCATION_COUNT_num),
614 GENX(GS_INVOCATION_COUNT_num),
615 GENX(GS_PRIMITIVES_COUNT_num),
616 GENX(CL_INVOCATION_COUNT_num),
617 GENX(CL_PRIMITIVES_COUNT_num),
618 GENX(PS_INVOCATION_COUNT_num),
619 GENX(HS_INVOCATION_COUNT_num),
620 GENX(DS_INVOCATION_COUNT_num),
621 GENX(CS_INVOCATION_COUNT_num),
622 };
623
624 static void
625 emit_pipeline_stat(struct gen_mi_builder *b, uint32_t stat,
626 struct anv_address addr)
627 {
628 STATIC_ASSERT(ANV_PIPELINE_STATISTICS_MASK ==
629 (1 << ARRAY_SIZE(vk_pipeline_stat_to_reg)) - 1);
630
631 assert(stat < ARRAY_SIZE(vk_pipeline_stat_to_reg));
632 gen_mi_store(b, gen_mi_mem64(addr),
633 gen_mi_reg64(vk_pipeline_stat_to_reg[stat]));
634 }
635
636 static void
637 emit_xfb_query(struct gen_mi_builder *b, uint32_t stream,
638 struct anv_address addr)
639 {
640 assert(stream < MAX_XFB_STREAMS);
641
642 gen_mi_store(b, gen_mi_mem64(anv_address_add(addr, 0)),
643 gen_mi_reg64(GENX(SO_NUM_PRIMS_WRITTEN0_num) + stream * 8));
644 gen_mi_store(b, gen_mi_mem64(anv_address_add(addr, 16)),
645 gen_mi_reg64(GENX(SO_PRIM_STORAGE_NEEDED0_num) + stream * 8));
646 }
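/* CmdBeginQueryIndexedEXT calls this with query_addr + 8 and
 * CmdEndQueryIndexedEXT with query_addr + 16, so with the 16-byte spacing
 * above the slot ends up as: slot[1]/slot[2] = begin/end primitives written,
 * slot[3]/slot[4] = begin/end primitives storage needed, which is exactly
 * what GetQueryPoolResults and CmdCopyQueryPoolResults subtract.
 */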
647
648 void genX(CmdBeginQuery)(
649 VkCommandBuffer commandBuffer,
650 VkQueryPool queryPool,
651 uint32_t query,
652 VkQueryControlFlags flags)
653 {
654 genX(CmdBeginQueryIndexedEXT)(commandBuffer, queryPool, query, flags, 0);
655 }
656
657 void genX(CmdBeginQueryIndexedEXT)(
658 VkCommandBuffer commandBuffer,
659 VkQueryPool queryPool,
660 uint32_t query,
661 VkQueryControlFlags flags,
662 uint32_t index)
663 {
664 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
665 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
666 struct anv_address query_addr = anv_query_address(pool, query);
667
668 struct gen_mi_builder b;
669 gen_mi_builder_init(&b, &cmd_buffer->batch);
670
671 switch (pool->type) {
672 case VK_QUERY_TYPE_OCCLUSION:
673 emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 8));
674 break;
675
676 case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
677 /* TODO: This might only be necessary for certain stats */
678 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
679 pc.CommandStreamerStallEnable = true;
680 pc.StallAtPixelScoreboard = true;
681 }
682
683 uint32_t statistics = pool->pipeline_statistics;
684 uint32_t offset = 8;
685 while (statistics) {
686 uint32_t stat = u_bit_scan(&statistics);
687 emit_pipeline_stat(&b, stat, anv_address_add(query_addr, offset));
688 offset += 16;
689 }
690 break;
691 }
692
693 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
694 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
695 pc.CommandStreamerStallEnable = true;
696 pc.StallAtPixelScoreboard = true;
697 }
698 emit_xfb_query(&b, index, anv_address_add(query_addr, 8));
699 break;
700
701 case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
702 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
703 pc.CommandStreamerStallEnable = true;
704 pc.StallAtPixelScoreboard = true;
705 }
706 anv_batch_emit(&cmd_buffer->batch, GENX(MI_REPORT_PERF_COUNT), rpc) {
707 rpc.MemoryAddress =
708 anv_address_add(query_addr, intel_perf_mi_rpc_offset(false));
709 }
710 #if GEN_GEN < 9
711 gen_mi_store(&b,
712 gen_mi_mem32(anv_address_add(query_addr,
713 intel_perf_rpstart_offset(false))),
714 gen_mi_reg32(GENX(RPSTAT1_num)));
715 #else
716 gen_mi_store(&b,
717 gen_mi_mem32(anv_address_add(query_addr,
718 intel_perf_rpstart_offset(false))),
719 gen_mi_reg32(GENX(RPSTAT0_num)));
720 #endif
721 #if GEN_GEN >= 8 && GEN_GEN <= 11
722 gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr,
723 intel_perf_counter(false))),
724 gen_mi_reg64(GENX(PERFCNT1_num)));
725 gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr,
726 intel_perf_counter(false) + 8)),
727 gen_mi_reg64(GENX(PERFCNT2_num)));
728 #endif
729 break;
730 }
731
732 default:
733 unreachable("");
734 }
735 }
736
737 void genX(CmdEndQuery)(
738 VkCommandBuffer commandBuffer,
739 VkQueryPool queryPool,
740 uint32_t query)
741 {
742 genX(CmdEndQueryIndexedEXT)(commandBuffer, queryPool, query, 0);
743 }
744
745 void genX(CmdEndQueryIndexedEXT)(
746 VkCommandBuffer commandBuffer,
747 VkQueryPool queryPool,
748 uint32_t query,
749 uint32_t index)
750 {
751 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
752 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
753 struct anv_address query_addr = anv_query_address(pool, query);
754
755 struct gen_mi_builder b;
756 gen_mi_builder_init(&b, &cmd_buffer->batch);
757
758 switch (pool->type) {
759 case VK_QUERY_TYPE_OCCLUSION:
760 emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 16));
761 emit_query_pc_availability(cmd_buffer, query_addr, true);
762 break;
763
764 case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
765 /* TODO: This might only be necessary for certain stats */
766 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
767 pc.CommandStreamerStallEnable = true;
768 pc.StallAtPixelScoreboard = true;
769 }
770
771 uint32_t statistics = pool->pipeline_statistics;
772 uint32_t offset = 16;
773 while (statistics) {
774 uint32_t stat = u_bit_scan(&statistics);
775 emit_pipeline_stat(&b, stat, anv_address_add(query_addr, offset));
776 offset += 16;
777 }
778
779 emit_query_mi_availability(&b, query_addr, true);
780 break;
781 }
782
783 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
784 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
785 pc.CommandStreamerStallEnable = true;
786 pc.StallAtPixelScoreboard = true;
787 }
788
789 emit_xfb_query(&b, index, anv_address_add(query_addr, 16));
790 emit_query_mi_availability(&b, query_addr, true);
791 break;
792
793 case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
794 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
795 pc.CommandStreamerStallEnable = true;
796 pc.StallAtPixelScoreboard = true;
797 }
798 uint32_t marker_offset = intel_perf_marker_offset();
799 gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, marker_offset)),
800 gen_mi_imm(cmd_buffer->intel_perf_marker));
801 #if GEN_GEN >= 8 && GEN_GEN <= 11
802 gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, intel_perf_counter(true))),
803 gen_mi_reg64(GENX(PERFCNT1_num)));
804 gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, intel_perf_counter(true) + 8)),
805 gen_mi_reg64(GENX(PERFCNT2_num)));
806 #endif
807 #if GEN_GEN < 9
808 gen_mi_store(&b,
809 gen_mi_mem32(anv_address_add(query_addr,
810 intel_perf_rpstart_offset(true))),
811 gen_mi_reg32(GENX(RPSTAT1_num)));
812 #else
813 gen_mi_store(&b,
814 gen_mi_mem32(anv_address_add(query_addr,
815 intel_perf_rpstart_offset(true))),
816 gen_mi_reg32(GENX(RPSTAT0_num)));
817 #endif
818 /* Position the last OA snapshot at the beginning of the query so that
819 * we can tell whether it's ready.
820 */
821 anv_batch_emit(&cmd_buffer->batch, GENX(MI_REPORT_PERF_COUNT), rpc) {
822 rpc.MemoryAddress = anv_address_add(query_addr,
823 intel_perf_mi_rpc_offset(true));
824 rpc.ReportID = 0xdeadbeef; /* This goes in the first dword */
825 }
826 emit_query_mi_availability(&b,
827 anv_address_add(query_addr, pool->stride - 8),
828 true);
829 break;
830 }
831
832 default:
833 unreachable("");
834 }
835
836 /* When multiview is active the spec requires that N consecutive query
837 * indices are used, where N is the number of active views in the subpass.
 838     * The spec allows us to write the results to only one of the queries,
 839     * but we still need to manage result availability for all the query indices.
 840     * Since we emit a single query covering all active views in the first
 841     * index, mark the remaining query indices as already available with
 842     * result 0.
843 */
844 if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
845 const uint32_t num_queries =
846 util_bitcount(cmd_buffer->state.subpass->view_mask);
847 if (num_queries > 1)
848 emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1);
849 }
850 }
851
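/* MMIO offset of the render command streamer TIMESTAMP register. */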
852 #define TIMESTAMP 0x2358
853
854 void genX(CmdWriteTimestamp)(
855 VkCommandBuffer commandBuffer,
856 VkPipelineStageFlagBits pipelineStage,
857 VkQueryPool queryPool,
858 uint32_t query)
859 {
860 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
861 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
862 struct anv_address query_addr = anv_query_address(pool, query);
863
864 assert(pool->type == VK_QUERY_TYPE_TIMESTAMP);
865
866 struct gen_mi_builder b;
867 gen_mi_builder_init(&b, &cmd_buffer->batch);
868
869 switch (pipelineStage) {
870 case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
871 gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, 8)),
872 gen_mi_reg64(TIMESTAMP));
873 break;
874
875 default:
876 /* Everything else is bottom-of-pipe */
877 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
878 pc.DestinationAddressType = DAT_PPGTT;
879 pc.PostSyncOperation = WriteTimestamp;
880 pc.Address = anv_address_add(query_addr, 8);
881
882 if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
883 pc.CommandStreamerStallEnable = true;
884 }
885 break;
886 }
887
888 emit_query_pc_availability(cmd_buffer, query_addr, true);
889
890 /* When multiview is active the spec requires that N consecutive query
891 * indices are used, where N is the number of active views in the subpass.
 892     * The spec allows us to write the results to only one of the queries,
 893     * but we still need to manage result availability for all the query indices.
 894     * Since we emit a single query covering all active views in the first
 895     * index, mark the remaining query indices as already available with
 896     * result 0.
897 */
898 if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
899 const uint32_t num_queries =
900 util_bitcount(cmd_buffer->state.subpass->view_mask);
901 if (num_queries > 1)
902 emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1);
903 }
904 }
905
906 #if GEN_GEN > 7 || GEN_IS_HASWELL
907
908 static void
909 gpu_write_query_result(struct gen_mi_builder *b,
910 struct anv_address dst_addr,
911 VkQueryResultFlags flags,
912 uint32_t value_index,
913 struct gen_mi_value query_result)
914 {
915 if (flags & VK_QUERY_RESULT_64_BIT) {
916 struct anv_address res_addr = anv_address_add(dst_addr, value_index * 8);
917 gen_mi_store(b, gen_mi_mem64(res_addr), query_result);
918 } else {
919 struct anv_address res_addr = anv_address_add(dst_addr, value_index * 4);
920 gen_mi_store(b, gen_mi_mem32(res_addr), query_result);
921 }
922 }
923
924 static struct gen_mi_value
925 compute_query_result(struct gen_mi_builder *b, struct anv_address addr)
926 {
927 return gen_mi_isub(b, gen_mi_mem64(anv_address_add(addr, 8)),
928 gen_mi_mem64(anv_address_add(addr, 0)));
929 }
930
931 void genX(CmdCopyQueryPoolResults)(
932 VkCommandBuffer commandBuffer,
933 VkQueryPool queryPool,
934 uint32_t firstQuery,
935 uint32_t queryCount,
936 VkBuffer destBuffer,
937 VkDeviceSize destOffset,
938 VkDeviceSize destStride,
939 VkQueryResultFlags flags)
940 {
941 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
942 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
943 ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);
944
945 struct gen_mi_builder b;
946 gen_mi_builder_init(&b, &cmd_buffer->batch);
947 struct gen_mi_value result;
948
949 /* If render target writes are ongoing, request a render target cache flush
950 * to ensure proper ordering of the commands from the 3d pipe and the
951 * command streamer.
952 */
953 if (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_RENDER_TARGET_BUFFER_WRITES) {
954 cmd_buffer->state.pending_pipe_bits |=
955 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
956 }
957
958 if ((flags & VK_QUERY_RESULT_WAIT_BIT) ||
959 (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_FLUSH_BITS) ||
 960        /* Occlusion & timestamp queries are written with a PIPE_CONTROL and,
 961         * because we're about to copy their values using MI commands, we need
 962         * to stall the command streamer to make sure the PIPE_CONTROL writes
 963         * have landed; otherwise we could see inconsistent values & availability.
964 *
965 * From the vulkan spec:
966 *
967 * "vkCmdCopyQueryPoolResults is guaranteed to see the effect of
968 * previous uses of vkCmdResetQueryPool in the same queue, without
969 * any additional synchronization."
970 */
971 pool->type == VK_QUERY_TYPE_OCCLUSION ||
972 pool->type == VK_QUERY_TYPE_TIMESTAMP) {
973 cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT;
974 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
975 }
976
977 struct anv_address dest_addr = anv_address_add(buffer->address, destOffset);
978 for (uint32_t i = 0; i < queryCount; i++) {
979 struct anv_address query_addr = anv_query_address(pool, firstQuery + i);
980 uint32_t idx = 0;
981 switch (pool->type) {
982 case VK_QUERY_TYPE_OCCLUSION:
983 result = compute_query_result(&b, anv_address_add(query_addr, 8));
984 gpu_write_query_result(&b, dest_addr, flags, idx++, result);
985 break;
986
987 case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
988 uint32_t statistics = pool->pipeline_statistics;
989 while (statistics) {
990 uint32_t stat = u_bit_scan(&statistics);
991
992 result = compute_query_result(&b, anv_address_add(query_addr,
993 idx * 16 + 8));
994
995 /* WaDividePSInvocationCountBy4:HSW,BDW */
996 if ((cmd_buffer->device->info.gen == 8 ||
997 cmd_buffer->device->info.is_haswell) &&
998 (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) {
999 result = gen_mi_ushr32_imm(&b, result, 2);
1000 }
1001
1002 gpu_write_query_result(&b, dest_addr, flags, idx++, result);
1003 }
1004 assert(idx == util_bitcount(pool->pipeline_statistics));
1005 break;
1006 }
1007
1008 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1009 result = compute_query_result(&b, anv_address_add(query_addr, 8));
1010 gpu_write_query_result(&b, dest_addr, flags, idx++, result);
1011 result = compute_query_result(&b, anv_address_add(query_addr, 24));
1012 gpu_write_query_result(&b, dest_addr, flags, idx++, result);
1013 break;
1014
1015 case VK_QUERY_TYPE_TIMESTAMP:
1016 result = gen_mi_mem64(anv_address_add(query_addr, 8));
1017 gpu_write_query_result(&b, dest_addr, flags, 0, result);
1018 break;
1019
1020 default:
1021 unreachable("unhandled query type");
1022 }
1023
1024 if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
1025 gpu_write_query_result(&b, dest_addr, flags, idx,
1026 gen_mi_mem64(query_addr));
1027 }
1028
1029 dest_addr = anv_address_add(dest_addr, destStride);
1030 }
1031 }
1032
1033 #else
1034 void genX(CmdCopyQueryPoolResults)(
1035 VkCommandBuffer commandBuffer,
1036 VkQueryPool queryPool,
1037 uint32_t firstQuery,
1038 uint32_t queryCount,
1039 VkBuffer destBuffer,
1040 VkDeviceSize destOffset,
1041 VkDeviceSize destStride,
1042 VkQueryResultFlags flags)
1043 {
1044 anv_finishme("Queries not yet supported on Ivy Bridge");
1045 }
1046 #endif
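
/* Illustrative sketch, not part of the driver: recording a GPU-side copy of a
 * range of query results into a buffer with the availability word appended,
 * which is the layout genX(CmdCopyQueryPoolResults) writes above. The helper
 * name is hypothetical; the Vulkan entry point and flags are standard.
 * Guarded out so it does not affect the build.
 */
#if 0
#include <vulkan/vulkan.h>

static void
example_copy_occlusion_results(VkCommandBuffer cmd, VkQueryPool pool,
                               uint32_t first_query, uint32_t query_count,
                               VkBuffer dst, VkDeviceSize dst_offset)
{
   /* One 64-bit result value plus one availability value per query. */
   const VkDeviceSize result_stride = 2 * sizeof(uint64_t);

   vkCmdCopyQueryPoolResults(cmd, pool, first_query, query_count,
                             dst, dst_offset, result_stride,
                             VK_QUERY_RESULT_64_BIT |
                             VK_QUERY_RESULT_WITH_AVAILABILITY_BIT);
}
#endif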