1 /*
2 * Copyright © 2015 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include <assert.h>
25 #include <stdbool.h>
26 #include <string.h>
27 #include <unistd.h>
28 #include <fcntl.h>
29
30 #include "anv_private.h"
31
32 #include "genxml/gen_macros.h"
33 #include "genxml/genX_pack.h"
34
35 /* We reserve GPR 14 and 15 for conditional rendering */
36 #define GEN_MI_BUILDER_NUM_ALLOC_GPRS 14
37 #define __gen_get_batch_dwords anv_batch_emit_dwords
38 #define __gen_address_offset anv_address_add
39 #include "common/gen_mi_builder.h"
40 #include "perf/gen_perf.h"
41 #include "perf/gen_perf_mdapi.h"
42
43 #define OA_REPORT_N_UINT64 (256 / sizeof(uint64_t))
44
45 VkResult genX(CreateQueryPool)(
46 VkDevice _device,
47 const VkQueryPoolCreateInfo* pCreateInfo,
48 const VkAllocationCallbacks* pAllocator,
49 VkQueryPool* pQueryPool)
50 {
51 ANV_FROM_HANDLE(anv_device, device, _device);
52 const struct anv_physical_device *pdevice = device->physical;
53 struct anv_query_pool *pool;
54 VkResult result;
55
56 assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
57
58 /* Query pool slots are made up of some number of 64-bit values packed
59 * tightly together. For most query types, the first 64-bit value is the
60 * "available" bit, which is 0 when the query is unavailable and 1 when
61 * it is available. The 64-bit values that follow are determined by the
62 * type of query.
63 *
64 * For performance queries, OA reports must be aligned to 64 bytes, so we
65 * put those first and place the "available" bit after them, together
66 * with some other counters.
67 */
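/* For example, an occlusion query slot is 3 uint64s (24-byte stride):
 * [0] availability, [1] begin PS_DEPTH_COUNT, [2] end PS_DEPTH_COUNT.
 */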
68 uint32_t uint64s_per_slot = 1;
69
70 VkQueryPipelineStatisticFlags pipeline_statistics = 0;
71 switch (pCreateInfo->queryType) {
72 case VK_QUERY_TYPE_OCCLUSION:
73 /* Occlusion queries have two values: begin and end. */
74 uint64s_per_slot += 2;
75 break;
76 case VK_QUERY_TYPE_TIMESTAMP:
77 /* Timestamps just have the one timestamp value */
78 uint64s_per_slot += 1;
79 break;
80 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
81 pipeline_statistics = pCreateInfo->pipelineStatistics;
82 /* We're going to trust this field implicitly so we need to ensure that
83 * no unhandled extension bits leak in.
84 */
85 pipeline_statistics &= ANV_PIPELINE_STATISTICS_MASK;
86
87 /* Statistics queries have a begin and an end value for every statistic */
88 uint64s_per_slot += 2 * util_bitcount(pipeline_statistics);
89 break;
90 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
91 /* Transform feedback queries are 4 values: begin/end for primitives
92 * written and begin/end for primitives needed.
93 */
94 uint64s_per_slot += 4;
95 break;
96 case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
97 uint64s_per_slot = 72; /* 576 bytes, see layout below */
98 break;
99 }
100 default:
101 assert(!"Invalid query type");
102 }
103
104 pool = vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
105 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
106 if (pool == NULL)
107 return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
108
109 pool->type = pCreateInfo->queryType;
110 pool->pipeline_statistics = pipeline_statistics;
111 pool->stride = uint64s_per_slot * sizeof(uint64_t);
112 pool->slots = pCreateInfo->queryCount;
113
114 uint32_t bo_flags = 0;
115 if (pdevice->supports_48bit_addresses)
116 bo_flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
117
118 if (pdevice->use_softpin)
119 bo_flags |= EXEC_OBJECT_PINNED;
120
121 if (pdevice->has_exec_async)
122 bo_flags |= EXEC_OBJECT_ASYNC;
123
124 uint64_t size = pool->slots * pool->stride;
125 result = anv_device_alloc_bo(device, size,
126 ANV_BO_ALLOC_MAPPED |
127 ANV_BO_ALLOC_SNOOPED,
128 0 /* explicit_address */,
129 &pool->bo);
130 if (result != VK_SUCCESS)
131 goto fail;
132
133 *pQueryPool = anv_query_pool_to_handle(pool);
134
135 return VK_SUCCESS;
136
137 fail:
138 vk_free2(&device->alloc, pAllocator, pool);
139
140 return result;
141 }
142
143 void genX(DestroyQueryPool)(
144 VkDevice _device,
145 VkQueryPool _pool,
146 const VkAllocationCallbacks* pAllocator)
147 {
148 ANV_FROM_HANDLE(anv_device, device, _device);
149 ANV_FROM_HANDLE(anv_query_pool, pool, _pool);
150
151 if (!pool)
152 return;
153
154 anv_device_release_bo(device, pool->bo);
155 vk_free2(&device->alloc, pAllocator, pool);
156 }
157
158 static struct anv_address
159 anv_query_address(struct anv_query_pool *pool, uint32_t query)
160 {
161 return (struct anv_address) {
162 .bo = pool->bo,
163 .offset = query * pool->stride,
164 };
165 }
166
167 /**
168 * VK_INTEL_performance_query layout (576 bytes) :
169 *
170 * ------------------------------
171 * | availability (8b) |
172 * |----------------------------|
173 * | marker (8b) |
174 * |----------------------------|
175 * | begin RPSTAT register (4b) |
176 * |----------------------------|
177 * | end RPSTAT register (4b) |
178 * |----------------------------|
179 * | begin perfcntr 1 & 2 (16b) |
180 * |----------------------------|
181 * | end perfcntr 1 & 2 (16b) |
182 * |----------------------------|
183 * | Unused (8b) |
184 * |----------------------------|
185 * | begin MI_RPC (256b) |
186 * |----------------------------|
187 * | end MI_RPC (256b) |
188 * ------------------------------
189 */
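/* Resulting byte offsets within a slot (see the helpers below):
 * availability = 0, marker = 8, begin/end RPSTAT = 16/20,
 * begin/end perfcntr = 24/40, begin/end MI_RPC = 64/320, total = 576.
 */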
190
191 static uint32_t
192 intel_perf_marker_offset(void)
193 {
194 return 8;
195 }
196
197 static uint32_t
198 intel_perf_rpstart_offset(bool end)
199 {
200 return 16 + (end ? sizeof(uint32_t) : 0);
201 }
202
203 static uint32_t
204 intel_perf_counter(bool end)
205 {
206 return 24 + (end ? (2 * sizeof(uint64_t)) : 0);
207 }
208
209 static uint32_t
210 intel_perf_mi_rpc_offset(bool end)
211 {
212 return 64 + (end ? 256 : 0);
213 }
214
215 static void
216 cpu_write_query_result(void *dst_slot, VkQueryResultFlags flags,
217 uint32_t value_index, uint64_t result)
218 {
219 if (flags & VK_QUERY_RESULT_64_BIT) {
220 uint64_t *dst64 = dst_slot;
221 dst64[value_index] = result;
222 } else {
223 uint32_t *dst32 = dst_slot;
224 dst32[value_index] = result;
225 }
226 }
227
228 static void *
229 query_slot(struct anv_query_pool *pool, uint32_t query)
230 {
231 return pool->bo->map + query * pool->stride;
232 }
233
234 static bool
235 query_is_available(struct anv_query_pool *pool, uint32_t query)
236 {
237 return *(volatile uint64_t *)query_slot(pool, query);
238 }
239
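/* Busy-waits on the availability qword of the given query, bailing out if
 * the device is lost and marking it lost if the query does not become
 * available within 5 seconds.
 */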
240 static VkResult
241 wait_for_available(struct anv_device *device,
242 struct anv_query_pool *pool, uint32_t query)
243 {
244 uint64_t abs_timeout = anv_get_absolute_timeout(5 * NSEC_PER_SEC);
245
246 while (anv_gettime_ns() < abs_timeout) {
247 if (query_is_available(pool, query))
248 return VK_SUCCESS;
249 VkResult status = anv_device_query_status(device);
250 if (status != VK_SUCCESS)
251 return status;
252 }
253
254 return anv_device_set_lost(device, "query timeout");
255 }
256
257 VkResult genX(GetQueryPoolResults)(
258 VkDevice _device,
259 VkQueryPool queryPool,
260 uint32_t firstQuery,
261 uint32_t queryCount,
262 size_t dataSize,
263 void* pData,
264 VkDeviceSize stride,
265 VkQueryResultFlags flags)
266 {
267 ANV_FROM_HANDLE(anv_device, device, _device);
268 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
269
270 assert(pool->type == VK_QUERY_TYPE_OCCLUSION ||
271 pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
272 pool->type == VK_QUERY_TYPE_TIMESTAMP ||
273 pool->type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT ||
274 pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL);
275
276 if (anv_device_is_lost(device))
277 return VK_ERROR_DEVICE_LOST;
278
279 if (pData == NULL)
280 return VK_SUCCESS;
281
282 void *data_end = pData + dataSize;
283
284 VkResult status = VK_SUCCESS;
285 for (uint32_t i = 0; i < queryCount; i++) {
286 bool available = query_is_available(pool, firstQuery + i);
287
288 if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) {
289 status = wait_for_available(device, pool, firstQuery + i);
290 if (status != VK_SUCCESS)
291 return status;
292
293 available = true;
294 }
295
296 /* From the Vulkan 1.0.42 spec:
297 *
298 * "If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
299 * both not set then no result values are written to pData for
300 * queries that are in the unavailable state at the time of the call,
301 * and vkGetQueryPoolResults returns VK_NOT_READY. However,
302 * availability state is still written to pData for those queries if
303 * VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set."
304 */
305 bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT);
306
307 uint32_t idx = 0;
308 switch (pool->type) {
309 case VK_QUERY_TYPE_OCCLUSION: {
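/* slot[1] and slot[2] hold the begin and end PS_DEPTH_COUNT snapshots
 * written by CmdBeginQuery/CmdEndQuery.
 */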
310 uint64_t *slot = query_slot(pool, firstQuery + i);
311 if (write_results) {
312 /* From the Vulkan 1.2.132 spec:
313 *
314 * "If VK_QUERY_RESULT_PARTIAL_BIT is set,
315 * VK_QUERY_RESULT_WAIT_BIT is not set, and the query’s status
316 * is unavailable, an intermediate result value between zero and
317 * the final result value is written to pData for that query."
318 */
319 uint64_t result = available ? slot[2] - slot[1] : 0;
320 cpu_write_query_result(pData, flags, idx, result);
321 }
322 idx++;
323 break;
324 }
325
326 case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
327 uint64_t *slot = query_slot(pool, firstQuery + i);
328 uint32_t statistics = pool->pipeline_statistics;
329 while (statistics) {
330 uint32_t stat = u_bit_scan(&statistics);
331 if (write_results) {
332 uint64_t result = slot[idx * 2 + 2] - slot[idx * 2 + 1];
333
334 /* WaDividePSInvocationCountBy4:HSW,BDW */
335 if ((device->info.gen == 8 || device->info.is_haswell) &&
336 (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT)
337 result >>= 2;
338
339 cpu_write_query_result(pData, flags, idx, result);
340 }
341 idx++;
342 }
343 assert(idx == util_bitcount(pool->pipeline_statistics));
344 break;
345 }
346
347 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
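/* slot[1]/slot[2] are the begin/end SO_NUM_PRIMS_WRITTEN snapshots and
 * slot[3]/slot[4] the begin/end SO_PRIM_STORAGE_NEEDED snapshots (see
 * emit_xfb_query), i.e. primitives written and primitives needed.
 */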
348 uint64_t *slot = query_slot(pool, firstQuery + i);
349 if (write_results)
350 cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]);
351 idx++;
352 if (write_results)
353 cpu_write_query_result(pData, flags, idx, slot[4] - slot[3]);
354 idx++;
355 break;
356 }
357
358 case VK_QUERY_TYPE_TIMESTAMP: {
359 uint64_t *slot = query_slot(pool, firstQuery + i);
360 if (write_results)
361 cpu_write_query_result(pData, flags, idx, slot[1]);
362 idx++;
363 break;
364 }
365
366 case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
367 if (!write_results)
368 break;
369 const void *query_data = query_slot(pool, firstQuery + i);
370 const uint32_t *oa_begin = query_data + intel_perf_mi_rpc_offset(false);
371 const uint32_t *oa_end = query_data + intel_perf_mi_rpc_offset(true);
372 const uint32_t *rpstat_begin = query_data + intel_perf_rpstart_offset(false);
373 const uint32_t *rpstat_end = query_data + intel_perf_rpstart_offset(true);
374 struct gen_perf_query_result result;
375 struct gen_perf_query_info metric = {
376 .oa_format = (GEN_GEN >= 8 ?
377 I915_OA_FORMAT_A32u40_A4u32_B8_C8 :
378 I915_OA_FORMAT_A45_B8_C8),
379 };
380 uint32_t core_freq[2];
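/* Extract the current core frequency field from the begin/end RPSTAT
 * snapshots (its bit position differs before/after Gen9) and scale it
 * by 1e6.
 */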
381 #if GEN_GEN < 9
382 core_freq[0] = ((*rpstat_begin >> 7) & 0x7f) * 1000000ULL;
383 core_freq[1] = ((*rpstat_end >> 7) & 0x7f) * 1000000ULL;
384 #else
385 core_freq[0] = ((*rpstat_begin >> 23) & 0x1ff) * 1000000ULL;
386 core_freq[1] = ((*rpstat_end >> 23) & 0x1ff) * 1000000ULL;
387 #endif
388 gen_perf_query_result_clear(&result);
389 gen_perf_query_result_accumulate(&result, &metric,
390 oa_begin, oa_end);
391 gen_perf_query_result_read_frequencies(&result, &device->info,
392 oa_begin, oa_end);
393 gen_perf_query_result_write_mdapi(pData, stride,
394 &device->info,
395 &result,
396 core_freq[0], core_freq[1]);
397 gen_perf_query_mdapi_write_perfcntr(pData, stride, &device->info,
398 query_data + intel_perf_counter(false),
399 query_data + intel_perf_counter(true));
400 const uint64_t *marker = query_data + intel_perf_marker_offset();
401 gen_perf_query_mdapi_write_marker(pData, stride, &device->info, *marker);
402 break;
403 }
404
405 default:
406 unreachable("invalid pool type");
407 }
408
409 if (!write_results)
410 status = VK_NOT_READY;
411
412 if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
413 cpu_write_query_result(pData, flags, idx, available);
414
415 pData += stride;
416 if (pData >= data_end)
417 break;
418 }
419
420 return status;
421 }
422
423 static void
424 emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer,
425 struct anv_address addr)
426 {
427 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
428 pc.DestinationAddressType = DAT_PPGTT;
429 pc.PostSyncOperation = WritePSDepthCount;
430 pc.DepthStallEnable = true;
431 pc.Address = addr;
432
433 if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
434 pc.CommandStreamerStallEnable = true;
435 }
436 }
437
438 static void
439 emit_query_mi_availability(struct gen_mi_builder *b,
440 struct anv_address addr,
441 bool available)
442 {
443 gen_mi_store(b, gen_mi_mem64(addr), gen_mi_imm(available));
444 }
445
446 static void
447 emit_query_pc_availability(struct anv_cmd_buffer *cmd_buffer,
448 struct anv_address addr,
449 bool available)
450 {
451 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
452 pc.DestinationAddressType = DAT_PPGTT;
453 pc.PostSyncOperation = WriteImmediateData;
454 pc.Address = addr;
455 pc.ImmediateData = available;
456 }
457 }
458
459 /**
460 * Goes through a series of consecutive query indices in the given pool,
461 * setting all result values to 0 and marking each query as available.
462 */
463 static void
464 emit_zero_queries(struct anv_cmd_buffer *cmd_buffer,
465 struct gen_mi_builder *b, struct anv_query_pool *pool,
466 uint32_t first_index, uint32_t num_queries)
467 {
468 switch (pool->type) {
469 case VK_QUERY_TYPE_OCCLUSION:
470 case VK_QUERY_TYPE_TIMESTAMP:
471 /* These queries are written with a PIPE_CONTROL, so clear them with a
472 * PIPE_CONTROL as well; that way we don't have to synchronize between
473 * two types of operations.
474 */
475 assert((pool->stride % 8) == 0);
476 for (uint32_t i = 0; i < num_queries; i++) {
477 struct anv_address slot_addr =
478 anv_query_address(pool, first_index + i);
479
480 for (uint32_t qword = 1; qword < (pool->stride / 8); qword++) {
481 emit_query_pc_availability(cmd_buffer,
482 anv_address_add(slot_addr, qword * 8),
483 false);
484 }
485 emit_query_pc_availability(cmd_buffer, slot_addr, true);
486 }
487 break;
488
489 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
490 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
491 for (uint32_t i = 0; i < num_queries; i++) {
492 struct anv_address slot_addr =
493 anv_query_address(pool, first_index + i);
494 gen_mi_memset(b, anv_address_add(slot_addr, 8), 0, pool->stride - 8);
495 emit_query_mi_availability(b, slot_addr, true);
496 }
497 break;
498
499 case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL:
500 for (uint32_t i = 0; i < num_queries; i++) {
501 struct anv_address slot_addr =
502 anv_query_address(pool, first_index + i);
503 gen_mi_memset(b, anv_address_add(slot_addr, 8), 0, pool->stride - 8);
504 emit_query_mi_availability(b, slot_addr, true);
505 }
506 break;
507
508 default:
509 unreachable("Unsupported query type");
510 }
511 }
512
513 void genX(CmdResetQueryPool)(
514 VkCommandBuffer commandBuffer,
515 VkQueryPool queryPool,
516 uint32_t firstQuery,
517 uint32_t queryCount)
518 {
519 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
520 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
521
522 switch (pool->type) {
523 case VK_QUERY_TYPE_OCCLUSION:
524 case VK_QUERY_TYPE_TIMESTAMP:
525 for (uint32_t i = 0; i < queryCount; i++) {
526 emit_query_pc_availability(cmd_buffer,
527 anv_query_address(pool, firstQuery + i),
528 false);
529 }
530 break;
531
532 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
533 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
534 struct gen_mi_builder b;
535 gen_mi_builder_init(&b, &cmd_buffer->batch);
536
537 for (uint32_t i = 0; i < queryCount; i++)
538 emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), false);
539 break;
540 }
541
542 case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
543 struct gen_mi_builder b;
544 gen_mi_builder_init(&b, &cmd_buffer->batch);
545
546 for (uint32_t i = 0; i < queryCount; i++)
547 emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), false);
548 break;
549 }
550
551 default:
552 unreachable("Unsupported query type");
553 }
554 }
555
556 void genX(ResetQueryPool)(
557 VkDevice _device,
558 VkQueryPool queryPool,
559 uint32_t firstQuery,
560 uint32_t queryCount)
561 {
562 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
563
564 for (uint32_t i = 0; i < queryCount; i++) {
565 uint64_t *slot = query_slot(pool, firstQuery + i);
566 *slot = 0;
567 }
568 }
569
570 static const uint32_t vk_pipeline_stat_to_reg[] = {
571 GENX(IA_VERTICES_COUNT_num),
572 GENX(IA_PRIMITIVES_COUNT_num),
573 GENX(VS_INVOCATION_COUNT_num),
574 GENX(GS_INVOCATION_COUNT_num),
575 GENX(GS_PRIMITIVES_COUNT_num),
576 GENX(CL_INVOCATION_COUNT_num),
577 GENX(CL_PRIMITIVES_COUNT_num),
578 GENX(PS_INVOCATION_COUNT_num),
579 GENX(HS_INVOCATION_COUNT_num),
580 GENX(DS_INVOCATION_COUNT_num),
581 GENX(CS_INVOCATION_COUNT_num),
582 };
583
584 static void
585 emit_pipeline_stat(struct gen_mi_builder *b, uint32_t stat,
586 struct anv_address addr)
587 {
588 STATIC_ASSERT(ANV_PIPELINE_STATISTICS_MASK ==
589 (1 << ARRAY_SIZE(vk_pipeline_stat_to_reg)) - 1);
590
591 assert(stat < ARRAY_SIZE(vk_pipeline_stat_to_reg));
592 gen_mi_store(b, gen_mi_mem64(addr),
593 gen_mi_reg64(vk_pipeline_stat_to_reg[stat]));
594 }
595
596 static void
597 emit_xfb_query(struct gen_mi_builder *b, uint32_t stream,
598 struct anv_address addr)
599 {
600 assert(stream < MAX_XFB_STREAMS);
601
602 gen_mi_store(b, gen_mi_mem64(anv_address_add(addr, 0)),
603 gen_mi_reg64(GENX(SO_NUM_PRIMS_WRITTEN0_num) + stream * 8));
604 gen_mi_store(b, gen_mi_mem64(anv_address_add(addr, 16)),
605 gen_mi_reg64(GENX(SO_PRIM_STORAGE_NEEDED0_num) + stream * 8));
606 }
607
608 void genX(CmdBeginQuery)(
609 VkCommandBuffer commandBuffer,
610 VkQueryPool queryPool,
611 uint32_t query,
612 VkQueryControlFlags flags)
613 {
614 genX(CmdBeginQueryIndexedEXT)(commandBuffer, queryPool, query, flags, 0);
615 }
616
617 void genX(CmdBeginQueryIndexedEXT)(
618 VkCommandBuffer commandBuffer,
619 VkQueryPool queryPool,
620 uint32_t query,
621 VkQueryControlFlags flags,
622 uint32_t index)
623 {
624 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
625 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
626 struct anv_address query_addr = anv_query_address(pool, query);
627
628 struct gen_mi_builder b;
629 gen_mi_builder_init(&b, &cmd_buffer->batch);
630
631 switch (pool->type) {
632 case VK_QUERY_TYPE_OCCLUSION:
633 emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 8));
634 break;
635
636 case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
637 /* TODO: This might only be necessary for certain stats */
638 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
639 pc.CommandStreamerStallEnable = true;
640 pc.StallAtPixelScoreboard = true;
641 }
642
643 uint32_t statistics = pool->pipeline_statistics;
644 uint32_t offset = 8;
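/* Each enabled statistic occupies a begin/end pair of qwords after the
 * availability qword; begin snapshots go in the first qword of each pair.
 */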
645 while (statistics) {
646 uint32_t stat = u_bit_scan(&statistics);
647 emit_pipeline_stat(&b, stat, anv_address_add(query_addr, offset));
648 offset += 16;
649 }
650 break;
651 }
652
653 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
654 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
655 pc.CommandStreamerStallEnable = true;
656 pc.StallAtPixelScoreboard = true;
657 }
658 emit_xfb_query(&b, index, anv_address_add(query_addr, 8));
659 break;
660
661 case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
662 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
663 pc.CommandStreamerStallEnable = true;
664 pc.StallAtPixelScoreboard = true;
665 }
666 anv_batch_emit(&cmd_buffer->batch, GENX(MI_REPORT_PERF_COUNT), rpc) {
667 rpc.MemoryAddress =
668 anv_address_add(query_addr, intel_perf_mi_rpc_offset(false));
669 }
670 #if GEN_GEN < 9
671 gen_mi_store(&b,
672 gen_mi_mem32(anv_address_add(query_addr,
673 intel_perf_rpstart_offset(false))),
674 gen_mi_reg32(GENX(RPSTAT1_num)));
675 #else
676 gen_mi_store(&b,
677 gen_mi_mem32(anv_address_add(query_addr,
678 intel_perf_rpstart_offset(false))),
679 gen_mi_reg32(GENX(RPSTAT0_num)));
680 #endif
681 #if GEN_GEN >= 8 && GEN_GEN <= 11
682 gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr,
683 intel_perf_counter(false))),
684 gen_mi_reg64(GENX(PERFCNT1_num)));
685 gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr,
686 intel_perf_counter(false) + 8)),
687 gen_mi_reg64(GENX(PERFCNT2_num)));
688 #endif
689 break;
690 }
691
692 default:
693 unreachable("");
694 }
695 }
696
697 void genX(CmdEndQuery)(
698 VkCommandBuffer commandBuffer,
699 VkQueryPool queryPool,
700 uint32_t query)
701 {
702 genX(CmdEndQueryIndexedEXT)(commandBuffer, queryPool, query, 0);
703 }
704
705 void genX(CmdEndQueryIndexedEXT)(
706 VkCommandBuffer commandBuffer,
707 VkQueryPool queryPool,
708 uint32_t query,
709 uint32_t index)
710 {
711 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
712 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
713 struct anv_address query_addr = anv_query_address(pool, query);
714
715 struct gen_mi_builder b;
716 gen_mi_builder_init(&b, &cmd_buffer->batch);
717
718 switch (pool->type) {
719 case VK_QUERY_TYPE_OCCLUSION:
720 emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 16));
721 emit_query_pc_availability(cmd_buffer, query_addr, true);
722 break;
723
724 case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
725 /* TODO: This might only be necessary for certain stats */
726 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
727 pc.CommandStreamerStallEnable = true;
728 pc.StallAtPixelScoreboard = true;
729 }
730
731 uint32_t statistics = pool->pipeline_statistics;
732 uint32_t offset = 16;
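/* End snapshots land in the second qword of each begin/end pair; the
 * begin snapshots were written at offset 8 by CmdBeginQueryIndexedEXT.
 */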
733 while (statistics) {
734 uint32_t stat = u_bit_scan(&statistics);
735 emit_pipeline_stat(&b, stat, anv_address_add(query_addr, offset));
736 offset += 16;
737 }
738
739 emit_query_mi_availability(&b, query_addr, true);
740 break;
741 }
742
743 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
744 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
745 pc.CommandStreamerStallEnable = true;
746 pc.StallAtPixelScoreboard = true;
747 }
748
749 emit_xfb_query(&b, index, anv_address_add(query_addr, 16));
750 emit_query_mi_availability(&b, query_addr, true);
751 break;
752
753 case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
754 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
755 pc.CommandStreamerStallEnable = true;
756 pc.StallAtPixelScoreboard = true;
757 }
758 uint32_t marker_offset = intel_perf_marker_offset();
759 gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, marker_offset)),
760 gen_mi_imm(cmd_buffer->intel_perf_marker));
761 #if GEN_GEN >= 8 && GEN_GEN <= 11
762 gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, intel_perf_counter(true))),
763 gen_mi_reg64(GENX(PERFCNT1_num)));
764 gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, intel_perf_counter(true) + 8)),
765 gen_mi_reg64(GENX(PERFCNT2_num)));
766 #endif
767 #if GEN_GEN < 9
768 gen_mi_store(&b,
769 gen_mi_mem32(anv_address_add(query_addr,
770 intel_perf_rpstart_offset(true))),
771 gen_mi_reg32(GENX(RPSTAT1_num)));
772 #else
773 gen_mi_store(&b,
774 gen_mi_mem32(anv_address_add(query_addr,
775 intel_perf_rpstart_offset(true))),
776 gen_mi_reg32(GENX(RPSTAT0_num)));
777 #endif
778 /* Take the end OA snapshot last, with a recognizable report ID in its
779 * first dword, so that we can tell whether the report has landed.
780 */
781 anv_batch_emit(&cmd_buffer->batch, GENX(MI_REPORT_PERF_COUNT), rpc) {
782 rpc.MemoryAddress = anv_address_add(query_addr,
783 intel_perf_mi_rpc_offset(true));
784 rpc.ReportID = 0xdeadbeef; /* This goes in the first dword */
785 }
786 emit_query_mi_availability(&b, query_addr, true);
787 break;
788 }
789
790 default:
791 unreachable("");
792 }
793
794 /* When multiview is active, the spec requires that N consecutive query
795 * indices are used, where N is the number of active views in the subpass.
796 * The spec allows us to write the results to only one of the queries, but
797 * we still need to manage result availability for all the query indices.
798 * Since we only emit a single query, covering all active views, at the
799 * first index, mark the other query indices as already available with
800 * result 0.
801 */
802 if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
803 const uint32_t num_queries =
804 util_bitcount(cmd_buffer->state.subpass->view_mask);
805 if (num_queries > 1)
806 emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1);
807 }
808 }
809
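/* MMIO address of the command streamer TIMESTAMP register. */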
810 #define TIMESTAMP 0x2358
811
812 void genX(CmdWriteTimestamp)(
813 VkCommandBuffer commandBuffer,
814 VkPipelineStageFlagBits pipelineStage,
815 VkQueryPool queryPool,
816 uint32_t query)
817 {
818 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
819 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
820 struct anv_address query_addr = anv_query_address(pool, query);
821
822 assert(pool->type == VK_QUERY_TYPE_TIMESTAMP);
823
824 struct gen_mi_builder b;
825 gen_mi_builder_init(&b, &cmd_buffer->batch);
826
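/* A top-of-pipe timestamp is captured by storing the command streamer's
 * TIMESTAMP register directly; every other stage falls back to a
 * bottom-of-pipe PIPE_CONTROL post-sync timestamp write.
 */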
827 switch (pipelineStage) {
828 case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
829 gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, 8)),
830 gen_mi_reg64(TIMESTAMP));
831 break;
832
833 default:
834 /* Everything else is bottom-of-pipe */
835 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
836 pc.DestinationAddressType = DAT_PPGTT;
837 pc.PostSyncOperation = WriteTimestamp;
838 pc.Address = anv_address_add(query_addr, 8);
839
840 if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
841 pc.CommandStreamerStallEnable = true;
842 }
843 break;
844 }
845
846 emit_query_pc_availability(cmd_buffer, query_addr, true);
847
848 /* When multiview is active, the spec requires that N consecutive query
849 * indices are used, where N is the number of active views in the subpass.
850 * The spec allows us to write the results to only one of the queries, but
851 * we still need to manage result availability for all the query indices.
852 * Since we only emit a single query, covering all active views, at the
853 * first index, mark the other query indices as already available with
854 * result 0.
855 */
856 if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
857 const uint32_t num_queries =
858 util_bitcount(cmd_buffer->state.subpass->view_mask);
859 if (num_queries > 1)
860 emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1);
861 }
862 }
863
864 #if GEN_GEN > 7 || GEN_IS_HASWELL
865
866 #if GEN_GEN >= 8 || GEN_IS_HASWELL
867
868 #define MI_PREDICATE_SRC0 0x2400
869 #define MI_PREDICATE_SRC1 0x2408
870 #define MI_PREDICATE_RESULT 0x2418
871
872 /**
873 * Writes the result of a query to dst_addr if the value at poll_addr is
874 * equal to the reference value.
875 */
876 static void
877 gpu_write_query_result_cond(struct anv_cmd_buffer *cmd_buffer,
878 struct gen_mi_builder *b,
879 struct anv_address poll_addr,
880 struct anv_address dst_addr,
881 uint64_t ref_value,
882 VkQueryResultFlags flags,
883 uint32_t value_index,
884 struct gen_mi_value query_result)
885 {
886 gen_mi_store(b, gen_mi_reg64(MI_PREDICATE_SRC0), gen_mi_mem64(poll_addr));
887 gen_mi_store(b, gen_mi_reg64(MI_PREDICATE_SRC1), gen_mi_imm(ref_value));
888 anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
889 mip.LoadOperation = LOAD_LOAD;
890 mip.CombineOperation = COMBINE_SET;
891 mip.CompareOperation = COMPARE_SRCS_EQUAL;
892 }
893
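/* gen_mi_store_if() emits a predicated store, so the result below is only
 * written when the comparison above set the predicate.
 */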
894 if (flags & VK_QUERY_RESULT_64_BIT) {
895 struct anv_address res_addr = anv_address_add(dst_addr, value_index * 8);
896 gen_mi_store_if(b, gen_mi_mem64(res_addr), query_result);
897 } else {
898 struct anv_address res_addr = anv_address_add(dst_addr, value_index * 4);
899 gen_mi_store_if(b, gen_mi_mem32(res_addr), query_result);
900 }
901 }
902
903 #endif /* GEN_GEN >= 8 || GEN_IS_HASWELL */
904
905 static void
906 gpu_write_query_result(struct gen_mi_builder *b,
907 struct anv_address dst_addr,
908 VkQueryResultFlags flags,
909 uint32_t value_index,
910 struct gen_mi_value query_result)
911 {
912 if (flags & VK_QUERY_RESULT_64_BIT) {
913 struct anv_address res_addr = anv_address_add(dst_addr, value_index * 8);
914 gen_mi_store(b, gen_mi_mem64(res_addr), query_result);
915 } else {
916 struct anv_address res_addr = anv_address_add(dst_addr, value_index * 4);
917 gen_mi_store(b, gen_mi_mem32(res_addr), query_result);
918 }
919 }
920
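/* Returns (end - begin) for a begin/end pair of 64-bit values located at
 * addr and addr + 8, computed with the command streamer ALU.
 */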
921 static struct gen_mi_value
922 compute_query_result(struct gen_mi_builder *b, struct anv_address addr)
923 {
924 return gen_mi_isub(b, gen_mi_mem64(anv_address_add(addr, 8)),
925 gen_mi_mem64(anv_address_add(addr, 0)));
926 }
927
928 void genX(CmdCopyQueryPoolResults)(
929 VkCommandBuffer commandBuffer,
930 VkQueryPool queryPool,
931 uint32_t firstQuery,
932 uint32_t queryCount,
933 VkBuffer destBuffer,
934 VkDeviceSize destOffset,
935 VkDeviceSize destStride,
936 VkQueryResultFlags flags)
937 {
938 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
939 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
940 ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);
941
942 struct gen_mi_builder b;
943 gen_mi_builder_init(&b, &cmd_buffer->batch);
944 struct gen_mi_value result;
945
946 /* If render target writes are ongoing, request a render target cache flush
947 * to ensure proper ordering of the commands from the 3d pipe and the
948 * command streamer.
949 */
950 if (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_RENDER_TARGET_BUFFER_WRITES) {
951 cmd_buffer->state.pending_pipe_bits |=
952 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
953 }
954
955 if ((flags & VK_QUERY_RESULT_WAIT_BIT) ||
956 (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_FLUSH_BITS) ||
957 /* Occlusion & timestamp queries are written using a PIPE_CONTROL and
958 * because we're about to copy values from MI commands, we need to
959 * stall the command streamer to make sure the PIPE_CONTROL values have
960 * landed, otherwise we could see inconsistent values & availability.
961 *
962 * From the Vulkan spec:
963 *
964 * "vkCmdCopyQueryPoolResults is guaranteed to see the effect of
965 * previous uses of vkCmdResetQueryPool in the same queue, without
966 * any additional synchronization."
967 */
968 pool->type == VK_QUERY_TYPE_OCCLUSION ||
969 pool->type == VK_QUERY_TYPE_TIMESTAMP) {
970 cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT;
971 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
972 }
973
974 struct anv_address dest_addr = anv_address_add(buffer->address, destOffset);
975 for (uint32_t i = 0; i < queryCount; i++) {
976 struct anv_address query_addr = anv_query_address(pool, firstQuery + i);
977 uint32_t idx = 0;
978 switch (pool->type) {
979 case VK_QUERY_TYPE_OCCLUSION:
980 result = compute_query_result(&b, anv_address_add(query_addr, 8));
981 #if GEN_GEN >= 8 || GEN_IS_HASWELL
982 /* Like in the case of vkGetQueryPoolResults, if the query is
983 * unavailable and the VK_QUERY_RESULT_PARTIAL_BIT flag is set,
984 * conservatively write 0 as the query result. If the
985 * VK_QUERY_RESULT_PARTIAL_BIT isn't set, don't write any value.
986 */
987 gpu_write_query_result_cond(cmd_buffer, &b, query_addr, dest_addr,
988 1 /* available */, flags, idx, result);
989 if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
990 gpu_write_query_result_cond(cmd_buffer, &b, query_addr, dest_addr,
991 0 /* unavailable */, flags, idx, gen_mi_imm(0));
992 }
993 idx++;
994 #else /* GEN_GEN < 8 && !GEN_IS_HASWELL */
995 gpu_write_query_result(&b, dest_addr, flags, idx++, result);
996 #endif
997 break;
998
999 case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
1000 uint32_t statistics = pool->pipeline_statistics;
1001 while (statistics) {
1002 uint32_t stat = u_bit_scan(&statistics);
1003
1004 result = compute_query_result(&b, anv_address_add(query_addr,
1005 idx * 16 + 8));
1006
1007 /* WaDividePSInvocationCountBy4:HSW,BDW */
1008 if ((cmd_buffer->device->info.gen == 8 ||
1009 cmd_buffer->device->info.is_haswell) &&
1010 (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) {
1011 result = gen_mi_ushr32_imm(&b, result, 2);
1012 }
1013
1014 gpu_write_query_result(&b, dest_addr, flags, idx++, result);
1015 }
1016 assert(idx == util_bitcount(pool->pipeline_statistics));
1017 break;
1018 }
1019
1020 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1021 result = compute_query_result(&b, anv_address_add(query_addr, 8));
1022 gpu_write_query_result(&b, dest_addr, flags, idx++, result);
1023 result = compute_query_result(&b, anv_address_add(query_addr, 24));
1024 gpu_write_query_result(&b, dest_addr, flags, idx++, result);
1025 break;
1026
1027 case VK_QUERY_TYPE_TIMESTAMP:
1028 result = gen_mi_mem64(anv_address_add(query_addr, 8));
1029 gpu_write_query_result(&b, dest_addr, flags, 0, result);
1030 break;
1031
1032 default:
1033 unreachable("unhandled query type");
1034 }
1035
1036 if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
1037 gpu_write_query_result(&b, dest_addr, flags, idx,
1038 gen_mi_mem64(query_addr));
1039 }
1040
1041 dest_addr = anv_address_add(dest_addr, destStride);
1042 }
1043 }
1044
1045 #else
1046 void genX(CmdCopyQueryPoolResults)(
1047 VkCommandBuffer commandBuffer,
1048 VkQueryPool queryPool,
1049 uint32_t firstQuery,
1050 uint32_t queryCount,
1051 VkBuffer destBuffer,
1052 VkDeviceSize destOffset,
1053 VkDeviceSize destStride,
1054 VkQueryResultFlags flags)
1055 {
1056 anv_finishme("Queries not yet supported on Ivy Bridge");
1057 }
1058 #endif