src/intel/vulkan/genX_query.c (mesa.git, commit ba7c1e94fe5bc1f633952b4b40e4fe21f88e63e1)
/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <assert.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>

#include "anv_private.h"

#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"

/* We reserve GPR 14 and 15 for conditional rendering */
#define GEN_MI_BUILDER_NUM_ALLOC_GPRS 14
#define __gen_get_batch_dwords anv_batch_emit_dwords
#define __gen_address_offset anv_address_add
#include "common/gen_mi_builder.h"
#include "perf/gen_perf.h"
#include "perf/gen_perf_mdapi.h"

#define OA_REPORT_N_UINT64 (256 / sizeof(uint64_t))

VkResult genX(CreateQueryPool)(
    VkDevice _device,
    const VkQueryPoolCreateInfo* pCreateInfo,
    const VkAllocationCallbacks* pAllocator,
    VkQueryPool* pQueryPool)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   const struct anv_physical_device *pdevice = device->physical;
   struct anv_query_pool *pool;
   VkResult result;

   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);

   /* Query pool slots are made up of some number of 64-bit values packed
    * tightly together. For most query types, the first 64-bit value is
    * the "available" bit which is 0 when the query is unavailable and 1 when
    * it is available. The 64-bit values that follow are determined by the
    * type of query.
    *
    * For performance queries, the OA reports have to be 64-byte aligned, so
    * they live at 64-byte-aligned offsets at the end of the slot, while the
    * "available" bit stays at the start together with some other counters
    * (see the VK_INTEL_performance_query layout description further down).
    */
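   /* Purely as an illustration of the packing described above: an occlusion
    * query slot ends up as three tightly packed 64-bit values, sketched here
    * as a hypothetical struct (not an actual type in this driver):
    *
    *    struct occlusion_slot {
    *       uint64_t available; // written by emit_query_pc_availability()
    *       uint64_t begin;     // PS_DEPTH_COUNT at vkCmdBeginQuery()
    *       uint64_t end;       // PS_DEPTH_COUNT at vkCmdEndQuery()
    *    };                     // stride = 3 * sizeof(uint64_t) = 24 bytes
    *
    * and the value reported to the application is end - begin.
    */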
   uint32_t uint64s_per_slot = 1;

   VkQueryPipelineStatisticFlags pipeline_statistics = 0;
   switch (pCreateInfo->queryType) {
   case VK_QUERY_TYPE_OCCLUSION:
      /* Occlusion queries have two values: begin and end. */
      uint64s_per_slot += 2;
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
      /* Timestamps just have the one timestamp value */
      uint64s_per_slot += 1;
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      pipeline_statistics = pCreateInfo->pipelineStatistics;
      /* We're going to trust this field implicitly so we need to ensure that
       * no unhandled extension bits leak in.
       */
      pipeline_statistics &= ANV_PIPELINE_STATISTICS_MASK;

      /* Statistics queries have a begin and end value for every statistic */
      uint64s_per_slot += 2 * util_bitcount(pipeline_statistics);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      /* Transform feedback queries are 4 values, begin/end for
       * primitives written/primitives needed.
       */
      uint64s_per_slot += 4;
      break;
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
      uint64s_per_slot = 72; /* 576 bytes, see layout below */
      break;
   }
   default:
      assert(!"Invalid query type");
   }

   pool = vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
                    VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (pool == NULL)
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

   pool->type = pCreateInfo->queryType;
   pool->pipeline_statistics = pipeline_statistics;
   pool->stride = uint64s_per_slot * sizeof(uint64_t);
   pool->slots = pCreateInfo->queryCount;

   uint32_t bo_flags = 0;
   if (pdevice->supports_48bit_addresses)
      bo_flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS;

   if (pdevice->use_softpin)
      bo_flags |= EXEC_OBJECT_PINNED;

   if (pdevice->has_exec_async)
      bo_flags |= EXEC_OBJECT_ASYNC;

   uint64_t size = pool->slots * pool->stride;
   result = anv_device_alloc_bo(device, size,
                                ANV_BO_ALLOC_MAPPED |
                                ANV_BO_ALLOC_SNOOPED,
                                0 /* explicit_address */,
                                &pool->bo);
   if (result != VK_SUCCESS)
      goto fail;

   *pQueryPool = anv_query_pool_to_handle(pool);

   return VK_SUCCESS;

 fail:
   vk_free2(&device->alloc, pAllocator, pool);

   return result;
}
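/*
 * A minimal sketch of how an application reaches the entry point above
 * (illustrative only; the specific values are assumptions, not taken from
 * this file):
 *
 *    VkQueryPoolCreateInfo info = {
 *       .sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO,
 *       .queryType = VK_QUERY_TYPE_PIPELINE_STATISTICS,
 *       .queryCount = 16,
 *       .pipelineStatistics =
 *          VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT |
 *          VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT,
 *    };
 *    VkQueryPool pool;
 *    vkCreateQueryPool(device, &info, NULL, &pool);
 *
 * With two statistics bits set, uint64s_per_slot becomes 1 + 2 * 2 = 5,
 * i.e. a 40-byte stride per query slot.
 */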

void genX(DestroyQueryPool)(
    VkDevice _device,
    VkQueryPool _pool,
    const VkAllocationCallbacks* pAllocator)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_query_pool, pool, _pool);

   if (!pool)
      return;

   anv_device_release_bo(device, pool->bo);
   vk_free2(&device->alloc, pAllocator, pool);
}

static struct anv_address
anv_query_address(struct anv_query_pool *pool, uint32_t query)
{
   return (struct anv_address) {
      .bo = pool->bo,
      .offset = query * pool->stride,
   };
}

/**
 * VK_INTEL_performance_query layout (576 bytes):
 *
 * ------------------------------
 * | availability (8b)          |
 * |----------------------------|
 * | marker (8b)                |
 * |----------------------------|
 * | begin RPSTAT register (4b) |
 * |----------------------------|
 * | end RPSTAT register (4b)   |
 * |----------------------------|
 * | begin perfcntr 1 & 2 (16b) |
 * |----------------------------|
 * | end perfcntr 1 & 2 (16b)   |
 * |----------------------------|
 * | Unused (8b)                |
 * |----------------------------|
 * | begin MI_RPC (256b)        |
 * |----------------------------|
 * | end MI_RPC (256b)          |
 * ------------------------------
 */
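/*
 * The helpers below turn that layout into byte offsets within a slot:
 * availability at 0, marker at 8, RPSTAT begin/end at 16/20, perfcntr
 * begin/end at 24/40, and MI_RPC begin/end at 64/320.
 */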

static uint32_t
intel_perf_marker_offset(void)
{
   return 8;
}

static uint32_t
intel_perf_rpstart_offset(bool end)
{
   return 16 + (end ? sizeof(uint32_t) : 0);
}

#if GEN_GEN >= 8 && GEN_GEN <= 11
static uint32_t
intel_perf_counter(bool end)
{
   return 24 + (end ? (2 * sizeof(uint64_t)) : 0);
}
#endif

static uint32_t
intel_perf_mi_rpc_offset(bool end)
{
   return 64 + (end ? 256 : 0);
}

static void
cpu_write_query_result(void *dst_slot, VkQueryResultFlags flags,
                       uint32_t value_index, uint64_t result)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      uint64_t *dst64 = dst_slot;
      dst64[value_index] = result;
   } else {
      uint32_t *dst32 = dst_slot;
      dst32[value_index] = result;
   }
}

static void *
query_slot(struct anv_query_pool *pool, uint32_t query)
{
   return pool->bo->map + query * pool->stride;
}

static bool
query_is_available(struct anv_query_pool *pool, uint32_t query)
{
   return *(volatile uint64_t *)query_slot(pool, query);
}

static VkResult
wait_for_available(struct anv_device *device,
                   struct anv_query_pool *pool, uint32_t query)
{
   uint64_t abs_timeout = anv_get_absolute_timeout(5 * NSEC_PER_SEC);

   while (anv_gettime_ns() < abs_timeout) {
      if (query_is_available(pool, query))
         return VK_SUCCESS;
      VkResult status = anv_device_query_status(device);
      if (status != VK_SUCCESS)
         return status;
   }

   return anv_device_set_lost(device, "query timeout");
}

VkResult genX(GetQueryPoolResults)(
    VkDevice _device,
    VkQueryPool queryPool,
    uint32_t firstQuery,
    uint32_t queryCount,
    size_t dataSize,
    void* pData,
    VkDeviceSize stride,
    VkQueryResultFlags flags)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   assert(pool->type == VK_QUERY_TYPE_OCCLUSION ||
          pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
          pool->type == VK_QUERY_TYPE_TIMESTAMP ||
          pool->type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT ||
          pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL);

   if (anv_device_is_lost(device))
      return VK_ERROR_DEVICE_LOST;

   if (pData == NULL)
      return VK_SUCCESS;

   void *data_end = pData + dataSize;

   VkResult status = VK_SUCCESS;
   for (uint32_t i = 0; i < queryCount; i++) {
      bool available = query_is_available(pool, firstQuery + i);

      if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) {
         status = wait_for_available(device, pool, firstQuery + i);
         if (status != VK_SUCCESS)
            return status;

         available = true;
      }

      /* From the Vulkan 1.0.42 spec:
       *
       *    "If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
       *    both not set then no result values are written to pData for
       *    queries that are in the unavailable state at the time of the call,
       *    and vkGetQueryPoolResults returns VK_NOT_READY. However,
       *    availability state is still written to pData for those queries if
       *    VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set."
       */
      bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT);

      uint32_t idx = 0;
      switch (pool->type) {
      case VK_QUERY_TYPE_OCCLUSION: {
         uint64_t *slot = query_slot(pool, firstQuery + i);
         if (write_results) {
            /* From the Vulkan 1.2.132 spec:
             *
             *    "If VK_QUERY_RESULT_PARTIAL_BIT is set,
             *    VK_QUERY_RESULT_WAIT_BIT is not set, and the query’s status
             *    is unavailable, an intermediate result value between zero
             *    and the final result value is written to pData for that
             *    query."
             */
            uint64_t result = available ? slot[2] - slot[1] : 0;
            cpu_write_query_result(pData, flags, idx, result);
         }
         idx++;
         break;
      }

      case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
         uint64_t *slot = query_slot(pool, firstQuery + i);
         uint32_t statistics = pool->pipeline_statistics;
         while (statistics) {
            uint32_t stat = u_bit_scan(&statistics);
            if (write_results) {
               uint64_t result = slot[idx * 2 + 2] - slot[idx * 2 + 1];

               /* WaDividePSInvocationCountBy4:HSW,BDW */
               if ((device->info.gen == 8 || device->info.is_haswell) &&
                   (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT)
                  result >>= 2;

               cpu_write_query_result(pData, flags, idx, result);
            }
            idx++;
         }
         assert(idx == util_bitcount(pool->pipeline_statistics));
         break;
      }

      case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
         uint64_t *slot = query_slot(pool, firstQuery + i);
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]);
         idx++;
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[4] - slot[3]);
         idx++;
         break;
      }

      case VK_QUERY_TYPE_TIMESTAMP: {
         uint64_t *slot = query_slot(pool, firstQuery + i);
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[1]);
         idx++;
         break;
      }

      case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
         if (!write_results)
            break;
         const void *query_data = query_slot(pool, firstQuery + i);
         const uint32_t *oa_begin = query_data + intel_perf_mi_rpc_offset(false);
         const uint32_t *oa_end = query_data + intel_perf_mi_rpc_offset(true);
         const uint32_t *rpstat_begin = query_data + intel_perf_rpstart_offset(false);
         const uint32_t *rpstat_end = query_data + intel_perf_rpstart_offset(true);
         struct gen_perf_query_result result;
         struct gen_perf_query_info metric = {
            .oa_format = (GEN_GEN >= 8 ?
                          I915_OA_FORMAT_A32u40_A4u32_B8_C8 :
                          I915_OA_FORMAT_A45_B8_C8),
         };
         uint32_t core_freq[2];
#if GEN_GEN < 9
         core_freq[0] = ((*rpstat_begin >> 7) & 0x7f) * 1000000ULL;
         core_freq[1] = ((*rpstat_end >> 7) & 0x7f) * 1000000ULL;
#else
         core_freq[0] = ((*rpstat_begin >> 23) & 0x1ff) * 1000000ULL;
         core_freq[1] = ((*rpstat_end >> 23) & 0x1ff) * 1000000ULL;
#endif
         gen_perf_query_result_clear(&result);
         gen_perf_query_result_accumulate(&result, &metric,
                                          oa_begin, oa_end);
         gen_perf_query_result_read_frequencies(&result, &device->info,
                                                oa_begin, oa_end);
         gen_perf_query_result_write_mdapi(pData, stride,
                                           &device->info,
                                           &result,
                                           core_freq[0], core_freq[1]);
#if GEN_GEN >= 8 && GEN_GEN <= 11
         gen_perf_query_mdapi_write_perfcntr(pData, stride, &device->info,
                                             query_data + intel_perf_counter(false),
                                             query_data + intel_perf_counter(true));
#endif
         const uint64_t *marker = query_data + intel_perf_marker_offset();
         gen_perf_query_mdapi_write_marker(pData, stride, &device->info, *marker);
         break;
      }

      default:
         unreachable("invalid pool type");
      }

      if (!write_results)
         status = VK_NOT_READY;

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
         cpu_write_query_result(pData, flags, idx, available);

      pData += stride;
      if (pData >= data_end)
         break;
   }

   return status;
}
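/*
 * Illustrative example of the pData layout produced by
 * genX(GetQueryPoolResults) above: reading two occlusion queries with
 * VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WITH_AVAILABILITY_BIT and a
 * stride of 16 writes, per query, a uint64_t result followed by a uint64_t
 * availability value:
 *
 *    pData + 0:  query 0 samples passed
 *    pData + 8:  query 0 availability (0 or 1)
 *    pData + 16: query 1 samples passed
 *    pData + 24: query 1 availability (0 or 1)
 */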

static void
emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer,
                    struct anv_address addr)
{
   cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType = DAT_PPGTT;
      pc.PostSyncOperation = WritePSDepthCount;
      pc.DepthStallEnable = true;
      pc.Address = addr;

      if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
         pc.CommandStreamerStallEnable = true;
   }
}

static void
emit_query_mi_availability(struct gen_mi_builder *b,
                           struct anv_address addr,
                           bool available)
{
   gen_mi_store(b, gen_mi_mem64(addr), gen_mi_imm(available));
}

static void
emit_query_pc_availability(struct anv_cmd_buffer *cmd_buffer,
                           struct anv_address addr,
                           bool available)
{
   cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType = DAT_PPGTT;
      pc.PostSyncOperation = WriteImmediateData;
      pc.Address = addr;
      pc.ImmediateData = available;
   }
}

/**
 * Goes through a series of consecutive query indices in the given pool
 * setting all element values to 0 and emitting them as available.
 */
static void
emit_zero_queries(struct anv_cmd_buffer *cmd_buffer,
                  struct gen_mi_builder *b, struct anv_query_pool *pool,
                  uint32_t first_index, uint32_t num_queries)
{
   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
      /* These queries are written with a PIPE_CONTROL so clear them using the
       * PIPE_CONTROL as well so we don't have to synchronize between 2 types
       * of operations.
       */
      assert((pool->stride % 8) == 0);
      for (uint32_t i = 0; i < num_queries; i++) {
         struct anv_address slot_addr =
            anv_query_address(pool, first_index + i);

         for (uint32_t qword = 1; qword < (pool->stride / 8); qword++) {
            emit_query_pc_availability(cmd_buffer,
                                       anv_address_add(slot_addr, qword * 8),
                                       false);
         }
         emit_query_pc_availability(cmd_buffer, slot_addr, true);
      }
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      for (uint32_t i = 0; i < num_queries; i++) {
         struct anv_address slot_addr =
            anv_query_address(pool, first_index + i);
         gen_mi_memset(b, anv_address_add(slot_addr, 8), 0, pool->stride - 8);
         emit_query_mi_availability(b, slot_addr, true);
      }
      break;

   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL:
      for (uint32_t i = 0; i < num_queries; i++) {
         struct anv_address slot_addr =
            anv_query_address(pool, first_index + i);
         gen_mi_memset(b, anv_address_add(slot_addr, 8), 0, pool->stride - 8);
         emit_query_mi_availability(b, slot_addr, true);
      }
      break;

   default:
      unreachable("Unsupported query type");
   }
}

void genX(CmdResetQueryPool)(
    VkCommandBuffer commandBuffer,
    VkQueryPool queryPool,
    uint32_t firstQuery,
    uint32_t queryCount)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
      for (uint32_t i = 0; i < queryCount; i++) {
         emit_query_pc_availability(cmd_buffer,
                                    anv_query_address(pool, firstQuery + i),
                                    false);
      }
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
      struct gen_mi_builder b;
      gen_mi_builder_init(&b, &cmd_buffer->batch);

      for (uint32_t i = 0; i < queryCount; i++)
         emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), false);
      break;
   }

   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
      struct gen_mi_builder b;
      gen_mi_builder_init(&b, &cmd_buffer->batch);

      for (uint32_t i = 0; i < queryCount; i++)
         emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), false);
      break;
   }

   default:
      unreachable("Unsupported query type");
   }
}

void genX(ResetQueryPool)(
    VkDevice _device,
    VkQueryPool queryPool,
    uint32_t firstQuery,
    uint32_t queryCount)
{
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   for (uint32_t i = 0; i < queryCount; i++) {
      uint64_t *slot = query_slot(pool, firstQuery + i);
      *slot = 0;
   }
}

static const uint32_t vk_pipeline_stat_to_reg[] = {
   GENX(IA_VERTICES_COUNT_num),
   GENX(IA_PRIMITIVES_COUNT_num),
   GENX(VS_INVOCATION_COUNT_num),
   GENX(GS_INVOCATION_COUNT_num),
   GENX(GS_PRIMITIVES_COUNT_num),
   GENX(CL_INVOCATION_COUNT_num),
   GENX(CL_PRIMITIVES_COUNT_num),
   GENX(PS_INVOCATION_COUNT_num),
   GENX(HS_INVOCATION_COUNT_num),
   GENX(DS_INVOCATION_COUNT_num),
   GENX(CS_INVOCATION_COUNT_num),
};

static void
emit_pipeline_stat(struct gen_mi_builder *b, uint32_t stat,
                   struct anv_address addr)
{
   STATIC_ASSERT(ANV_PIPELINE_STATISTICS_MASK ==
                 (1 << ARRAY_SIZE(vk_pipeline_stat_to_reg)) - 1);

   assert(stat < ARRAY_SIZE(vk_pipeline_stat_to_reg));
   gen_mi_store(b, gen_mi_mem64(addr),
                gen_mi_reg64(vk_pipeline_stat_to_reg[stat]));
}

static void
emit_xfb_query(struct gen_mi_builder *b, uint32_t stream,
               struct anv_address addr)
{
   assert(stream < MAX_XFB_STREAMS);

   gen_mi_store(b, gen_mi_mem64(anv_address_add(addr, 0)),
                gen_mi_reg64(GENX(SO_NUM_PRIMS_WRITTEN0_num) + stream * 8));
   gen_mi_store(b, gen_mi_mem64(anv_address_add(addr, 16)),
                gen_mi_reg64(GENX(SO_PRIM_STORAGE_NEEDED0_num) + stream * 8));
}
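/*
 * Sketch of the transform feedback slot that results from calling
 * emit_xfb_query() above at begin time (addr = slot + 8) and end time
 * (addr = slot + 16):
 *
 *    slot[0]  availability
 *    slot[1]  begin SO_NUM_PRIMS_WRITTEN
 *    slot[2]  end   SO_NUM_PRIMS_WRITTEN
 *    slot[3]  begin SO_PRIM_STORAGE_NEEDED
 *    slot[4]  end   SO_PRIM_STORAGE_NEEDED
 *
 * which is why GetQueryPoolResults reports slot[2] - slot[1] and
 * slot[4] - slot[3].
 */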

void genX(CmdBeginQuery)(
    VkCommandBuffer commandBuffer,
    VkQueryPool queryPool,
    uint32_t query,
    VkQueryControlFlags flags)
{
   genX(CmdBeginQueryIndexedEXT)(commandBuffer, queryPool, query, flags, 0);
}

void genX(CmdBeginQueryIndexedEXT)(
    VkCommandBuffer commandBuffer,
    VkQueryPool queryPool,
    uint32_t query,
    VkQueryControlFlags flags,
    uint32_t index)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   struct anv_address query_addr = anv_query_address(pool, query);

   struct gen_mi_builder b;
   gen_mi_builder_init(&b, &cmd_buffer->batch);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 8));
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      /* TODO: This might only be necessary for certain stats */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      uint32_t statistics = pool->pipeline_statistics;
      uint32_t offset = 8;
      while (statistics) {
         uint32_t stat = u_bit_scan(&statistics);
         emit_pipeline_stat(&b, stat, anv_address_add(query_addr, offset));
         offset += 16;
      }
      break;
   }

   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
      emit_xfb_query(&b, index, anv_address_add(query_addr, 8));
      break;

   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_REPORT_PERF_COUNT), rpc) {
         rpc.MemoryAddress =
            anv_address_add(query_addr, intel_perf_mi_rpc_offset(false));
      }
#if GEN_GEN < 9
      gen_mi_store(&b,
                   gen_mi_mem32(anv_address_add(query_addr,
                                                intel_perf_rpstart_offset(false))),
                   gen_mi_reg32(GENX(RPSTAT1_num)));
#else
      gen_mi_store(&b,
                   gen_mi_mem32(anv_address_add(query_addr,
                                                intel_perf_rpstart_offset(false))),
                   gen_mi_reg32(GENX(RPSTAT0_num)));
#endif
#if GEN_GEN >= 8 && GEN_GEN <= 11
      gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr,
                                                    intel_perf_counter(false))),
                   gen_mi_reg64(GENX(PERFCNT1_num)));
      gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr,
                                                    intel_perf_counter(false) + 8)),
                   gen_mi_reg64(GENX(PERFCNT2_num)));
#endif
      break;
   }

   default:
      unreachable("");
   }
}

void genX(CmdEndQuery)(
    VkCommandBuffer commandBuffer,
    VkQueryPool queryPool,
    uint32_t query)
{
   genX(CmdEndQueryIndexedEXT)(commandBuffer, queryPool, query, 0);
}

void genX(CmdEndQueryIndexedEXT)(
    VkCommandBuffer commandBuffer,
    VkQueryPool queryPool,
    uint32_t query,
    uint32_t index)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   struct anv_address query_addr = anv_query_address(pool, query);

   struct gen_mi_builder b;
   gen_mi_builder_init(&b, &cmd_buffer->batch);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 16));
      emit_query_pc_availability(cmd_buffer, query_addr, true);
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      /* TODO: This might only be necessary for certain stats */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      uint32_t statistics = pool->pipeline_statistics;
      uint32_t offset = 16;
      while (statistics) {
         uint32_t stat = u_bit_scan(&statistics);
         emit_pipeline_stat(&b, stat, anv_address_add(query_addr, offset));
         offset += 16;
      }

      emit_query_mi_availability(&b, query_addr, true);
      break;
   }

   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      emit_xfb_query(&b, index, anv_address_add(query_addr, 16));
      emit_query_mi_availability(&b, query_addr, true);
      break;

   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
      uint32_t marker_offset = intel_perf_marker_offset();
      gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, marker_offset)),
                   gen_mi_imm(cmd_buffer->intel_perf_marker));
#if GEN_GEN >= 8 && GEN_GEN <= 11
      gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, intel_perf_counter(true))),
                   gen_mi_reg64(GENX(PERFCNT1_num)));
      gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, intel_perf_counter(true) + 8)),
                   gen_mi_reg64(GENX(PERFCNT2_num)));
#endif
#if GEN_GEN < 9
      gen_mi_store(&b,
                   gen_mi_mem32(anv_address_add(query_addr,
                                                intel_perf_rpstart_offset(true))),
                   gen_mi_reg32(GENX(RPSTAT1_num)));
#else
      gen_mi_store(&b,
                   gen_mi_mem32(anv_address_add(query_addr,
                                                intel_perf_rpstart_offset(true))),
                   gen_mi_reg32(GENX(RPSTAT0_num)));
#endif
      /* Emit the end OA snapshot last. Its distinctive ReportID lands in the
       * first dword of the report, which makes it possible to tell whether
       * the snapshot has landed.
       */
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_REPORT_PERF_COUNT), rpc) {
         rpc.MemoryAddress = anv_address_add(query_addr,
                                             intel_perf_mi_rpc_offset(true));
         rpc.ReportID = 0xdeadbeef; /* This goes in the first dword */
      }
      emit_query_mi_availability(&b, query_addr, true);
      break;
   }

   default:
      unreachable("");
   }

   /* When multiview is active the spec requires that N consecutive query
    * indices are used, where N is the number of active views in the subpass.
    * The spec allows that we only write the results to one of the queries
    * but we still need to manage result availability for all the query
    * indices. Since we only emit a single query for all active views in the
    * first index, mark the other query indices as being already available
    * with result 0.
    */
   if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
      const uint32_t num_queries =
         util_bitcount(cmd_buffer->state.subpass->view_mask);
      if (num_queries > 1)
         emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1);
   }
}

#define TIMESTAMP 0x2358

void genX(CmdWriteTimestamp)(
    VkCommandBuffer commandBuffer,
    VkPipelineStageFlagBits pipelineStage,
    VkQueryPool queryPool,
    uint32_t query)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   struct anv_address query_addr = anv_query_address(pool, query);

   assert(pool->type == VK_QUERY_TYPE_TIMESTAMP);

   struct gen_mi_builder b;
   gen_mi_builder_init(&b, &cmd_buffer->batch);

   switch (pipelineStage) {
   case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
      gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, 8)),
                   gen_mi_reg64(TIMESTAMP));
      break;

   default:
      /* Everything else is bottom-of-pipe */
      cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
      genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.DestinationAddressType = DAT_PPGTT;
         pc.PostSyncOperation = WriteTimestamp;
         pc.Address = anv_address_add(query_addr, 8);

         if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
            pc.CommandStreamerStallEnable = true;
      }
      break;
   }

   emit_query_pc_availability(cmd_buffer, query_addr, true);

   /* When multiview is active the spec requires that N consecutive query
    * indices are used, where N is the number of active views in the subpass.
    * The spec allows that we only write the results to one of the queries
    * but we still need to manage result availability for all the query
    * indices. Since we only emit a single query for all active views in the
    * first index, mark the other query indices as being already available
    * with result 0.
    */
   if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
      const uint32_t num_queries =
         util_bitcount(cmd_buffer->state.subpass->view_mask);
      if (num_queries > 1)
         emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1);
   }
}

#if GEN_GEN > 7 || GEN_IS_HASWELL

#if GEN_GEN >= 8 || GEN_IS_HASWELL

#define MI_PREDICATE_SRC0    0x2400
#define MI_PREDICATE_SRC1    0x2408
#define MI_PREDICATE_RESULT  0x2418

/**
 * Writes the results of a query to dst_addr if the value at poll_addr is
 * equal to the reference value.
 */
static void
gpu_write_query_result_cond(struct anv_cmd_buffer *cmd_buffer,
                            struct gen_mi_builder *b,
                            struct anv_address poll_addr,
                            struct anv_address dst_addr,
                            uint64_t ref_value,
                            VkQueryResultFlags flags,
                            uint32_t value_index,
                            struct gen_mi_value query_result)
{
   gen_mi_store(b, gen_mi_reg64(MI_PREDICATE_SRC0), gen_mi_mem64(poll_addr));
   gen_mi_store(b, gen_mi_reg64(MI_PREDICATE_SRC1), gen_mi_imm(ref_value));
   anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
      mip.LoadOperation = LOAD_LOAD;
      mip.CombineOperation = COMBINE_SET;
      mip.CompareOperation = COMPARE_SRCS_EQUAL;
   }

   if (flags & VK_QUERY_RESULT_64_BIT) {
      struct anv_address res_addr = anv_address_add(dst_addr, value_index * 8);
      gen_mi_store_if(b, gen_mi_mem64(res_addr), query_result);
   } else {
      struct anv_address res_addr = anv_address_add(dst_addr, value_index * 4);
      gen_mi_store_if(b, gen_mi_mem32(res_addr), query_result);
   }
}
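/*
 * Rough outline of what gpu_write_query_result_cond() above emits, included
 * only as a sketch of the MI_PREDICATE mechanism:
 *
 *    load MI_PREDICATE_SRC0 from poll_addr
 *    load MI_PREDICATE_SRC1 with ref_value
 *    MI_PREDICATE (LOAD_LOAD, COMBINE_SET, COMPARE_SRCS_EQUAL)
 *    predicated store of query_result to dst_addr (gen_mi_store_if)
 *
 * so the copy only lands when *poll_addr == ref_value.
 */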

#endif /* GEN_GEN >= 8 || GEN_IS_HASWELL */

static void
gpu_write_query_result(struct gen_mi_builder *b,
                       struct anv_address dst_addr,
                       VkQueryResultFlags flags,
                       uint32_t value_index,
                       struct gen_mi_value query_result)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      struct anv_address res_addr = anv_address_add(dst_addr, value_index * 8);
      gen_mi_store(b, gen_mi_mem64(res_addr), query_result);
   } else {
      struct anv_address res_addr = anv_address_add(dst_addr, value_index * 4);
      gen_mi_store(b, gen_mi_mem32(res_addr), query_result);
   }
}

static struct gen_mi_value
compute_query_result(struct gen_mi_builder *b, struct anv_address addr)
{
   return gen_mi_isub(b, gen_mi_mem64(anv_address_add(addr, 8)),
                      gen_mi_mem64(anv_address_add(addr, 0)));
}
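/*
 * compute_query_result() above builds an (end - begin) value with the MI
 * builder. For example, for an occlusion query the caller passes
 * query_addr + 8, so the subtraction is slot[2] - slot[1], i.e. end minus
 * begin PS_DEPTH_COUNT.
 */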

void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer commandBuffer,
    VkQueryPool queryPool,
    uint32_t firstQuery,
    uint32_t queryCount,
    VkBuffer destBuffer,
    VkDeviceSize destOffset,
    VkDeviceSize destStride,
    VkQueryResultFlags flags)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);

   struct gen_mi_builder b;
   gen_mi_builder_init(&b, &cmd_buffer->batch);
   struct gen_mi_value result;

   /* If render target writes are ongoing, request a render target cache flush
    * to ensure proper ordering of the commands from the 3d pipe and the
    * command streamer.
    */
   if (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_RENDER_TARGET_BUFFER_WRITES) {
      cmd_buffer->state.pending_pipe_bits |=
         ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
   }

   if ((flags & VK_QUERY_RESULT_WAIT_BIT) ||
       (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_FLUSH_BITS) ||
       /* Occlusion & timestamp queries are written using a PIPE_CONTROL and
        * because we're about to copy values from MI commands, we need to
        * stall the command streamer to make sure the PIPE_CONTROL values have
        * landed, otherwise we could see inconsistent values & availability.
        *
        * From the vulkan spec:
        *
        *    "vkCmdCopyQueryPoolResults is guaranteed to see the effect of
        *    previous uses of vkCmdResetQueryPool in the same queue, without
        *    any additional synchronization."
        */
       pool->type == VK_QUERY_TYPE_OCCLUSION ||
       pool->type == VK_QUERY_TYPE_TIMESTAMP) {
      cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT;
      genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
   }

   struct anv_address dest_addr = anv_address_add(buffer->address, destOffset);
   for (uint32_t i = 0; i < queryCount; i++) {
      struct anv_address query_addr = anv_query_address(pool, firstQuery + i);
      uint32_t idx = 0;
      switch (pool->type) {
      case VK_QUERY_TYPE_OCCLUSION:
         result = compute_query_result(&b, anv_address_add(query_addr, 8));
#if GEN_GEN >= 8 || GEN_IS_HASWELL
         /* Like in the case of vkGetQueryPoolResults, if the query is
          * unavailable and the VK_QUERY_RESULT_PARTIAL_BIT flag is set,
          * conservatively write 0 as the query result. If the
          * VK_QUERY_RESULT_PARTIAL_BIT isn't set, don't write any value.
          */
         gpu_write_query_result_cond(cmd_buffer, &b, query_addr, dest_addr,
                                     1 /* available */, flags, idx, result);
         if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
            gpu_write_query_result_cond(cmd_buffer, &b, query_addr, dest_addr,
                                        0 /* unavailable */, flags, idx, gen_mi_imm(0));
         }
         idx++;
#else /* GEN_GEN < 8 && !GEN_IS_HASWELL */
         gpu_write_query_result(&b, dest_addr, flags, idx++, result);
#endif
         break;

      case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
         uint32_t statistics = pool->pipeline_statistics;
         while (statistics) {
            uint32_t stat = u_bit_scan(&statistics);

            result = compute_query_result(&b, anv_address_add(query_addr,
                                                              idx * 16 + 8));

            /* WaDividePSInvocationCountBy4:HSW,BDW */
            if ((cmd_buffer->device->info.gen == 8 ||
                 cmd_buffer->device->info.is_haswell) &&
                (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) {
               result = gen_mi_ushr32_imm(&b, result, 2);
            }

            gpu_write_query_result(&b, dest_addr, flags, idx++, result);
         }
         assert(idx == util_bitcount(pool->pipeline_statistics));
         break;
      }

      case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
         result = compute_query_result(&b, anv_address_add(query_addr, 8));
         gpu_write_query_result(&b, dest_addr, flags, idx++, result);
         result = compute_query_result(&b, anv_address_add(query_addr, 24));
         gpu_write_query_result(&b, dest_addr, flags, idx++, result);
         break;

      case VK_QUERY_TYPE_TIMESTAMP:
         result = gen_mi_mem64(anv_address_add(query_addr, 8));
         gpu_write_query_result(&b, dest_addr, flags, idx++, result);
         break;

      default:
         unreachable("unhandled query type");
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         gpu_write_query_result(&b, dest_addr, flags, idx,
                                gen_mi_mem64(query_addr));
      }

      dest_addr = anv_address_add(dest_addr, destStride);
   }
}

#else
void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer commandBuffer,
    VkQueryPool queryPool,
    uint32_t firstQuery,
    uint32_t queryCount,
    VkBuffer destBuffer,
    VkDeviceSize destOffset,
    VkDeviceSize destStride,
    VkQueryResultFlags flags)
{
   anv_finishme("Queries not yet supported on Ivy Bridge");
}
#endif