/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "anv_private.h"

#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"

/* We reserve GPR 14 and 15 for conditional rendering */
#define GEN_MI_BUILDER_NUM_ALLOC_GPRS 14
#define __gen_get_batch_dwords anv_batch_emit_dwords
#define __gen_address_offset anv_address_add
#include "common/gen_mi_builder.h"

#include "perf/gen_perf.h"
#include "perf/gen_perf_mdapi.h"

#define OA_REPORT_N_UINT64 (256 / sizeof(uint64_t))

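/* i.e. an OA report is 256 bytes, which a query slot stores as 32
 * consecutive uint64_t values.
 */
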
VkResult genX(CreateQueryPool)(
    VkDevice                                    _device,
    const VkQueryPoolCreateInfo*                pCreateInfo,
    const VkAllocationCallbacks*                pAllocator,
    VkQueryPool*                                pQueryPool)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   const struct anv_physical_device *pdevice = &device->instance->physicalDevice;
   struct anv_query_pool *pool;
   VkResult result;

   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);

   /* Query pool slots are made up of some number of 64-bit values packed
    * tightly together. For most query types, the first 64-bit value is the
    * "available" bit, which is 0 when the query is unavailable and 1 when
    * it is available. The 64-bit values that follow are determined by the
    * type of query.
    *
    * For performance queries, we have a requirement to align OA reports at
    * 64 bytes, so we put those first and keep the "available" bit at the
    * end of the slot together with some other counters.
    */
   uint32_t uint64s_per_slot = 1;

   VkQueryPipelineStatisticFlags pipeline_statistics = 0;
   switch (pCreateInfo->queryType) {
   case VK_QUERY_TYPE_OCCLUSION:
      /* Occlusion queries have two values: begin and end. */
      uint64s_per_slot += 2;
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
      /* Timestamps just have the one timestamp value */
      uint64s_per_slot += 1;
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      pipeline_statistics = pCreateInfo->pipelineStatistics;
      /* We're going to trust this field implicitly so we need to ensure that
       * no unhandled extension bits leak in.
       */
      pipeline_statistics &= ANV_PIPELINE_STATISTICS_MASK;

      /* Statistics queries have a begin and an end value for every statistic */
      uint64s_per_slot += 2 * util_bitcount(pipeline_statistics);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      /* Transform feedback queries are 4 values: begin/end for
       * primitives written and primitives generated.
       */
      uint64s_per_slot += 4;
      break;
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
      uint64s_per_slot = 2 * OA_REPORT_N_UINT64; /* begin & end OA reports */
      uint64s_per_slot += 4; /* PerfCounter 1 & 2 */
      uint64s_per_slot++; /* 2 * 32bit RPSTAT register */
      uint64s_per_slot++; /* 64bit marker */
      uint64s_per_slot++; /* availability */
      uint64s_per_slot = align_u32(uint64s_per_slot, 8); /* OA reports must be aligned to 64 bytes */
      break;
   }
   default:
      assert(!"Invalid query type");
   }

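   /* Worked example: an occlusion pool needs 3 uint64s per slot
    * (availability + begin + end), i.e. a 24-byte stride, while a
    * performance query pool rounds up to 72 uint64s (a 576-byte stride)
    * so each slot keeps its OA reports 64-byte aligned.
    */
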
   pool = vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
                    VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (pool == NULL)
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

   pool->type = pCreateInfo->queryType;
   pool->pipeline_statistics = pipeline_statistics;
   pool->stride = uint64s_per_slot * sizeof(uint64_t);
   pool->slots = pCreateInfo->queryCount;

   uint64_t size = pool->slots * pool->stride;
   result = anv_bo_init_new(&pool->bo, device, size);
   if (result != VK_SUCCESS)
      goto fail;

   if (pdevice->supports_48bit_addresses)
      pool->bo.flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS;

   if (pdevice->use_softpin)
      pool->bo.flags |= EXEC_OBJECT_PINNED;

   if (pdevice->has_exec_async)
      pool->bo.flags |= EXEC_OBJECT_ASYNC;

   anv_vma_alloc(device, &pool->bo);

   /* For query pools, we set the caching mode to I915_CACHING_CACHED. On LLC
    * platforms, this does nothing. On non-LLC platforms, this means snooping
    * which comes at a slight cost. However, the buffers aren't big, won't be
    * written frequently, and trying to handle the flushing manually without
    * doing too much flushing is extremely painful.
    */
   anv_gem_set_caching(device, pool->bo.gem_handle, I915_CACHING_CACHED);

   pool->bo.map = anv_gem_mmap(device, pool->bo.gem_handle, 0, size, 0);

   *pQueryPool = anv_query_pool_to_handle(pool);

   return VK_SUCCESS;

 fail:
   vk_free2(&device->alloc, pAllocator, pool);

   return result;
}

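/* Note that the pool's BO stays mapped (and, per the comment above, snooped
 * on non-LLC platforms) for the pool's whole lifetime; this is what lets
 * GetQueryPoolResults read results and availability directly through
 * pool->bo.map without extra flushing.
 */
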
void genX(DestroyQueryPool)(
    VkDevice                                    _device,
    VkQueryPool                                 _pool,
    const VkAllocationCallbacks*                pAllocator)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_query_pool, pool, _pool);

   if (!pool)
      return;

   anv_gem_munmap(pool->bo.map, pool->bo.size);
   anv_vma_free(device, &pool->bo);
   anv_gem_close(device, pool->bo.gem_handle);
   vk_free2(&device->alloc, pAllocator, pool);
}

static struct anv_address
anv_query_address(struct anv_query_pool *pool, uint32_t query)
{
   return (struct anv_address) {
      .bo = &pool->bo,
      .offset = query * pool->stride,
   };
}

/**
 * VK_INTEL_performance_query layout:
 *
 * ------------------------------
 * |       end MI_RPC (256b)    |
 * |----------------------------|
 * |     begin MI_RPC (256b)    |
 * |----------------------------|
 * | begin perfcntr 1 & 2 (16b) |
 * |----------------------------|
 * |  end perfcntr 1 & 2 (16b)  |
 * |----------------------------|
 * | begin RPSTAT register (4b) |
 * |----------------------------|
 * |  end RPSTAT register (4b)  |
 * |----------------------------|
 * |         marker (8b)        |
 * |----------------------------|
 * |     availability (8b)      |
 * ------------------------------
 */

static uint32_t
intel_perf_mi_rpc_offset(bool end)
{
   return end ? 0 : 256;
}

static uint32_t
intel_perf_counter(bool end)
{
   uint32_t offset = 512;
   offset += end ? 2 * sizeof(uint64_t) : 0;
   return offset;
}

static uint32_t
intel_perf_rpstart_offset(bool end)
{
   uint32_t offset = intel_perf_counter(false) +
                     4 * sizeof(uint64_t);
   offset += end ? sizeof(uint32_t) : 0;
   return offset;
}

static uint32_t
intel_perf_marker_offset(void)
{
   return intel_perf_rpstart_offset(false) + sizeof(uint64_t);
}

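/* With the helpers above, the byte offsets within a slot work out to:
 * end MI_RPC at 0, begin MI_RPC at 256, begin/end PERFCNT pairs at 512/528,
 * begin/end RPSTAT at 544/548, the marker at 552, and availability in the
 * slot's final 8 bytes (pool->stride - 8).
 */
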
static void
cpu_write_query_result(void *dst_slot, VkQueryResultFlags flags,
                       uint32_t value_index, uint64_t result)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      uint64_t *dst64 = dst_slot;
      dst64[value_index] = result;
   } else {
      uint32_t *dst32 = dst_slot;
      dst32[value_index] = result;
   }
}

static void *
query_slot(struct anv_query_pool *pool, uint32_t query)
{
   return pool->bo.map + query * pool->stride;
}

static bool
query_is_available(struct anv_query_pool *pool, uint32_t query)
{
   if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL) {
      return *(volatile uint64_t *)((uint8_t *)query_slot(pool, query) +
                                    pool->stride - 8);
   } else
      return *(volatile uint64_t *)query_slot(pool, query);
}

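/* The volatile reads above are needed because the GPU writes availability
 * into the mapped BO behind the compiler's back; wait_for_available() below
 * handles the case where the write has not landed yet.
 */
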
static VkResult
wait_for_available(struct anv_device *device,
                   struct anv_query_pool *pool, uint32_t query)
{
   while (true) {
      if (query_is_available(pool, query))
         return VK_SUCCESS;

      int ret = anv_gem_busy(device, pool->bo.gem_handle);
      if (ret == 1) {
         /* The BO is still busy, keep waiting. */
         continue;
      } else if (ret == -1) {
         /* We don't know the real error. */
         return anv_device_set_lost(device, "gem wait failed: %m");
      } else {
         assert(ret == 0);
         /* The BO is no longer busy. */
         if (query_is_available(pool, query)) {
            return VK_SUCCESS;
         } else {
            VkResult status = anv_device_query_status(device);
            if (status != VK_SUCCESS)
               return status;

            /* If we haven't seen availability yet, then we never will. This
             * can only happen if we have a client error where they call
             * GetQueryPoolResults on a query that they haven't submitted to
             * the GPU yet. The spec allows us to do anything in this case,
             * but returning VK_SUCCESS doesn't seem right and we shouldn't
             * just keep spinning.
             */
            return VK_NOT_READY;
         }
      }
   }
}

VkResult genX(GetQueryPoolResults)(
    VkDevice                                    _device,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    size_t                                      dataSize,
    void*                                       pData,
    VkDeviceSize                                stride,
    VkQueryResultFlags                          flags)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   assert(pool->type == VK_QUERY_TYPE_OCCLUSION ||
          pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
          pool->type == VK_QUERY_TYPE_TIMESTAMP ||
          pool->type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT ||
          pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL);

   if (anv_device_is_lost(device))
      return VK_ERROR_DEVICE_LOST;

   if (pData == NULL)
      return VK_SUCCESS;

   void *data_end = pData + dataSize;

   VkResult status = VK_SUCCESS;
   for (uint32_t i = 0; i < queryCount; i++) {
      bool available = query_is_available(pool, firstQuery + i);

      if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) {
         status = wait_for_available(device, pool, firstQuery + i);
         if (status != VK_SUCCESS)
            return status;

         available = true;
      }

      /* From the Vulkan 1.0.42 spec:
       *
       *    "If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
       *    both not set then no result values are written to pData for
       *    queries that are in the unavailable state at the time of the call,
       *    and vkGetQueryPoolResults returns VK_NOT_READY. However,
       *    availability state is still written to pData for those queries if
       *    VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set."
       */
      bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT);

      uint32_t idx = 0;
      switch (pool->type) {
      case VK_QUERY_TYPE_OCCLUSION: {
         uint64_t *slot = query_slot(pool, firstQuery + i);
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]);
         idx++;
         break;
      }

      case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
         uint64_t *slot = query_slot(pool, firstQuery + i);
         uint32_t statistics = pool->pipeline_statistics;
         while (statistics) {
            uint32_t stat = u_bit_scan(&statistics);
            if (write_results) {
               uint64_t result = slot[idx * 2 + 2] - slot[idx * 2 + 1];

               /* WaDividePSInvocationCountBy4:HSW,BDW */
               if ((device->info.gen == 8 || device->info.is_haswell) &&
                   (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT)
                  result >>= 2;

               cpu_write_query_result(pData, flags, idx, result);
            }
            idx++;
         }
         assert(idx == util_bitcount(pool->pipeline_statistics));
         break;
      }

      case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
         uint64_t *slot = query_slot(pool, firstQuery + i);
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]);
         idx++;
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[4] - slot[3]);
         idx++;
         break;
      }

      case VK_QUERY_TYPE_TIMESTAMP: {
         uint64_t *slot = query_slot(pool, firstQuery + i);
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[1]);
         idx++;
         break;
      }

      case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
         if (!write_results)
            break;
         const void *query_data = query_slot(pool, firstQuery + i);
         const uint32_t *oa_begin = query_data + intel_perf_mi_rpc_offset(false);
         const uint32_t *oa_end = query_data + intel_perf_mi_rpc_offset(true);
         const uint32_t *rpstat_begin = query_data + intel_perf_rpstart_offset(false);
         const uint32_t *rpstat_end = query_data + intel_perf_rpstart_offset(true);
         struct gen_perf_query_result result;
         struct gen_perf_query_info metric = {
            .oa_format = (GEN_GEN >= 8 ?
                          I915_OA_FORMAT_A32u40_A4u32_B8_C8 :
                          I915_OA_FORMAT_A45_B8_C8),
         };
         uint32_t core_freq[2];
#if GEN_GEN < 9
         core_freq[0] = ((*rpstat_begin >> 7) & 0x7f) * 1000000ULL;
         core_freq[1] = ((*rpstat_end >> 7) & 0x7f) * 1000000ULL;
#else
         core_freq[0] = ((*rpstat_begin >> 23) & 0x1ff) * 1000000ULL;
         core_freq[1] = ((*rpstat_end >> 23) & 0x1ff) * 1000000ULL;
#endif
         gen_perf_query_result_clear(&result);
         gen_perf_query_result_accumulate(&result, &metric,
                                          oa_begin, oa_end);
         gen_perf_query_result_read_frequencies(&result, &device->info,
                                                oa_begin, oa_end);
         gen_perf_query_result_write_mdapi(pData, stride,
                                           &device->info,
                                           &result,
                                           core_freq[0], core_freq[1]);
         gen_perf_query_mdapi_write_perfcntr(pData, stride, &device->info,
                                             query_data + intel_perf_counter(false),
                                             query_data + intel_perf_counter(true));
         const uint64_t *marker = query_data + intel_perf_marker_offset();
         gen_perf_query_mdapi_write_marker(pData, stride, &device->info, *marker);
         break;
      }

      default:
         unreachable("invalid pool type");
      }

      if (!write_results)
         status = VK_NOT_READY;

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
         cpu_write_query_result(pData, flags, idx, available);

      pData += stride;
      if (pData >= data_end)
         break;
   }

   return status;
}

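/* Illustrative usage (not part of the driver): reading an occlusion pool
 * with 64-bit results plus availability and a 16-byte stride,
 *
 *    uint64_t results[2 * queryCount];
 *    vkGetQueryPoolResults(device, pool, 0, queryCount, sizeof(results),
 *                          results, 2 * sizeof(uint64_t),
 *                          VK_QUERY_RESULT_64_BIT |
 *                          VK_QUERY_RESULT_WITH_AVAILABILITY_BIT);
 *
 * yields results[2*i] = samples passed for query i (slot[2] - slot[1]
 * above) and results[2*i + 1] = its availability word.
 */
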
static void
emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer,
                    struct anv_address addr)
{
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType  = DAT_PPGTT;
      pc.PostSyncOperation       = WritePSDepthCount;
      pc.DepthStallEnable        = true;
      pc.Address                 = addr;

      if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
         pc.CommandStreamerStallEnable = true;
   }
}

static void
emit_query_mi_availability(struct gen_mi_builder *b,
                           struct anv_address addr,
                           bool available)
{
   gen_mi_store(b, gen_mi_mem64(addr), gen_mi_imm(available));
}

static void
emit_query_pc_availability(struct anv_cmd_buffer *cmd_buffer,
                           struct anv_address addr,
                           bool available)
{
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType  = DAT_PPGTT;
      pc.PostSyncOperation       = WriteImmediateData;
      pc.Address                 = addr;
      pc.ImmediateData           = available;
   }
}

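/* Which availability helper is used must match how the query values
 * themselves are written: PIPE_CONTROL-based queries (occlusion, timestamp)
 * use the PIPE_CONTROL variant and MI-based queries the MI variant, so the
 * availability write cannot overtake the data it guards.
 */
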
/**
 * Goes through a series of consecutive query indices in the given pool
 * setting all element values to 0 and emitting them as available.
 */
static void
emit_zero_queries(struct anv_cmd_buffer *cmd_buffer,
                  struct gen_mi_builder *b, struct anv_query_pool *pool,
                  uint32_t first_index, uint32_t num_queries)
{
   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
      /* These queries are written with a PIPE_CONTROL so clear them using the
       * PIPE_CONTROL as well so we don't have to synchronize between 2 types
       * of operations.
       */
      assert((pool->stride % 8) == 0);
      for (uint32_t i = 0; i < num_queries; i++) {
         struct anv_address slot_addr =
            anv_query_address(pool, first_index + i);

         for (uint32_t qword = 1; qword < (pool->stride / 8); qword++) {
            emit_query_pc_availability(cmd_buffer,
                                       anv_address_add(slot_addr, qword * 8),
                                       false);
         }
         emit_query_pc_availability(cmd_buffer, slot_addr, true);
      }
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      for (uint32_t i = 0; i < num_queries; i++) {
         struct anv_address slot_addr =
            anv_query_address(pool, first_index + i);
         gen_mi_memset(b, anv_address_add(slot_addr, 8), 0, pool->stride - 8);
         emit_query_mi_availability(b, slot_addr, true);
      }
      break;

   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL:
      for (uint32_t i = 0; i < num_queries; i++) {
         struct anv_address slot_addr =
            anv_query_address(pool, first_index + i);
         gen_mi_memset(b, slot_addr, 0, pool->stride - 8);
         emit_query_mi_availability(b, anv_address_add(slot_addr,
                                                       pool->stride - 8), true);
      }
      break;

   default:
      unreachable("Unsupported query type");
   }
}

void genX(CmdResetQueryPool)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
      for (uint32_t i = 0; i < queryCount; i++) {
         emit_query_pc_availability(cmd_buffer,
                                    anv_query_address(pool, firstQuery + i),
                                    false);
      }
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
      struct gen_mi_builder b;
      gen_mi_builder_init(&b, &cmd_buffer->batch);

      for (uint32_t i = 0; i < queryCount; i++)
         emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), false);
      break;
   }

   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
      struct gen_mi_builder b;
      gen_mi_builder_init(&b, &cmd_buffer->batch);

      for (uint32_t i = 0; i < queryCount; i++) {
         emit_query_mi_availability(
            &b,
            anv_address_add(
               anv_query_address(pool, firstQuery + i),
               pool->stride - 8),
            false);
      }
      break;
   }

   default:
      unreachable("Unsupported query type");
   }
}

void genX(ResetQueryPoolEXT)(
    VkDevice                                    _device,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount)
{
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   for (uint32_t i = 0; i < queryCount; i++) {
      uint64_t *slot = query_slot(pool, firstQuery + i);
      *slot = 0;
   }
}

static const uint32_t vk_pipeline_stat_to_reg[] = {
   GENX(IA_VERTICES_COUNT_num),
   GENX(IA_PRIMITIVES_COUNT_num),
   GENX(VS_INVOCATION_COUNT_num),
   GENX(GS_INVOCATION_COUNT_num),
   GENX(GS_PRIMITIVES_COUNT_num),
   GENX(CL_INVOCATION_COUNT_num),
   GENX(CL_PRIMITIVES_COUNT_num),
   GENX(PS_INVOCATION_COUNT_num),
   GENX(HS_INVOCATION_COUNT_num),
   GENX(DS_INVOCATION_COUNT_num),
   GENX(CS_INVOCATION_COUNT_num),
};

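/* The entries above are ordered to match the bit positions of the
 * corresponding VkQueryPipelineStatisticFlagBits values, which is what the
 * STATIC_ASSERT in emit_pipeline_stat() below relies on.
 */
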
static void
emit_pipeline_stat(struct gen_mi_builder *b, uint32_t stat,
                   struct anv_address addr)
{
   STATIC_ASSERT(ANV_PIPELINE_STATISTICS_MASK ==
                 (1 << ARRAY_SIZE(vk_pipeline_stat_to_reg)) - 1);

   assert(stat < ARRAY_SIZE(vk_pipeline_stat_to_reg));
   gen_mi_store(b, gen_mi_mem64(addr),
                gen_mi_reg64(vk_pipeline_stat_to_reg[stat]));
}

static void
emit_xfb_query(struct gen_mi_builder *b, uint32_t stream,
               struct anv_address addr)
{
   assert(stream < MAX_XFB_STREAMS);

   gen_mi_store(b, gen_mi_mem64(anv_address_add(addr, 0)),
                gen_mi_reg64(GENX(SO_NUM_PRIMS_WRITTEN0_num) + stream * 8));
   gen_mi_store(b, gen_mi_mem64(anv_address_add(addr, 16)),
                gen_mi_reg64(GENX(SO_PRIM_STORAGE_NEEDED0_num) + stream * 8));
}

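/* emit_xfb_query() is called with addr = slot + 8 at begin and slot + 16 at
 * end, so a transform feedback slot reads as [1] begin/[2] end primitives
 * written and [3] begin/[4] end storage needed, matching the
 * slot[2] - slot[1] and slot[4] - slot[3] subtractions in
 * GetQueryPoolResults.
 */
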
void genX(CmdBeginQuery)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query,
    VkQueryControlFlags                         flags)
{
   genX(CmdBeginQueryIndexedEXT)(commandBuffer, queryPool, query, flags, 0);
}

void genX(CmdBeginQueryIndexedEXT)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query,
    VkQueryControlFlags                         flags,
    uint32_t                                    index)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   struct anv_address query_addr = anv_query_address(pool, query);

   struct gen_mi_builder b;
   gen_mi_builder_init(&b, &cmd_buffer->batch);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 8));
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      /* TODO: This might only be necessary for certain stats */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      uint32_t statistics = pool->pipeline_statistics;
      uint32_t offset = 8;
      while (statistics) {
         uint32_t stat = u_bit_scan(&statistics);
         emit_pipeline_stat(&b, stat, anv_address_add(query_addr, offset));
         offset += 16;
      }
      break;
   }

   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
      emit_xfb_query(&b, index, anv_address_add(query_addr, 8));
      break;

   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_REPORT_PERF_COUNT), rpc) {
         rpc.MemoryAddress =
            anv_address_add(query_addr, intel_perf_mi_rpc_offset(false));
      }
#if GEN_GEN < 9
      gen_mi_store(&b,
                   gen_mi_mem32(anv_address_add(query_addr,
                                                intel_perf_rpstart_offset(false))),
                   gen_mi_reg32(GENX(RPSTAT1_num)));
#else
      gen_mi_store(&b,
                   gen_mi_mem32(anv_address_add(query_addr,
                                                intel_perf_rpstart_offset(false))),
                   gen_mi_reg32(GENX(RPSTAT0_num)));
#endif
#if GEN_GEN >= 8 && GEN_GEN <= 11
      gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr,
                                                    intel_perf_counter(false))),
                   gen_mi_reg64(GENX(PERFCNT1_num)));
      gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr,
                                                    intel_perf_counter(false) + 8)),
                   gen_mi_reg64(GENX(PERFCNT2_num)));
#endif
      break;
   }

   default:
      unreachable("");
   }
}

void genX(CmdEndQuery)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query)
{
   genX(CmdEndQueryIndexedEXT)(commandBuffer, queryPool, query, 0);
}

void genX(CmdEndQueryIndexedEXT)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query,
    uint32_t                                    index)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   struct anv_address query_addr = anv_query_address(pool, query);

   struct gen_mi_builder b;
   gen_mi_builder_init(&b, &cmd_buffer->batch);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 16));
      emit_query_pc_availability(cmd_buffer, query_addr, true);
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      /* TODO: This might only be necessary for certain stats */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      uint32_t statistics = pool->pipeline_statistics;
      uint32_t offset = 16;
      while (statistics) {
         uint32_t stat = u_bit_scan(&statistics);
         emit_pipeline_stat(&b, stat, anv_address_add(query_addr, offset));
         offset += 16;
      }

      emit_query_mi_availability(&b, query_addr, true);
      break;
   }

   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      emit_xfb_query(&b, index, anv_address_add(query_addr, 16));
      emit_query_mi_availability(&b, query_addr, true);
      break;

   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      uint32_t marker_offset = intel_perf_marker_offset();
      gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, marker_offset)),
                   gen_mi_imm(cmd_buffer->intel_perf_marker));
#if GEN_GEN >= 8 && GEN_GEN <= 11
      gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, intel_perf_counter(true))),
                   gen_mi_reg64(GENX(PERFCNT1_num)));
      gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, intel_perf_counter(true) + 8)),
                   gen_mi_reg64(GENX(PERFCNT2_num)));
#endif
#if GEN_GEN < 9
      gen_mi_store(&b,
                   gen_mi_mem32(anv_address_add(query_addr,
                                                intel_perf_rpstart_offset(true))),
                   gen_mi_reg32(GENX(RPSTAT1_num)));
#else
      gen_mi_store(&b,
                   gen_mi_mem32(anv_address_add(query_addr,
                                                intel_perf_rpstart_offset(true))),
                   gen_mi_reg32(GENX(RPSTAT0_num)));
#endif

      /* Position the last OA snapshot at the beginning of the query so that
       * we can tell whether it's ready.
       */
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_REPORT_PERF_COUNT), rpc) {
         rpc.MemoryAddress = anv_address_add(query_addr,
                                             intel_perf_mi_rpc_offset(true));
         rpc.ReportID = 0xdeadbeef; /* This goes in the first dword */
      }

      emit_query_mi_availability(&b,
                                 anv_address_add(query_addr, pool->stride - 8),
                                 true);
      break;
   }

   default:
      unreachable("");
   }

   /* When multiview is active the spec requires that N consecutive query
    * indices are used, where N is the number of active views in the subpass.
    * The spec allows that we only write the results to one of the queries
    * but we still need to manage result availability for all the query indices.
    * Since we only emit a single query for all active views in the
    * first index, mark the other query indices as being already available
    * with result 0.
    */
   if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
      const uint32_t num_queries =
         util_bitcount(cmd_buffer->state.subpass->view_mask);
      if (num_queries > 1)
         emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1);
   }
}

#define TIMESTAMP 0x2358

void genX(CmdWriteTimestamp)(
    VkCommandBuffer                             commandBuffer,
    VkPipelineStageFlagBits                     pipelineStage,
    VkQueryPool                                 queryPool,
    uint32_t                                    query)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   struct anv_address query_addr = anv_query_address(pool, query);

   assert(pool->type == VK_QUERY_TYPE_TIMESTAMP);

   struct gen_mi_builder b;
   gen_mi_builder_init(&b, &cmd_buffer->batch);

   switch (pipelineStage) {
   case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
      gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, 8)),
                   gen_mi_reg64(TIMESTAMP));
      break;

   default:
      /* Everything else is bottom-of-pipe */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.DestinationAddressType  = DAT_PPGTT;
         pc.PostSyncOperation       = WriteTimestamp;
         pc.Address                 = anv_address_add(query_addr, 8);

         if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
            pc.CommandStreamerStallEnable = true;
      }
      break;
   }

   emit_query_pc_availability(cmd_buffer, query_addr, true);

   /* When multiview is active the spec requires that N consecutive query
    * indices are used, where N is the number of active views in the subpass.
    * The spec allows that we only write the results to one of the queries
    * but we still need to manage result availability for all the query indices.
    * Since we only emit a single query for all active views in the
    * first index, mark the other query indices as being already available
    * with result 0.
    */
   if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
      const uint32_t num_queries =
         util_bitcount(cmd_buffer->state.subpass->view_mask);
      if (num_queries > 1)
         emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1);
   }
}

#if GEN_GEN > 7 || GEN_IS_HASWELL

static void
gpu_write_query_result(struct gen_mi_builder *b,
                       struct anv_address dst_addr,
                       VkQueryResultFlags flags,
                       uint32_t value_index,
                       struct gen_mi_value query_result)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      struct anv_address res_addr = anv_address_add(dst_addr, value_index * 8);
      gen_mi_store(b, gen_mi_mem64(res_addr), query_result);
   } else {
      struct anv_address res_addr = anv_address_add(dst_addr, value_index * 4);
      gen_mi_store(b, gen_mi_mem32(res_addr), query_result);
   }
}

static struct gen_mi_value
compute_query_result(struct gen_mi_builder *b, struct anv_address addr)
{
   return gen_mi_isub(b, gen_mi_mem64(anv_address_add(addr, 8)),
                      gen_mi_mem64(anv_address_add(addr, 0)));
}

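/* i.e. an "end - begin" subtraction done on the command streamer: addr
 * points at the begin value and the matching end value lives 8 bytes later.
 */
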
void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    VkBuffer                                    destBuffer,
    VkDeviceSize                                destOffset,
    VkDeviceSize                                destStride,
    VkQueryResultFlags                          flags)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);

   struct gen_mi_builder b;
   gen_mi_builder_init(&b, &cmd_buffer->batch);
   struct gen_mi_value result;

   /* If render target writes are ongoing, request a render target cache flush
    * to ensure proper ordering of the commands from the 3d pipe and the
    * command streamer.
    */
   if (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_RENDER_TARGET_BUFFER_WRITES) {
      cmd_buffer->state.pending_pipe_bits |=
         ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
   }

   if ((flags & VK_QUERY_RESULT_WAIT_BIT) ||
       (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_FLUSH_BITS) ||
       /* Occlusion & timestamp queries are written using a PIPE_CONTROL and
        * because we're about to copy values from MI commands, we need to
        * stall the command streamer to make sure the PIPE_CONTROL values have
        * landed, otherwise we could see inconsistent values & availability.
        *
        * From the Vulkan spec:
        *
        *    "vkCmdCopyQueryPoolResults is guaranteed to see the effect of
        *    previous uses of vkCmdResetQueryPool in the same queue, without
        *    any additional synchronization."
        */
       pool->type == VK_QUERY_TYPE_OCCLUSION ||
       pool->type == VK_QUERY_TYPE_TIMESTAMP) {
      cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT;
      genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
   }

   struct anv_address dest_addr = anv_address_add(buffer->address, destOffset);
   for (uint32_t i = 0; i < queryCount; i++) {
      struct anv_address query_addr = anv_query_address(pool, firstQuery + i);
      uint32_t idx = 0;
      switch (pool->type) {
      case VK_QUERY_TYPE_OCCLUSION:
         result = compute_query_result(&b, anv_address_add(query_addr, 8));
         gpu_write_query_result(&b, dest_addr, flags, idx++, result);
         break;

      case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
         uint32_t statistics = pool->pipeline_statistics;
         while (statistics) {
            uint32_t stat = u_bit_scan(&statistics);

            result = compute_query_result(&b, anv_address_add(query_addr,
                                                              idx * 16 + 8));

            /* WaDividePSInvocationCountBy4:HSW,BDW */
            if ((cmd_buffer->device->info.gen == 8 ||
                 cmd_buffer->device->info.is_haswell) &&
                (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) {
               result = gen_mi_ushr32_imm(&b, result, 2);
            }

            gpu_write_query_result(&b, dest_addr, flags, idx++, result);
         }
         assert(idx == util_bitcount(pool->pipeline_statistics));
         break;
      }

      case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
         result = compute_query_result(&b, anv_address_add(query_addr, 8));
         gpu_write_query_result(&b, dest_addr, flags, idx++, result);
         result = compute_query_result(&b, anv_address_add(query_addr, 24));
         gpu_write_query_result(&b, dest_addr, flags, idx++, result);
         break;

      case VK_QUERY_TYPE_TIMESTAMP:
         result = gen_mi_mem64(anv_address_add(query_addr, 8));
         gpu_write_query_result(&b, dest_addr, flags, 0, result);
         break;

      default:
         unreachable("unhandled query type");
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         gpu_write_query_result(&b, dest_addr, flags, idx,
                                gen_mi_mem64(query_addr));
      }

      dest_addr = anv_address_add(dest_addr, destStride);
   }
}

#else

void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    VkBuffer                                    destBuffer,
    VkDeviceSize                                destOffset,
    VkDeviceSize                                destStride,
    VkQueryResultFlags                          flags)
{
   anv_finishme("Queries not yet supported on Ivy Bridge");
}

#endif