/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
#include "anv_private.h"

#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"

/* We reserve GPR 14 and 15 for conditional rendering */
#define GEN_MI_BUILDER_NUM_ALLOC_GPRS 14
#define __gen_get_batch_dwords anv_batch_emit_dwords
#define __gen_address_offset anv_address_add
#include "common/gen_mi_builder.h"
#include "perf/gen_perf.h"
#include "perf/gen_perf_mdapi.h"

#define OA_REPORT_N_UINT64 (256 / sizeof(uint64_t))
VkResult genX(CreateQueryPool)(
    VkDevice                                    _device,
    const VkQueryPoolCreateInfo*                pCreateInfo,
    const VkAllocationCallbacks*                pAllocator,
    VkQueryPool*                                pQueryPool)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   const struct anv_physical_device *pdevice = device->physical;
   struct anv_query_pool *pool;
   VkResult result;

   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
   /* Query pool slots are made up of some number of 64-bit values packed
    * tightly together. For most query types, the first 64-bit value is the
    * "available" bit, which is 0 when the query is unavailable and 1 when it
    * is available. The 64-bit values that follow are determined by the type
    * of query.
    *
    * For performance queries, we have a requirement to align OA reports to
    * 64 bytes, so we put those first and keep the "available" bit behind
    * them, together with some other counters.
    */
   uint32_t uint64s_per_slot = 1;
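
   /* Worked example of the sizing below: an occlusion query adds begin/end
    * values to the availability uint64, so uint64s_per_slot becomes 3 and
    * the stride 3 * sizeof(uint64_t) = 24 bytes; a performance query instead
    * uses a fixed 72 * 8 = 576 bytes, matching the layout diagram further
    * down in this file.
    */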
   VkQueryPipelineStatisticFlags pipeline_statistics = 0;
   switch (pCreateInfo->queryType) {
   case VK_QUERY_TYPE_OCCLUSION:
      /* Occlusion queries have two values: begin and end. */
      uint64s_per_slot += 2;
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
      /* Timestamps just have the one timestamp value */
      uint64s_per_slot += 1;
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      pipeline_statistics = pCreateInfo->pipelineStatistics;
      /* We're going to trust this field implicitly so we need to ensure that
       * no unhandled extension bits leak in.
       */
      pipeline_statistics &= ANV_PIPELINE_STATISTICS_MASK;

      /* Statistics queries have a min and max for every statistic */
      uint64s_per_slot += 2 * util_bitcount(pipeline_statistics);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      /* Transform feedback queries are 4 values, begin/end for the number
       * of primitives written and begin/end for the number of primitives
       * needed.
       */
      uint64s_per_slot += 4;
      break;
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
      uint64s_per_slot = 72; /* 576 bytes, see layout below */
      break;
   }
   default:
      assert(!"Invalid query type");
   }
   pool = vk_alloc2(&device->vk.alloc, pAllocator, sizeof(*pool), 8,
                    VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (pool == NULL)
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
   vk_object_base_init(&device->vk, &pool->base, VK_OBJECT_TYPE_QUERY_POOL);
   pool->type = pCreateInfo->queryType;
   pool->pipeline_statistics = pipeline_statistics;
   pool->stride = uint64s_per_slot * sizeof(uint64_t);
   pool->slots = pCreateInfo->queryCount;
   uint32_t bo_flags = 0;
   if (pdevice->supports_48bit_addresses)
      bo_flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS;

   if (pdevice->use_softpin)
      bo_flags |= EXEC_OBJECT_PINNED;

   if (pdevice->has_exec_async)
      bo_flags |= EXEC_OBJECT_ASYNC;
   uint64_t size = pool->slots * pool->stride;
   result = anv_device_alloc_bo(device, size,
                                ANV_BO_ALLOC_MAPPED |
                                ANV_BO_ALLOC_SNOOPED,
                                0 /* explicit_address */,
                                &pool->bo);
   if (result != VK_SUCCESS)
      goto fail;

   *pQueryPool = anv_query_pool_to_handle(pool);

   return VK_SUCCESS;

 fail:
   vk_free2(&device->vk.alloc, pAllocator, pool);

   return result;
}
void genX(DestroyQueryPool)(
    VkDevice                                    _device,
    VkQueryPool                                 _pool,
    const VkAllocationCallbacks*                pAllocator)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_query_pool, pool, _pool);

   if (!pool)
      return;

   anv_device_release_bo(device, pool->bo);
   vk_object_base_finish(&pool->base);
   vk_free2(&device->vk.alloc, pAllocator, pool);
}
static struct anv_address
anv_query_address(struct anv_query_pool *pool, uint32_t query)
{
   return (struct anv_address) {
      .bo = pool->bo,
      .offset = query * pool->stride,
   };
}
/**
 * VK_INTEL_performance_query layout (576 bytes) :
 *
 * ------------------------------
 * |       availability (8b)    |
 * |----------------------------|
 * |         marker (8b)        |
 * |----------------------------|
 * | begin RPSTAT register (4b) |
 * |----------------------------|
 * | end RPSTAT register (4b)   |
 * |----------------------------|
 * | begin perfcntr 1 & 2 (16b) |
 * |----------------------------|
 * | end perfcntr 1 & 2 (16b)   |
 * |----------------------------|
 * |         unused (8b)        |
 * |----------------------------|
 * |     begin MI_RPC (256b)    |
 * |----------------------------|
 * |       end MI_RPC (256b)    |
 * ------------------------------
 */
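
/* Offset bookkeeping implied by the diagram above: availability at byte 0,
 * marker at 8, begin/end RPSTAT at 16/20, begin/end perfcntr pairs at 24/40,
 * 8 unused bytes to reach the 64-byte alignment needed by OA reports, then
 * the two 256-byte MI_RPC reports at 64/320, i.e. 576 bytes = 72 uint64s in
 * total. The helpers below encode exactly these offsets.
 */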
static uint32_t
intel_perf_marker_offset(void)
{
   return 8;
}

static uint32_t
intel_perf_rpstart_offset(bool end)
{
   return 16 + (end ? sizeof(uint32_t) : 0);
}

#if GEN_GEN >= 8 && GEN_GEN <= 11
static uint32_t
intel_perf_counter(bool end)
{
   return 24 + (end ? (2 * sizeof(uint64_t)) : 0);
}
#endif

static uint32_t
intel_perf_mi_rpc_offset(bool end)
{
   return 64 + (end ? 256 : 0);
}
static void
cpu_write_query_result(void *dst_slot, VkQueryResultFlags flags,
                       uint32_t value_index, uint64_t result)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      uint64_t *dst64 = dst_slot;
      dst64[value_index] = result;
   } else {
      uint32_t *dst32 = dst_slot;
      dst32[value_index] = result;
   }
}
static void *
query_slot(struct anv_query_pool *pool, uint32_t query)
{
   return pool->bo->map + query * pool->stride;
}

static bool
query_is_available(struct anv_query_pool *pool, uint32_t query)
{
   return *(volatile uint64_t *)query_slot(pool, query);
}
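
/* Availability lives in the first uint64 of a slot for every query type
 * handled here (including the performance query layout above). The volatile
 * read in query_is_available() keeps the compiler from caching a stale value
 * while the wait loop below polls memory that the GPU writes asynchronously.
 */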
static VkResult
wait_for_available(struct anv_device *device,
                   struct anv_query_pool *pool, uint32_t query)
{
   uint64_t abs_timeout = anv_get_absolute_timeout(5 * NSEC_PER_SEC);

   while (anv_gettime_ns() < abs_timeout) {
      if (query_is_available(pool, query))
         return VK_SUCCESS;
      VkResult status = anv_device_query_status(device);
      if (status != VK_SUCCESS)
         return status;
   }

   return anv_device_set_lost(device, "query timeout");
}
VkResult genX(GetQueryPoolResults)(
    VkDevice                                    _device,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    size_t                                      dataSize,
    void*                                       pData,
    VkDeviceSize                                stride,
    VkQueryResultFlags                          flags)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   assert(pool->type == VK_QUERY_TYPE_OCCLUSION ||
          pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
          pool->type == VK_QUERY_TYPE_TIMESTAMP ||
          pool->type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT ||
          pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL);

   if (anv_device_is_lost(device))
      return VK_ERROR_DEVICE_LOST;

   if (pData == NULL)
      return VK_SUCCESS;

   void *data_end = pData + dataSize;
   VkResult status = VK_SUCCESS;
   for (uint32_t i = 0; i < queryCount; i++) {
      bool available = query_is_available(pool, firstQuery + i);

      if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) {
         status = wait_for_available(device, pool, firstQuery + i);
         if (status != VK_SUCCESS)
            return status;

         available = true;
      }

      /* From the Vulkan 1.0.42 spec:
       *
       *    "If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
       *    both not set then no result values are written to pData for
       *    queries that are in the unavailable state at the time of the call,
       *    and vkGetQueryPoolResults returns VK_NOT_READY. However,
       *    availability state is still written to pData for those queries if
       *    VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set."
       */
      bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT);
      uint32_t idx = 0;
      switch (pool->type) {
      case VK_QUERY_TYPE_OCCLUSION: {
         uint64_t *slot = query_slot(pool, firstQuery + i);
         if (write_results) {
            /* From the Vulkan 1.2.132 spec:
             *
             *    "If VK_QUERY_RESULT_PARTIAL_BIT is set,
             *    VK_QUERY_RESULT_WAIT_BIT is not set, and the query’s status
             *    is unavailable, an intermediate result value between zero
             *    and the final result value is written to pData for that
             *    query."
             */
            uint64_t result = available ? slot[2] - slot[1] : 0;
            cpu_write_query_result(pData, flags, idx, result);
         }
         idx++;
         break;
      }
      case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
         uint64_t *slot = query_slot(pool, firstQuery + i);
         uint32_t statistics = pool->pipeline_statistics;
         while (statistics) {
            uint32_t stat = u_bit_scan(&statistics);
            if (write_results) {
               uint64_t result = slot[idx * 2 + 2] - slot[idx * 2 + 1];

               /* WaDividePSInvocationCountBy4:HSW,BDW */
               if ((device->info.gen == 8 || device->info.is_haswell) &&
                   (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT)
                  result >>= 2;

               cpu_write_query_result(pData, flags, idx, result);
            }
            idx++;
         }
         assert(idx == util_bitcount(pool->pipeline_statistics));
         break;
      }
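
      /* A note on the indexing above: a statistics slot is the availability
       * uint64 followed by one begin/end pair per enabled statistic, so for
       * pair idx the begin value is slot[idx * 2 + 1] (byte offset
       * idx * 16 + 8) and the end value is slot[idx * 2 + 2], matching the
       * offsets emit_pipeline_stat() is given in CmdBeginQueryIndexedEXT
       * (starting at 8) and CmdEndQueryIndexedEXT (starting at 16) below.
       */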
      case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
         uint64_t *slot = query_slot(pool, firstQuery + i);
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]);
         idx++;
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[4] - slot[3]);
         idx++;
         break;
      }
      case VK_QUERY_TYPE_TIMESTAMP: {
         uint64_t *slot = query_slot(pool, firstQuery + i);
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[1]);
         idx++;
         break;
      }
      case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
         if (!write_results)
            break;
         const void *query_data = query_slot(pool, firstQuery + i);
         const uint32_t *oa_begin = query_data + intel_perf_mi_rpc_offset(false);
         const uint32_t *oa_end = query_data + intel_perf_mi_rpc_offset(true);
         const uint32_t *rpstat_begin = query_data + intel_perf_rpstart_offset(false);
         const uint32_t *rpstat_end = query_data + intel_perf_rpstart_offset(true);
         struct gen_perf_query_result result;
         struct gen_perf_query_info metric = {
            .oa_format = (GEN_GEN >= 8 ?
                          I915_OA_FORMAT_A32u40_A4u32_B8_C8 :
                          I915_OA_FORMAT_A45_B8_C8),
         };
         uint32_t core_freq[2];
#if GEN_GEN < 9
         core_freq[0] = ((*rpstat_begin >> 7) & 0x7f) * 1000000ULL;
         core_freq[1] = ((*rpstat_end >> 7) & 0x7f) * 1000000ULL;
#else
         core_freq[0] = ((*rpstat_begin >> 23) & 0x1ff) * 1000000ULL;
         core_freq[1] = ((*rpstat_end >> 23) & 0x1ff) * 1000000ULL;
#endif
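
         /* Field decoding note (a reading derived from the masks above, not
          * from documentation): gen < 9 exposes the current core frequency
          * in bits 13:7 of RPSTAT1, gen9+ in bits 31:23 of RPSTAT0; the
          * multiply assumes the field unit works out to MHz, yielding Hz in
          * core_freq[].
          */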
         gen_perf_query_result_clear(&result);
         gen_perf_query_result_accumulate(&result, &metric,
                                          oa_begin, oa_end);
         gen_perf_query_result_read_frequencies(&result, &device->info,
                                                oa_begin, oa_end);
         gen_perf_query_result_write_mdapi(pData, stride,
                                           &device->info,
                                           &result,
                                           core_freq[0], core_freq[1]);
#if GEN_GEN >= 8 && GEN_GEN <= 11
         gen_perf_query_mdapi_write_perfcntr(pData, stride, &device->info,
                                             query_data + intel_perf_counter(false),
                                             query_data + intel_perf_counter(true));
#endif
         const uint64_t *marker = query_data + intel_perf_marker_offset();
         gen_perf_query_mdapi_write_marker(pData, stride, &device->info, *marker);
         break;
      }

      default:
         unreachable("invalid pool type");
      }
      if (!write_results)
         status = VK_NOT_READY;

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
         cpu_write_query_result(pData, flags, idx, available);

      pData += stride;
      if (pData >= data_end)
         break;
   }

   return status;
}
static void
emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer,
                    struct anv_address addr)
{
   cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType  = DAT_PPGTT;
      pc.PostSyncOperation       = WritePSDepthCount;
      pc.DepthStallEnable        = true;
      pc.Address                 = addr;

      if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
         pc.CommandStreamerStallEnable = true;
   }
}
static void
emit_query_mi_availability(struct gen_mi_builder *b,
                           struct anv_address addr,
                           bool available)
{
   gen_mi_store(b, gen_mi_mem64(addr), gen_mi_imm(available));
}
static void
emit_query_pc_availability(struct anv_cmd_buffer *cmd_buffer,
                           struct anv_address addr,
                           bool available)
{
   cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType  = DAT_PPGTT;
      pc.PostSyncOperation       = WriteImmediateData;
      pc.Address                 = addr;
      pc.ImmediateData           = available;
   }
}
/**
 * Goes through a series of consecutive query indices in the given pool
 * setting all element values to 0 and emitting them as available.
 */
static void
emit_zero_queries(struct anv_cmd_buffer *cmd_buffer,
                  struct gen_mi_builder *b, struct anv_query_pool *pool,
                  uint32_t first_index, uint32_t num_queries)
{
   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
      /* These queries are written with a PIPE_CONTROL so clear them using the
       * PIPE_CONTROL as well so we don't have to synchronize between 2 types
       * of commands.
       */
      assert((pool->stride % 8) == 0);
      for (uint32_t i = 0; i < num_queries; i++) {
         struct anv_address slot_addr =
            anv_query_address(pool, first_index + i);

         for (uint32_t qword = 1; qword < (pool->stride / 8); qword++) {
            emit_query_pc_availability(cmd_buffer,
                                       anv_address_add(slot_addr, qword * 8),
                                       false);
         }
         emit_query_pc_availability(cmd_buffer, slot_addr, true);
      }
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      for (uint32_t i = 0; i < num_queries; i++) {
         struct anv_address slot_addr =
            anv_query_address(pool, first_index + i);
         gen_mi_memset(b, anv_address_add(slot_addr, 8), 0, pool->stride - 8);
         emit_query_mi_availability(b, slot_addr, true);
      }
      break;

   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL:
      for (uint32_t i = 0; i < num_queries; i++) {
         struct anv_address slot_addr =
            anv_query_address(pool, first_index + i);
         gen_mi_memset(b, anv_address_add(slot_addr, 8), 0, pool->stride - 8);
         emit_query_mi_availability(b, slot_addr, true);
      }
      break;

   default:
      unreachable("Unsupported query type");
   }
}
void genX(CmdResetQueryPool)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
      for (uint32_t i = 0; i < queryCount; i++) {
         emit_query_pc_availability(cmd_buffer,
                                    anv_query_address(pool, firstQuery + i),
                                    false);
      }
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
      struct gen_mi_builder b;
      gen_mi_builder_init(&b, &cmd_buffer->batch);

      for (uint32_t i = 0; i < queryCount; i++)
         emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), false);
      break;
   }

   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
      struct gen_mi_builder b;
      gen_mi_builder_init(&b, &cmd_buffer->batch);

      for (uint32_t i = 0; i < queryCount; i++)
         emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), false);
      break;
   }

   default:
      unreachable("Unsupported query type");
   }
}
void genX(ResetQueryPool)(
    VkDevice                                    _device,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount)
{
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   for (uint32_t i = 0; i < queryCount; i++) {
      uint64_t *slot = query_slot(pool, firstQuery + i);
      *slot = 0;
   }
}
static const uint32_t vk_pipeline_stat_to_reg[] = {
   GENX(IA_VERTICES_COUNT_num),
   GENX(IA_PRIMITIVES_COUNT_num),
   GENX(VS_INVOCATION_COUNT_num),
   GENX(GS_INVOCATION_COUNT_num),
   GENX(GS_PRIMITIVES_COUNT_num),
   GENX(CL_INVOCATION_COUNT_num),
   GENX(CL_PRIMITIVES_COUNT_num),
   GENX(PS_INVOCATION_COUNT_num),
   GENX(HS_INVOCATION_COUNT_num),
   GENX(DS_INVOCATION_COUNT_num),
   GENX(CS_INVOCATION_COUNT_num),
};
static void
emit_pipeline_stat(struct gen_mi_builder *b, uint32_t stat,
                   struct anv_address addr)
{
   STATIC_ASSERT(ANV_PIPELINE_STATISTICS_MASK ==
                 (1 << ARRAY_SIZE(vk_pipeline_stat_to_reg)) - 1);

   assert(stat < ARRAY_SIZE(vk_pipeline_stat_to_reg));
   gen_mi_store(b, gen_mi_mem64(addr),
                gen_mi_reg64(vk_pipeline_stat_to_reg[stat]));
}
static void
emit_xfb_query(struct gen_mi_builder *b, uint32_t stream,
               struct anv_address addr)
{
   assert(stream < MAX_XFB_STREAMS);

   gen_mi_store(b, gen_mi_mem64(anv_address_add(addr, 0)),
                gen_mi_reg64(GENX(SO_NUM_PRIMS_WRITTEN0_num) + stream * 8));
   gen_mi_store(b, gen_mi_mem64(anv_address_add(addr, 16)),
                gen_mi_reg64(GENX(SO_PRIM_STORAGE_NEEDED0_num) + stream * 8));
}
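
/* With the begin/end base addresses passed in by CmdBeginQueryIndexedEXT
 * (slot + 8) and CmdEndQueryIndexedEXT (slot + 16) below, an XFB slot reads:
 * slot[0] availability, slot[1]/slot[2] begin/end primitives written, and
 * slot[3]/slot[4] begin/end primitives needed, which is exactly what the
 * slot[2] - slot[1] and slot[4] - slot[3] subtractions in
 * GetQueryPoolResults consume.
 */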
void genX(CmdBeginQuery)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query,
    VkQueryControlFlags                         flags)
{
   genX(CmdBeginQueryIndexedEXT)(commandBuffer, queryPool, query, flags, 0);
}
void genX(CmdBeginQueryIndexedEXT)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query,
    VkQueryControlFlags                         flags,
    uint32_t                                    index)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   struct anv_address query_addr = anv_query_address(pool, query);

   struct gen_mi_builder b;
   gen_mi_builder_init(&b, &cmd_buffer->batch);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 8));
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      /* TODO: This might only be necessary for certain stats */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      uint32_t statistics = pool->pipeline_statistics;
      uint32_t offset = 8;
      while (statistics) {
         uint32_t stat = u_bit_scan(&statistics);
         emit_pipeline_stat(&b, stat, anv_address_add(query_addr, offset));
         offset += 16;
      }
      break;
   }
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
      emit_xfb_query(&b, index, anv_address_add(query_addr, 8));
      break;
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_REPORT_PERF_COUNT), rpc) {
         rpc.MemoryAddress =
            anv_address_add(query_addr, intel_perf_mi_rpc_offset(false));
      }
#if GEN_GEN < 9
      gen_mi_store(&b,
                   gen_mi_mem32(anv_address_add(query_addr,
                                                intel_perf_rpstart_offset(false))),
                   gen_mi_reg32(GENX(RPSTAT1_num)));
#else
      gen_mi_store(&b,
                   gen_mi_mem32(anv_address_add(query_addr,
                                                intel_perf_rpstart_offset(false))),
                   gen_mi_reg32(GENX(RPSTAT0_num)));
#endif
#if GEN_GEN >= 8 && GEN_GEN <= 11
      gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr,
                                                    intel_perf_counter(false))),
                   gen_mi_reg64(GENX(PERFCNT1_num)));
      gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr,
                                                    intel_perf_counter(false) + 8)),
                   gen_mi_reg64(GENX(PERFCNT2_num)));
#endif
      break;
   }

   default:
      unreachable("");
   }
}
void genX(CmdEndQuery)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query)
{
   genX(CmdEndQueryIndexedEXT)(commandBuffer, queryPool, query, 0);
}
void genX(CmdEndQueryIndexedEXT)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query,
    uint32_t                                    index)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   struct anv_address query_addr = anv_query_address(pool, query);

   struct gen_mi_builder b;
   gen_mi_builder_init(&b, &cmd_buffer->batch);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 16));
      emit_query_pc_availability(cmd_buffer, query_addr, true);
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      /* TODO: This might only be necessary for certain stats */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      uint32_t statistics = pool->pipeline_statistics;
      uint32_t offset = 16;
      while (statistics) {
         uint32_t stat = u_bit_scan(&statistics);
         emit_pipeline_stat(&b, stat, anv_address_add(query_addr, offset));
         offset += 16;
      }

      emit_query_mi_availability(&b, query_addr, true);
      break;
   }
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
      emit_xfb_query(&b, index, anv_address_add(query_addr, 16));
      emit_query_mi_availability(&b, query_addr, true);
      break;
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
      uint32_t marker_offset = intel_perf_marker_offset();
      gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, marker_offset)),
                   gen_mi_imm(cmd_buffer->intel_perf_marker));
#if GEN_GEN >= 8 && GEN_GEN <= 11
      gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, intel_perf_counter(true))),
                   gen_mi_reg64(GENX(PERFCNT1_num)));
      gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, intel_perf_counter(true) + 8)),
                   gen_mi_reg64(GENX(PERFCNT2_num)));
#endif
#if GEN_GEN < 9
      gen_mi_store(&b,
                   gen_mi_mem32(anv_address_add(query_addr,
                                                intel_perf_rpstart_offset(true))),
                   gen_mi_reg32(GENX(RPSTAT1_num)));
#else
      gen_mi_store(&b,
                   gen_mi_mem32(anv_address_add(query_addr,
                                                intel_perf_rpstart_offset(true))),
                   gen_mi_reg32(GENX(RPSTAT0_num)));
#endif
      /* Position the last OA snapshot at the beginning of the query so that
       * we can tell whether it's ready.
       */
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_REPORT_PERF_COUNT), rpc) {
         rpc.MemoryAddress = anv_address_add(query_addr,
                                             intel_perf_mi_rpc_offset(true));
         rpc.ReportID = 0xdeadbeef; /* This goes in the first dword */
      }
      emit_query_mi_availability(&b, query_addr, true);
      break;
   }

   default:
      unreachable("");
   }
   /* When multiview is active the spec requires that N consecutive query
    * indices are used, where N is the number of active views in the subpass.
    * The spec allows that we only write the results to one of the queries
    * but we still need to manage result availability for all the query
    * indices. Since we only emit a single query for all active views in the
    * first index, mark the other query indices as being already available
    * with result 0.
    */
   if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
      const uint32_t num_queries =
         util_bitcount(cmd_buffer->state.subpass->view_mask);
      if (num_queries > 1)
         emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1);
   }
}
#define TIMESTAMP 0x2358
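
/* 0x2358 is the render command streamer's TIMESTAMP register; reading it via
 * gen_mi_reg64() below provides the top-of-pipe path, while every other
 * pipeline stage falls back to a bottom-of-pipe PIPE_CONTROL timestamp
 * write.
 */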
void genX(CmdWriteTimestamp)(
    VkCommandBuffer                             commandBuffer,
    VkPipelineStageFlagBits                     pipelineStage,
    VkQueryPool                                 queryPool,
    uint32_t                                    query)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   struct anv_address query_addr = anv_query_address(pool, query);

   assert(pool->type == VK_QUERY_TYPE_TIMESTAMP);

   struct gen_mi_builder b;
   gen_mi_builder_init(&b, &cmd_buffer->batch);

   switch (pipelineStage) {
   case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
      gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, 8)),
                   gen_mi_reg64(TIMESTAMP));
      break;
   default:
      /* Everything else is bottom-of-pipe */
      cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
      genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.DestinationAddressType  = DAT_PPGTT;
         pc.PostSyncOperation       = WriteTimestamp;
         pc.Address                 = anv_address_add(query_addr, 8);

         if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
            pc.CommandStreamerStallEnable = true;
      }
      break;
   }

   emit_query_pc_availability(cmd_buffer, query_addr, true);
   /* When multiview is active the spec requires that N consecutive query
    * indices are used, where N is the number of active views in the subpass.
    * The spec allows that we only write the results to one of the queries
    * but we still need to manage result availability for all the query
    * indices. Since we only emit a single query for all active views in the
    * first index, mark the other query indices as being already available
    * with result 0.
    */
   if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
      const uint32_t num_queries =
         util_bitcount(cmd_buffer->state.subpass->view_mask);
      if (num_queries > 1)
         emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1);
   }
}
#if GEN_GEN > 7 || GEN_IS_HASWELL

#if GEN_GEN >= 8 || GEN_IS_HASWELL

#define MI_PREDICATE_SRC0    0x2400
#define MI_PREDICATE_SRC1    0x2408
#define MI_PREDICATE_RESULT  0x2418
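
/* A note on the mechanism used below: MI_PREDICATE compares
 * MI_PREDICATE_SRC0 against MI_PREDICATE_SRC1 (COMPARE_SRCS_EQUAL) and
 * latches the outcome into MI_PREDICATE_RESULT; gen_mi_store_if() then emits
 * stores that the command streamer only executes while that result is set.
 */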
/**
 * Writes the results of a query to dst_addr if the value at poll_addr is
 * equal to the reference value.
 */
static void
gpu_write_query_result_cond(struct anv_cmd_buffer *cmd_buffer,
                            struct gen_mi_builder *b,
                            struct anv_address poll_addr,
                            struct anv_address dst_addr,
                            uint64_t ref_value,
                            VkQueryResultFlags flags,
                            uint32_t value_index,
                            struct gen_mi_value query_result)
{
   gen_mi_store(b, gen_mi_reg64(MI_PREDICATE_SRC0), gen_mi_mem64(poll_addr));
   gen_mi_store(b, gen_mi_reg64(MI_PREDICATE_SRC1), gen_mi_imm(ref_value));
   anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
      mip.LoadOperation    = LOAD_LOAD;
      mip.CombineOperation = COMBINE_SET;
      mip.CompareOperation = COMPARE_SRCS_EQUAL;
   }

   if (flags & VK_QUERY_RESULT_64_BIT) {
      struct anv_address res_addr = anv_address_add(dst_addr, value_index * 8);
      gen_mi_store_if(b, gen_mi_mem64(res_addr), query_result);
   } else {
      struct anv_address res_addr = anv_address_add(dst_addr, value_index * 4);
      gen_mi_store_if(b, gen_mi_mem32(res_addr), query_result);
   }
}

#endif /* GEN_GEN >= 8 || GEN_IS_HASWELL */
static void
gpu_write_query_result(struct gen_mi_builder *b,
                       struct anv_address dst_addr,
                       VkQueryResultFlags flags,
                       uint32_t value_index,
                       struct gen_mi_value query_result)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      struct anv_address res_addr = anv_address_add(dst_addr, value_index * 8);
      gen_mi_store(b, gen_mi_mem64(res_addr), query_result);
   } else {
      struct anv_address res_addr = anv_address_add(dst_addr, value_index * 4);
      gen_mi_store(b, gen_mi_mem32(res_addr), query_result);
   }
}
static struct gen_mi_value
compute_query_result(struct gen_mi_builder *b, struct anv_address addr)
{
   return gen_mi_isub(b, gen_mi_mem64(anv_address_add(addr, 8)),
                      gen_mi_mem64(anv_address_add(addr, 0)));
}
void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    VkBuffer                                    destBuffer,
    VkDeviceSize                                destOffset,
    VkDeviceSize                                destStride,
    VkQueryResultFlags                          flags)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);

   struct gen_mi_builder b;
   gen_mi_builder_init(&b, &cmd_buffer->batch);
   struct gen_mi_value result;
   /* If render target writes are ongoing, request a render target cache flush
    * to ensure proper ordering of the commands from the 3d pipe and the
    * command streamer.
    */
   if (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_RENDER_TARGET_BUFFER_WRITES) {
      cmd_buffer->state.pending_pipe_bits |=
         ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
   }

   if ((flags & VK_QUERY_RESULT_WAIT_BIT) ||
       (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_FLUSH_BITS) ||
       /* Occlusion & timestamp queries are written using a PIPE_CONTROL and
        * because we're about to copy values from MI commands, we need to
        * stall the command streamer to make sure the PIPE_CONTROL values have
        * landed, otherwise we could see inconsistent values & availability.
        *
        * From the vulkan spec:
        *
        *    "vkCmdCopyQueryPoolResults is guaranteed to see the effect of
        *    previous uses of vkCmdResetQueryPool in the same queue, without
        *    any additional synchronization."
        */
       pool->type == VK_QUERY_TYPE_OCCLUSION ||
       pool->type == VK_QUERY_TYPE_TIMESTAMP) {
      cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT;
      genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
   }
   struct anv_address dest_addr = anv_address_add(buffer->address, destOffset);
   for (uint32_t i = 0; i < queryCount; i++) {
      struct anv_address query_addr = anv_query_address(pool, firstQuery + i);
      uint32_t idx = 0;
      switch (pool->type) {
      case VK_QUERY_TYPE_OCCLUSION:
         result = compute_query_result(&b, anv_address_add(query_addr, 8));
#if GEN_GEN >= 8 || GEN_IS_HASWELL
         /* Like in the case of vkGetQueryPoolResults, if the query is
          * unavailable and the VK_QUERY_RESULT_PARTIAL_BIT flag is set,
          * conservatively write 0 as the query result. If the
          * VK_QUERY_RESULT_PARTIAL_BIT isn't set, don't write any value.
          */
         gpu_write_query_result_cond(cmd_buffer, &b, query_addr, dest_addr,
                                     1 /* available */, flags, idx, result);
         if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
            gpu_write_query_result_cond(cmd_buffer, &b, query_addr, dest_addr,
                                        0 /* unavailable */, flags, idx, gen_mi_imm(0));
         }
         idx++;
#else /* GEN_GEN < 8 && !GEN_IS_HASWELL */
         gpu_write_query_result(&b, dest_addr, flags, idx++, result);
#endif
         break;
      case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
         uint32_t statistics = pool->pipeline_statistics;
         while (statistics) {
            uint32_t stat = u_bit_scan(&statistics);

            result = compute_query_result(&b, anv_address_add(query_addr,
                                                              idx * 16 + 8));

            /* WaDividePSInvocationCountBy4:HSW,BDW */
            if ((cmd_buffer->device->info.gen == 8 ||
                 cmd_buffer->device->info.is_haswell) &&
                (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) {
               result = gen_mi_ushr32_imm(&b, result, 2);
            }

            gpu_write_query_result(&b, dest_addr, flags, idx++, result);
         }
         assert(idx == util_bitcount(pool->pipeline_statistics));
         break;
      }
      case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
         result = compute_query_result(&b, anv_address_add(query_addr, 8));
         gpu_write_query_result(&b, dest_addr, flags, idx++, result);
         result = compute_query_result(&b, anv_address_add(query_addr, 24));
         gpu_write_query_result(&b, dest_addr, flags, idx++, result);
         break;
      case VK_QUERY_TYPE_TIMESTAMP:
         result = gen_mi_mem64(anv_address_add(query_addr, 8));
         gpu_write_query_result(&b, dest_addr, flags, 0, result);
         break;

      default:
         unreachable("unhandled query type");
      }
      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         gpu_write_query_result(&b, dest_addr, flags, idx,
                                gen_mi_mem64(query_addr));
      }

      dest_addr = anv_address_add(dest_addr, destStride);
   }
}
#else
void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    VkBuffer                                    destBuffer,
    VkDeviceSize                                destOffset,
    VkDeviceSize                                destStride,
    VkQueryResultFlags                          flags)
{
   anv_finishme("Queries not yet supported on Ivy Bridge");
}
#endif