/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
#include "anv_private.h"

#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"

/* We reserve :
 *    - GPR 14 for perf queries
 *    - GPR 15 for conditional rendering
 */
#define GEN_MI_BUILDER_NUM_ALLOC_GPRS 14
#define GEN_MI_BUILDER_CAN_WRITE_BATCH GEN_GEN >= 8
#define __gen_get_batch_dwords anv_batch_emit_dwords
#define __gen_address_offset anv_address_add
#define __gen_get_batch_address(b, a) anv_address_physical(anv_batch_address(b, a))
#include "common/gen_mi_builder.h"
#include "perf/gen_perf.h"
#include "perf/gen_perf_mdapi.h"

#define OA_REPORT_N_UINT64 (256 / sizeof(uint64_t))
static struct anv_address
anv_query_address(struct anv_query_pool *pool, uint32_t query)
{
   return (struct anv_address) {
      .bo = pool->bo,
      .offset = query * pool->stride,
   };
}
VkResult genX(CreateQueryPool)(
    VkDevice                                    _device,
    const VkQueryPoolCreateInfo*                pCreateInfo,
    const VkAllocationCallbacks*                pAllocator,
    VkQueryPool*                                pQueryPool)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   const struct anv_physical_device *pdevice = device->physical;
   const VkQueryPoolPerformanceCreateInfoKHR *perf_query_info = NULL;
   struct anv_query_pool *pool;
   struct gen_perf_counter_pass *counter_pass;
   struct gen_perf_query_info **pass_query;
   ANV_MULTIALLOC(ma);
   VkResult result;

   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
   /* Query pool slots are made up of some number of 64-bit values packed
    * tightly together. For most query types, the first 64-bit value is the
    * "available" bit, which is 0 when the query is unavailable and 1 when
    * it is available. The 64-bit values that follow are determined by the
    * query type.
    *
    * For performance queries, we have a requirement to align OA reports at
    * 64 bytes, so we put those first and keep the "available" bit behind
    * them together with some other counters.
    */
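   /* As an illustration of the layout described above: an occlusion query
    * slot is 3 uint64s (availability, begin count, end count), so its
    * stride is 24 bytes, while a timestamp slot is 2 uint64s.
    */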
   uint32_t uint64s_per_slot = 0;
   UNUSED uint32_t n_passes = 0;

   anv_multialloc_add(&ma, &pool, 1);

   VkQueryPipelineStatisticFlags pipeline_statistics = 0;
   switch (pCreateInfo->queryType) {
   case VK_QUERY_TYPE_OCCLUSION:
      /* Occlusion queries have two values: begin and end. */
      uint64s_per_slot = 1 + 2;
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
      /* Timestamps just have the one timestamp value */
      uint64s_per_slot = 1 + 1;
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      pipeline_statistics = pCreateInfo->pipelineStatistics;
      /* We're going to trust this field implicitly so we need to ensure that
       * no unhandled extension bits leak in.
       */
      pipeline_statistics &= ANV_PIPELINE_STATISTICS_MASK;

      /* Statistics queries have a min and max for every statistic */
      uint64s_per_slot = 1 + 2 * util_bitcount(pipeline_statistics);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      /* Transform feedback queries are 4 values, begin/end for
       * primitives generated and primitives written.
       */
      uint64s_per_slot = 1 + 4;
      break;
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL:
      uint64s_per_slot = 72; /* 576 bytes, see layout below */
      break;
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
      perf_query_info = vk_find_struct_const(pCreateInfo->pNext,
                                             QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR);
      n_passes = gen_perf_get_n_passes(pdevice->perf,
                                       perf_query_info->pCounterIndices,
                                       perf_query_info->counterIndexCount,
                                       NULL);
      anv_multialloc_add(&ma, &counter_pass, perf_query_info->counterIndexCount);
      anv_multialloc_add(&ma, &pass_query, n_passes);
      STATIC_ASSERT(ANV_KHR_PERF_QUERY_SIZE % sizeof(uint64_t) == 0);
      uint64s_per_slot = (ANV_KHR_PERF_QUERY_SIZE / sizeof(uint64_t)) * n_passes;
      break;
   default:
      assert(!"Invalid query type");
   }
   if (!anv_multialloc_alloc2(&ma, &device->vk.alloc,
                              pAllocator,
                              VK_SYSTEM_ALLOCATION_SCOPE_OBJECT))
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
   vk_object_base_init(&device->vk, &pool->base, VK_OBJECT_TYPE_QUERY_POOL);
   pool->type = pCreateInfo->queryType;
   pool->pipeline_statistics = pipeline_statistics;
   pool->stride = uint64s_per_slot * sizeof(uint64_t);
   pool->slots = pCreateInfo->queryCount;
   if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
      pool->n_counters = perf_query_info->counterIndexCount;
      pool->counter_pass = counter_pass;
      gen_perf_get_counters_passes(pdevice->perf,
                                   perf_query_info->pCounterIndices,
                                   perf_query_info->counterIndexCount,
                                   pool->counter_pass);
      pool->n_passes = n_passes;
      pool->pass_query = pass_query;
      gen_perf_get_n_passes(pdevice->perf,
                            perf_query_info->pCounterIndices,
                            perf_query_info->counterIndexCount,
                            pool->pass_query);
   }
   uint32_t bo_flags = 0;
   if (pdevice->supports_48bit_addresses)
      bo_flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS;

   if (pdevice->use_softpin)
      bo_flags |= EXEC_OBJECT_PINNED;

   if (pdevice->has_exec_async)
      bo_flags |= EXEC_OBJECT_ASYNC;
   uint64_t size = pool->slots * pool->stride;
   result = anv_device_alloc_bo(device, size,
                                ANV_BO_ALLOC_MAPPED |
                                ANV_BO_ALLOC_SNOOPED,
                                0 /* explicit_address */,
                                &pool->bo);
   if (result != VK_SUCCESS)
      goto fail;
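   /* Each pass of a KHR performance query slot begins with a small batch
    * (see the layout comment below) that loads the pass's byte offset into
    * ANV_PERF_QUERY_OFFSET_REG. The begin/end code adds that register to
    * its write addresses, so commands recorded once can target a different
    * pass on each replay.
    */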
   if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
      for (uint32_t p = 0; p < pool->n_passes; p++) {
         struct gen_mi_builder b;
         struct anv_batch batch = {
            .start = pool->bo->map + ANV_KHR_PERF_QUERY_SIZE * p + 8,
            .end = pool->bo->map + ANV_KHR_PERF_QUERY_SIZE * p + 64,
         };
         batch.next = batch.start;

         gen_mi_builder_init(&b, &batch);
         gen_mi_store(&b, gen_mi_reg64(ANV_PERF_QUERY_OFFSET_REG),
                          gen_mi_imm(p * ANV_KHR_PERF_QUERY_SIZE));
         anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe);
         assert(batch.next <= (pool->bo->map + ANV_KHR_PERF_QUERY_SIZE * p + 64));
      }
   }
   *pQueryPool = anv_query_pool_to_handle(pool);

   return VK_SUCCESS;

 fail:
   vk_free2(&device->vk.alloc, pAllocator, pool);

   return result;
}
void genX(DestroyQueryPool)(
    VkDevice                                    _device,
    VkQueryPool                                 _pool,
    const VkAllocationCallbacks*                pAllocator)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_query_pool, pool, _pool);

   if (!pool)
      return;

   anv_device_release_bo(device, pool->bo);
   vk_object_base_finish(&pool->base);
   vk_free2(&device->vk.alloc, pAllocator, pool);
}
/**
 * VK_KHR_performance_query layout (576 bytes * number of passes) :
 *
 * -----------------------------------------
 * |       availability (8b)    |         |
 * |----------------------------|         |
 * |     Small batch loading    |         |
 * |  ANV_PERF_QUERY_OFFSET_REG |         |
 * |            (56b)           | Pass 0  |
 * |----------------------------|         |
 * |     begin MI_RPC (256b)    |         |
 * |----------------------------|         |
 * |      end MI_RPC (256b)     |         |
 * |----------------------------|--       | Query 0
 * |       availability (8b)    |         |
 * |----------------------------|         |
 * |     Small batch loading    |         |
 * |  ANV_PERF_QUERY_OFFSET_REG |         |
 * |            (56b)           | Pass 1  |
 * |----------------------------|         |
 * |     begin MI_RPC (256b)    |         |
 * |----------------------------|         |
 * |      end MI_RPC (256b)     |         |
 * |----------------------------|-----------
 * |       availability (8b)    |         |
 * |----------------------------|         |
 * |         Unused (48b)       |         |
 * |----------------------------| Pass 0  |
 * |     begin MI_RPC (256b)    |         |
 * |----------------------------|         | Query 1
 * |      end MI_RPC (256b)     |         |
 * |----------------------------|--       |
 * |             ...            |         |
 * -----------------------------------------
 */
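/* A worked example of the offsets encoded by the helpers below: with
 * ANV_KHR_PERF_QUERY_SIZE = 576 and a 2-pass pool, query 1 / pass 0 has its
 * availability slot at 1 * (2 * 576) + 0 * 576 = 1152, its begin OA report
 * at 1152 + 64 and its end OA report at 1152 + 64 + OA_SNAPSHOT_SIZE.
 */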
UNUSED static uint64_t
khr_perf_query_availability_offset(struct anv_query_pool *pool, uint32_t query, uint32_t pass)
{
   return query * (pool->n_passes * ANV_KHR_PERF_QUERY_SIZE) +
          pass * ANV_KHR_PERF_QUERY_SIZE;
}
UNUSED static uint64_t
khr_perf_query_oa_offset(struct anv_query_pool *pool, uint32_t query, uint32_t pass, bool end)
{
   return query * (pool->n_passes * ANV_KHR_PERF_QUERY_SIZE) +
          pass * ANV_KHR_PERF_QUERY_SIZE +
          64 + (end ? OA_SNAPSHOT_SIZE : 0);
}
UNUSED static struct anv_address
khr_perf_query_availability_address(struct anv_query_pool *pool, uint32_t query, uint32_t pass)
{
   return anv_address_add(
      (struct anv_address) { .bo = pool->bo, },
      khr_perf_query_availability_offset(pool, query, pass));
}
UNUSED static struct anv_address
khr_perf_query_oa_address(struct anv_query_pool *pool, uint32_t query, uint32_t pass, bool end)
{
   return anv_address_add(
      (struct anv_address) { .bo = pool->bo, },
      khr_perf_query_oa_offset(pool, query, pass, end));
}
/**
 * VK_INTEL_performance_query layout (576 bytes) :
 *
 * ------------------------------
 * |       availability (8b)    |
 * |----------------------------|
 * |         marker (8b)        |
 * |----------------------------|
 * | begin RPSTAT register (4b) |
 * |----------------------------|
 * |  end RPSTAT register (4b)  |
 * |----------------------------|
 * | begin perfcntr 1 & 2 (16b) |
 * |----------------------------|
 * |  end perfcntr 1 & 2 (16b)  |
 * |----------------------------|
 * |          Unused (8b)       |
 * |----------------------------|
 * |     begin MI_RPC (256b)    |
 * |----------------------------|
 * |       end MI_RPC (256b)    |
 * ------------------------------
 */
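/* The helper functions below encode the offsets of this layout: marker at
 * 8, RPSTAT snapshots at 16/20, perfcntr pairs at 24/40 and the MI_RPC
 * reports at 64/320.
 */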
static uint32_t
intel_perf_marker_offset(void)
{
   return 8;
}

static uint32_t
intel_perf_rpstart_offset(bool end)
{
   return 16 + (end ? sizeof(uint32_t) : 0);
}

#if GEN_GEN >= 8 && GEN_GEN <= 11
static uint32_t
intel_perf_counter(bool end)
{
   return 24 + (end ? (2 * sizeof(uint64_t)) : 0);
}
#endif

static uint32_t
intel_perf_mi_rpc_offset(bool end)
{
   return 64 + (end ? 256 : 0);
}
static void
cpu_write_query_result(void *dst_slot, VkQueryResultFlags flags,
                       uint32_t value_index, uint64_t result)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      uint64_t *dst64 = dst_slot;
      dst64[value_index] = result;
   } else {
      uint32_t *dst32 = dst_slot;
      dst32[value_index] = result;
   }
}
static void *
query_slot(struct anv_query_pool *pool, uint32_t query)
{
   return pool->bo->map + query * pool->stride;
}
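/* A KHR performance query is only available once the availability slot of
 * every pass has been written, so all passes are checked below.
 */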
static bool
query_is_available(struct anv_query_pool *pool, uint32_t query)
{
   if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
      for (uint32_t p = 0; p < pool->n_passes; p++) {
         volatile uint64_t *slot =
            pool->bo->map + khr_perf_query_availability_offset(pool, query, p);
         if (!slot[0])
            return false;
      }
      return true;
   }

   return *(volatile uint64_t *)query_slot(pool, query);
}
static VkResult
wait_for_available(struct anv_device *device,
                   struct anv_query_pool *pool, uint32_t query)
{
   uint64_t abs_timeout = anv_get_absolute_timeout(5 * NSEC_PER_SEC);

   while (anv_gettime_ns() < abs_timeout) {
      if (query_is_available(pool, query))
         return VK_SUCCESS;
      VkResult status = anv_device_query_status(device);
      if (status != VK_SUCCESS)
         return status;
   }

   return anv_device_set_lost(device, "query timeout");
}
VkResult genX(GetQueryPoolResults)(
    VkDevice                                    _device,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    size_t                                      dataSize,
    void*                                       pData,
    VkDeviceSize                                stride,
    VkQueryResultFlags                          flags)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   assert(pool->type == VK_QUERY_TYPE_OCCLUSION ||
          pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
          pool->type == VK_QUERY_TYPE_TIMESTAMP ||
          pool->type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT ||
          pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR ||
          pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL);

   if (anv_device_is_lost(device))
      return VK_ERROR_DEVICE_LOST;

   if (pData == NULL)
      return VK_SUCCESS;

   void *data_end = pData + dataSize;
   VkResult status = VK_SUCCESS;
   for (uint32_t i = 0; i < queryCount; i++) {
      bool available = query_is_available(pool, firstQuery + i);

      if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) {
         status = wait_for_available(device, pool, firstQuery + i);
         if (status != VK_SUCCESS)
            return status;

         available = true;
      }

      /* From the Vulkan 1.0.42 spec:
       *
       *    "If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
       *    both not set then no result values are written to pData for
       *    queries that are in the unavailable state at the time of the call,
       *    and vkGetQueryPoolResults returns VK_NOT_READY. However,
       *    availability state is still written to pData for those queries if
       *    VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set."
       *
       * From VK_KHR_performance_query :
       *
       *    "VK_QUERY_RESULT_PERFORMANCE_QUERY_RECORDED_COUNTERS_BIT_KHR specifies
       *     that the result should contain the number of counters that were recorded
       *     into a query pool of type ename:VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR"
       */
      bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT);

      uint32_t idx = 0;
      switch (pool->type) {
      case VK_QUERY_TYPE_OCCLUSION: {
         uint64_t *slot = query_slot(pool, firstQuery + i);
         if (write_results) {
            /* From the Vulkan 1.2.132 spec:
             *
             *    "If VK_QUERY_RESULT_PARTIAL_BIT is set,
             *    VK_QUERY_RESULT_WAIT_BIT is not set, and the query’s status
             *    is unavailable, an intermediate result value between zero and
             *    the final result value is written to pData for that query."
             */
            uint64_t result = available ? slot[2] - slot[1] : 0;
            cpu_write_query_result(pData, flags, idx, result);
         }
         idx++;
         break;
      }
      case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
         uint64_t *slot = query_slot(pool, firstQuery + i);
         uint32_t statistics = pool->pipeline_statistics;
         while (statistics) {
            uint32_t stat = u_bit_scan(&statistics);
            if (write_results) {
               uint64_t result = slot[idx * 2 + 2] - slot[idx * 2 + 1];

               /* WaDividePSInvocationCountBy4:HSW,BDW */
               if ((device->info.gen == 8 || device->info.is_haswell) &&
                   (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT)
                  result >>= 2;

               cpu_write_query_result(pData, flags, idx, result);
            }
            idx++;
         }
         assert(idx == util_bitcount(pool->pipeline_statistics));
         break;
      }
      case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
         uint64_t *slot = query_slot(pool, firstQuery + i);
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]);
         idx++;
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[4] - slot[3]);
         idx++;
         break;
      }
      case VK_QUERY_TYPE_TIMESTAMP: {
         uint64_t *slot = query_slot(pool, firstQuery + i);
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[1]);
         idx++;
         break;
      }
      case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
         const struct anv_physical_device *pdevice = device->physical;
         assert((flags & (VK_QUERY_RESULT_WITH_AVAILABILITY_BIT |
                          VK_QUERY_RESULT_PARTIAL_BIT)) == 0);
         for (uint32_t p = 0; p < pool->n_passes; p++) {
            const uint32_t *begin = pool->bo->map + khr_perf_query_oa_offset(pool, firstQuery + i, p, false);
            const uint32_t *end = pool->bo->map + khr_perf_query_oa_offset(pool, firstQuery + i, p, true);
            struct gen_perf_query_result result;
            gen_perf_query_result_clear(&result);
            gen_perf_query_result_accumulate(&result, pool->pass_query[p], begin, end);
            anv_perf_write_pass_results(pdevice->perf, pool, p, &result, pData);
         }
         break;
      }
      case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
         if (!write_results)
            break;
         const void *query_data = query_slot(pool, firstQuery + i);
         const uint32_t *oa_begin = query_data + intel_perf_mi_rpc_offset(false);
         const uint32_t *oa_end = query_data + intel_perf_mi_rpc_offset(true);
         const uint32_t *rpstat_begin = query_data + intel_perf_rpstart_offset(false);
         const uint32_t *rpstat_end = query_data + intel_perf_rpstart_offset(true);
         struct gen_perf_query_result result;
         uint32_t core_freq[2];
#if GEN_GEN < 8
         core_freq[0] = ((*rpstat_begin >> 7) & 0x7f) * 1000000ULL;
         core_freq[1] = ((*rpstat_end >> 7) & 0x7f) * 1000000ULL;
#else
         core_freq[0] = ((*rpstat_begin >> 23) & 0x1ff) * 1000000ULL;
         core_freq[1] = ((*rpstat_end >> 23) & 0x1ff) * 1000000ULL;
#endif
         gen_perf_query_result_clear(&result);
         gen_perf_query_result_accumulate(&result, &device->physical->perf->queries[0],
                                          oa_begin, oa_end);
         gen_perf_query_result_read_frequencies(&result, &device->info,
                                                oa_begin, oa_end);
         gen_perf_query_result_write_mdapi(pData, stride,
                                           &device->info,
                                           &result,
                                           core_freq[0], core_freq[1]);
#if GEN_GEN >= 8 && GEN_GEN <= 11
         gen_perf_query_mdapi_write_perfcntr(pData, stride, &device->info,
                                             query_data + intel_perf_counter(false),
                                             query_data + intel_perf_counter(true));
#endif
         const uint64_t *marker = query_data + intel_perf_marker_offset();
         gen_perf_query_mdapi_write_marker(pData, stride, &device->info, *marker);
         break;
      }
      default:
         unreachable("invalid pool type");
      }

      if (!write_results)
         status = VK_NOT_READY;

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
         cpu_write_query_result(pData, flags, idx, available);

      pData += stride;
      if (pData >= data_end)
         break;
   }

   return status;
}
static void
emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer,
                    struct anv_address addr)
{
   cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType  = DAT_PPGTT;
      pc.PostSyncOperation       = WritePSDepthCount;
      pc.DepthStallEnable        = true;
      pc.Address                 = addr;

      if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
         pc.CommandStreamerStallEnable = true;
   }
}
static void
emit_query_mi_availability(struct gen_mi_builder *b,
                           struct anv_address addr,
                           bool available)
{
   gen_mi_store(b, gen_mi_mem64(addr), gen_mi_imm(available));
}
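/* Occlusion and timestamp results are written from the 3D pipe with
 * PIPE_CONTROL post-sync writes, so their availability is written the same
 * way (the PIPE_CONTROL variant below) to keep the two writes ordered
 * without extra synchronization; MI-based queries use the plain MI store
 * above instead.
 */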
static void
emit_query_pc_availability(struct anv_cmd_buffer *cmd_buffer,
                           struct anv_address addr,
                           bool available)
{
   cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType  = DAT_PPGTT;
      pc.PostSyncOperation       = WriteImmediateData;
      pc.Address                 = addr;
      pc.ImmediateData           = available;
   }
}
/**
 * Goes through a series of consecutive query indices in the given pool
 * setting all element values to 0 and emitting them as available.
 */
static void
emit_zero_queries(struct anv_cmd_buffer *cmd_buffer,
                  struct gen_mi_builder *b, struct anv_query_pool *pool,
                  uint32_t first_index, uint32_t num_queries)
{
   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
      /* These queries are written with a PIPE_CONTROL so clear them using the
       * PIPE_CONTROL as well so we don't have to synchronize between 2 types
       * of operations.
       */
      assert((pool->stride % 8) == 0);
      for (uint32_t i = 0; i < num_queries; i++) {
         struct anv_address slot_addr =
            anv_query_address(pool, first_index + i);

         for (uint32_t qword = 1; qword < (pool->stride / 8); qword++) {
            emit_query_pc_availability(cmd_buffer,
                                       anv_address_add(slot_addr, qword * 8),
                                       false);
         }
         emit_query_pc_availability(cmd_buffer, slot_addr, true);
      }
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      for (uint32_t i = 0; i < num_queries; i++) {
         struct anv_address slot_addr =
            anv_query_address(pool, first_index + i);
         gen_mi_memset(b, anv_address_add(slot_addr, 8), 0, pool->stride - 8);
         emit_query_mi_availability(b, slot_addr, true);
      }
      break;

   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
      for (uint32_t i = 0; i < num_queries; i++) {
         for (uint32_t p = 0; p < pool->n_passes; p++) {
            gen_mi_memset(b,
                          khr_perf_query_oa_address(pool,
                                                    first_index + i, p, false),
                          0, 2 * OA_SNAPSHOT_SIZE);
            emit_query_mi_availability(b,
                                       khr_perf_query_availability_address(pool, first_index + i, p),
                                       true);
         }
      }
      break;
   }

   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL:
      for (uint32_t i = 0; i < num_queries; i++) {
         struct anv_address slot_addr =
            anv_query_address(pool, first_index + i);
         gen_mi_memset(b, anv_address_add(slot_addr, 8), 0, pool->stride - 8);
         emit_query_mi_availability(b, slot_addr, true);
      }
      break;

   default:
      unreachable("Unsupported query type");
   }
}
void genX(CmdResetQueryPool)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
      for (uint32_t i = 0; i < queryCount; i++) {
         emit_query_pc_availability(cmd_buffer,
                                    anv_query_address(pool, firstQuery + i),
                                    false);
      }
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
      struct gen_mi_builder b;
      gen_mi_builder_init(&b, &cmd_buffer->batch);

      for (uint32_t i = 0; i < queryCount; i++)
         emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), false);
      break;
   }

   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
      struct gen_mi_builder b;
      gen_mi_builder_init(&b, &cmd_buffer->batch);

      for (uint32_t i = 0; i < queryCount; i++) {
         for (uint32_t p = 0; p < pool->n_passes; p++) {
            emit_query_mi_availability(
               &b,
               khr_perf_query_availability_address(pool, firstQuery + i, p),
               false);
         }
      }
      break;
   }

   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
      struct gen_mi_builder b;
      gen_mi_builder_init(&b, &cmd_buffer->batch);

      for (uint32_t i = 0; i < queryCount; i++)
         emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), false);
      break;
   }

   default:
      unreachable("Unsupported query type");
   }
}
void genX(ResetQueryPool)(
    VkDevice                                    _device,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount)
{
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   for (uint32_t i = 0; i < queryCount; i++) {
      if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
         for (uint32_t p = 0; p < pool->n_passes; p++) {
            uint64_t *pass_slot = pool->bo->map +
               khr_perf_query_availability_offset(pool, firstQuery + i, p);
            *pass_slot = 0;
         }
      } else {
         uint64_t *slot = query_slot(pool, firstQuery + i);
         *slot = 0;
      }
   }
}
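/* Indexed by the bit position of each VkQueryPipelineStatisticFlagBits
 * flag; the STATIC_ASSERT in emit_pipeline_stat() below checks that the
 * table covers exactly ANV_PIPELINE_STATISTICS_MASK.
 */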
static const uint32_t vk_pipeline_stat_to_reg[] = {
   GENX(IA_VERTICES_COUNT_num),
   GENX(IA_PRIMITIVES_COUNT_num),
   GENX(VS_INVOCATION_COUNT_num),
   GENX(GS_INVOCATION_COUNT_num),
   GENX(GS_PRIMITIVES_COUNT_num),
   GENX(CL_INVOCATION_COUNT_num),
   GENX(CL_PRIMITIVES_COUNT_num),
   GENX(PS_INVOCATION_COUNT_num),
   GENX(HS_INVOCATION_COUNT_num),
   GENX(DS_INVOCATION_COUNT_num),
   GENX(CS_INVOCATION_COUNT_num),
};
static void
emit_pipeline_stat(struct gen_mi_builder *b, uint32_t stat,
                   struct anv_address addr)
{
   STATIC_ASSERT(ANV_PIPELINE_STATISTICS_MASK ==
                 (1 << ARRAY_SIZE(vk_pipeline_stat_to_reg)) - 1);

   assert(stat < ARRAY_SIZE(vk_pipeline_stat_to_reg));
   gen_mi_store(b, gen_mi_mem64(addr),
                gen_mi_reg64(vk_pipeline_stat_to_reg[stat]));
}
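/* Transform feedback results are read from the per-stream SO counter
 * registers; each stream's 64-bit registers sit 8 bytes after the previous
 * stream's, hence the stream * 8 offsets below.
 */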
static void
emit_xfb_query(struct gen_mi_builder *b, uint32_t stream,
               struct anv_address addr)
{
   assert(stream < MAX_XFB_STREAMS);

   gen_mi_store(b, gen_mi_mem64(anv_address_add(addr, 0)),
                gen_mi_reg64(GENX(SO_NUM_PRIMS_WRITTEN0_num) + stream * 8));
   gen_mi_store(b, gen_mi_mem64(anv_address_add(addr, 16)),
                gen_mi_reg64(GENX(SO_PRIM_STORAGE_NEEDED0_num) + stream * 8));
}
void genX(CmdBeginQuery)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query,
    VkQueryControlFlags                         flags)
{
   genX(CmdBeginQueryIndexedEXT)(commandBuffer, queryPool, query, flags, 0);
}
void genX(CmdBeginQueryIndexedEXT)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query,
    VkQueryControlFlags                         flags,
    uint32_t                                    index)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   struct anv_address query_addr = anv_query_address(pool, query);

   struct gen_mi_builder b;
   gen_mi_builder_init(&b, &cmd_buffer->batch);
   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 8));
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      /* TODO: This might only be necessary for certain stats */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      uint32_t statistics = pool->pipeline_statistics;
      uint32_t offset = 8;
      while (statistics) {
         uint32_t stat = u_bit_scan(&statistics);
         emit_pipeline_stat(&b, stat, anv_address_add(query_addr, offset));
         offset += 16;
      }
      break;
   }

   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
      emit_xfb_query(&b, index, anv_address_add(query_addr, 8));
      break;

   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
      cmd_buffer->perf_query_pool = pool;
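      /* Pass 0's OA report address is baked into the MI_RPC packet below;
       * other passes need a different one. Compute it as the pass-0 address
       * plus ANV_PERF_QUERY_OFFSET_REG (loaded by the per-pass preamble
       * batch), store it over the packet's MemoryAddress field and emit a
       * barrier so the command streamer picks up the patched value.
       */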
      /* We know the bottom bits of the address are 0s which match what we
       * want in the MI_RPC packet.
       */
      struct gen_mi_value mi_rpc_write_offset =
         gen_mi_iadd(
            &b,
            gen_mi_imm(
               gen_canonical_address(
                  pool->bo->offset +
                  khr_perf_query_oa_offset(pool, query, 0 /* pass */, false))),
            gen_mi_reg64(ANV_PERF_QUERY_OFFSET_REG));
      struct gen_mi_address_token mi_rpc_addr_dest =
         gen_mi_store_address(&b, mi_rpc_write_offset);
      gen_mi_self_mod_barrier(&b);

      void *mi_rpc_dws =
         anv_batch_emitn(&cmd_buffer->batch,
                         GENX(MI_REPORT_PERF_COUNT_length),
                         GENX(MI_REPORT_PERF_COUNT),
                         .MemoryAddress = query_addr /* Will be overwritten */ );
      _gen_mi_resolve_address_token(&b, mi_rpc_addr_dest,
                                    mi_rpc_dws +
                                    GENX(MI_REPORT_PERF_COUNT_MemoryAddress_start) / 8);
      break;
   }
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_REPORT_PERF_COUNT), rpc) {
         rpc.MemoryAddress =
            anv_address_add(query_addr, intel_perf_mi_rpc_offset(false));
      }
#if GEN_GEN < 9
      gen_mi_store(&b,
                   gen_mi_mem32(anv_address_add(query_addr,
                                                intel_perf_rpstart_offset(false))),
                   gen_mi_reg32(GENX(RPSTAT1_num)));
#else
      gen_mi_store(&b,
                   gen_mi_mem32(anv_address_add(query_addr,
                                                intel_perf_rpstart_offset(false))),
                   gen_mi_reg32(GENX(RPSTAT0_num)));
#endif
#if GEN_GEN >= 8 && GEN_GEN <= 11
      gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr,
                                                    intel_perf_counter(false))),
                   gen_mi_reg64(GENX(PERFCNT1_num)));
      gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr,
                                                    intel_perf_counter(false) + 8)),
                   gen_mi_reg64(GENX(PERFCNT2_num)));
#endif
      break;
   }

   default:
      unreachable("");
   }
}
void genX(CmdEndQuery)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query)
{
   genX(CmdEndQueryIndexedEXT)(commandBuffer, queryPool, query, 0);
}
void genX(CmdEndQueryIndexedEXT)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query,
    uint32_t                                    index)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   struct anv_address query_addr = anv_query_address(pool, query);

   struct gen_mi_builder b;
   gen_mi_builder_init(&b, &cmd_buffer->batch);
   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 16));
      emit_query_pc_availability(cmd_buffer, query_addr, true);
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      /* TODO: This might only be necessary for certain stats */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      uint32_t statistics = pool->pipeline_statistics;
      uint32_t offset = 16;
      while (statistics) {
         uint32_t stat = u_bit_scan(&statistics);
         emit_pipeline_stat(&b, stat, anv_address_add(query_addr, offset));
         offset += 16;
      }

      emit_query_mi_availability(&b, query_addr, true);
      break;
   }

   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      emit_xfb_query(&b, index, anv_address_add(query_addr, 16));
      emit_query_mi_availability(&b, query_addr, true);
      break;

   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
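      /* Same self-modifying-batch scheme as in CmdBeginQueryIndexedEXT,
       * except that two destinations get patched here: the end MI_RPC
       * report address and the availability MI_STORE_DATA_IMM address.
       */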
      /* We know the bottom bits of the address are 0s which match what we
       * want in the MI_RPC/MI_SDI packets.
       */
      struct gen_mi_value mi_rpc_write_offset =
         gen_mi_iadd(
            &b,
            gen_mi_imm(
               gen_canonical_address(
                  pool->bo->offset +
                  khr_perf_query_oa_offset(pool, query, 0 /* pass*/, true))),
            gen_mi_reg64(ANV_PERF_QUERY_OFFSET_REG));
      struct gen_mi_value availability_write_offset =
         gen_mi_iadd(
            &b,
            gen_mi_imm(
               gen_canonical_address(
                  pool->bo->offset +
                  khr_perf_query_availability_offset(pool, query, 0 /* pass */))),
            gen_mi_reg64(ANV_PERF_QUERY_OFFSET_REG));

      struct gen_mi_address_token mi_rpc_addr_dest =
         gen_mi_store_address(&b, mi_rpc_write_offset);
      struct gen_mi_address_token availability_addr_dest =
         gen_mi_store_address(&b, availability_write_offset);
      gen_mi_self_mod_barrier(&b);

      void *mi_rpc_dws =
         anv_batch_emitn(&cmd_buffer->batch,
                         GENX(MI_REPORT_PERF_COUNT_length),
                         GENX(MI_REPORT_PERF_COUNT),
                         .MemoryAddress = query_addr /* Will be overwritten */ );
      _gen_mi_resolve_address_token(&b, mi_rpc_addr_dest,
                                    mi_rpc_dws +
                                    GENX(MI_REPORT_PERF_COUNT_MemoryAddress_start) / 8);

      void *availability_dws =
         anv_batch_emitn(&cmd_buffer->batch,
                         GENX(MI_STORE_DATA_IMM_length),
                         GENX(MI_STORE_DATA_IMM),
                         .ImmediateData = true);
      _gen_mi_resolve_address_token(&b, availability_addr_dest,
                                    availability_dws +
                                    GENX(MI_STORE_DATA_IMM_Address_start) / 8);
      break;
   }
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
      uint32_t marker_offset = intel_perf_marker_offset();
      gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, marker_offset)),
                       gen_mi_imm(cmd_buffer->intel_perf_marker));
#if GEN_GEN >= 8 && GEN_GEN <= 11
      gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, intel_perf_counter(true))),
                       gen_mi_reg64(GENX(PERFCNT1_num)));
      gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, intel_perf_counter(true) + 8)),
                       gen_mi_reg64(GENX(PERFCNT2_num)));
#endif
#if GEN_GEN < 9
      gen_mi_store(&b,
                   gen_mi_mem32(anv_address_add(query_addr,
                                                intel_perf_rpstart_offset(true))),
                   gen_mi_reg32(GENX(RPSTAT1_num)));
#else
      gen_mi_store(&b,
                   gen_mi_mem32(anv_address_add(query_addr,
                                                intel_perf_rpstart_offset(true))),
                   gen_mi_reg32(GENX(RPSTAT0_num)));
#endif
      /* Position the last OA snapshot at the beginning of the query so that
       * we can tell whether it's ready.
       */
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_REPORT_PERF_COUNT), rpc) {
         rpc.MemoryAddress = anv_address_add(query_addr,
                                             intel_perf_mi_rpc_offset(true));
         rpc.ReportID = 0xdeadbeef; /* This goes in the first dword */
      }
      emit_query_mi_availability(&b, query_addr, true);
      break;
   }

   default:
      unreachable("");
   }
   /* When multiview is active the spec requires that N consecutive query
    * indices are used, where N is the number of active views in the subpass.
    * The spec allows us to write the results to only one of the queries,
    * but we still need to manage result availability for all the query indices.
    * Since we only emit a single query for all active views in the
    * first index, mark the other query indices as being already available
    * with result 0.
    */
   if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
      const uint32_t num_queries =
         util_bitcount(cmd_buffer->state.subpass->view_mask);
      if (num_queries > 1)
         emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1);
   }
}
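/* MMIO address of the command streamer timestamp register. */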
#define TIMESTAMP 0x2358
void genX(CmdWriteTimestamp)(
    VkCommandBuffer                             commandBuffer,
    VkPipelineStageFlagBits                     pipelineStage,
    VkQueryPool                                 queryPool,
    uint32_t                                    query)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   struct anv_address query_addr = anv_query_address(pool, query);

   assert(pool->type == VK_QUERY_TYPE_TIMESTAMP);

   struct gen_mi_builder b;
   gen_mi_builder_init(&b, &cmd_buffer->batch);

   switch (pipelineStage) {
   case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
      gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, 8)),
                       gen_mi_reg64(TIMESTAMP));
      break;

   default:
      /* Everything else is bottom-of-pipe */
      cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
      genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.DestinationAddressType  = DAT_PPGTT;
         pc.PostSyncOperation       = WriteTimestamp;
         pc.Address                 = anv_address_add(query_addr, 8);

         if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
            pc.CommandStreamerStallEnable = true;
      }
      break;
   }

   emit_query_pc_availability(cmd_buffer, query_addr, true);
   /* When multiview is active the spec requires that N consecutive query
    * indices are used, where N is the number of active views in the subpass.
    * The spec allows us to write the results to only one of the queries,
    * but we still need to manage result availability for all the query indices.
    * Since we only emit a single query for all active views in the
    * first index, mark the other query indices as being already available
    * with result 0.
    */
   if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
      const uint32_t num_queries =
         util_bitcount(cmd_buffer->state.subpass->view_mask);
      if (num_queries > 1)
         emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1);
   }
}
#if GEN_GEN > 7 || GEN_IS_HASWELL

#if GEN_GEN >= 8 || GEN_IS_HASWELL

#define MI_PREDICATE_SRC0    0x2400
#define MI_PREDICATE_SRC1    0x2408
#define MI_PREDICATE_RESULT  0x2418
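/* MI_PREDICATE compares SRC0 against SRC1 and latches the outcome in
 * MI_PREDICATE_RESULT; the predicated stores emitted by gen_mi_store_if()
 * below then only execute when the comparison matched.
 */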
/**
 * Writes the results of a query to dst_addr if the value at poll_addr is
 * equal to the reference value.
 */
static void
gpu_write_query_result_cond(struct anv_cmd_buffer *cmd_buffer,
                            struct gen_mi_builder *b,
                            struct anv_address poll_addr,
                            struct anv_address dst_addr,
                            uint64_t ref_value,
                            VkQueryResultFlags flags,
                            uint32_t value_index,
                            struct gen_mi_value query_result)
{
   gen_mi_store(b, gen_mi_reg64(MI_PREDICATE_SRC0), gen_mi_mem64(poll_addr));
   gen_mi_store(b, gen_mi_reg64(MI_PREDICATE_SRC1), gen_mi_imm(ref_value));
   anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
      mip.LoadOperation    = LOAD_LOAD;
      mip.CombineOperation = COMBINE_SET;
      mip.CompareOperation = COMPARE_SRCS_EQUAL;
   }

   if (flags & VK_QUERY_RESULT_64_BIT) {
      struct anv_address res_addr = anv_address_add(dst_addr, value_index * 8);
      gen_mi_store_if(b, gen_mi_mem64(res_addr), query_result);
   } else {
      struct anv_address res_addr = anv_address_add(dst_addr, value_index * 4);
      gen_mi_store_if(b, gen_mi_mem32(res_addr), query_result);
   }
}

#endif /* GEN_GEN >= 8 || GEN_IS_HASWELL */
static void
gpu_write_query_result(struct gen_mi_builder *b,
                       struct anv_address dst_addr,
                       VkQueryResultFlags flags,
                       uint32_t value_index,
                       struct gen_mi_value query_result)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      struct anv_address res_addr = anv_address_add(dst_addr, value_index * 8);
      gen_mi_store(b, gen_mi_mem64(res_addr), query_result);
   } else {
      struct anv_address res_addr = anv_address_add(dst_addr, value_index * 4);
      gen_mi_store(b, gen_mi_mem32(res_addr), query_result);
   }
}
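/* Returns end - begin for a query value pair, with begin stored at addr
 * and end at addr + 8; the subtraction runs on the command streamer's MI
 * ALU.
 */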
static struct gen_mi_value
compute_query_result(struct gen_mi_builder *b, struct anv_address addr)
{
   return gen_mi_isub(b, gen_mi_mem64(anv_address_add(addr, 8)),
                         gen_mi_mem64(anv_address_add(addr, 0)));
}
void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    VkBuffer                                    destBuffer,
    VkDeviceSize                                destOffset,
    VkDeviceSize                                destStride,
    VkQueryResultFlags                          flags)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);

   struct gen_mi_builder b;
   gen_mi_builder_init(&b, &cmd_buffer->batch);
   struct gen_mi_value result;

   /* If render target writes are ongoing, request a render target cache flush
    * to ensure proper ordering of the commands from the 3d pipe and the
    * command streamer.
    */
   if (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_RENDER_TARGET_BUFFER_WRITES) {
      cmd_buffer->state.pending_pipe_bits |=
         ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
   }

   if ((flags & VK_QUERY_RESULT_WAIT_BIT) ||
       (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_FLUSH_BITS) ||
       /* Occlusion & timestamp queries are written using a PIPE_CONTROL and
        * because we're about to copy values from MI commands, we need to
        * stall the command streamer to make sure the PIPE_CONTROL values have
        * landed, otherwise we could see inconsistent values & availability.
        *
        * From the vulkan spec:
        *
        *    "vkCmdCopyQueryPoolResults is guaranteed to see the effect of
        *    previous uses of vkCmdResetQueryPool in the same queue, without
        *    any additional synchronization."
        */
       pool->type == VK_QUERY_TYPE_OCCLUSION ||
       pool->type == VK_QUERY_TYPE_TIMESTAMP) {
      cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT;
      genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
   }
= anv_address_add(buffer
->address
, destOffset
);
1274 for (uint32_t i
= 0; i
< queryCount
; i
++) {
1275 struct anv_address query_addr
= anv_query_address(pool
, firstQuery
+ i
);
1277 switch (pool
->type
) {
1278 case VK_QUERY_TYPE_OCCLUSION
:
1279 result
= compute_query_result(&b
, anv_address_add(query_addr
, 8));
1280 #if GEN_GEN >= 8 || GEN_IS_HASWELL
1281 /* Like in the case of vkGetQueryPoolResults, if the query is
1282 * unavailable and the VK_QUERY_RESULT_PARTIAL_BIT flag is set,
1283 * conservatively write 0 as the query result. If the
1284 * VK_QUERY_RESULT_PARTIAL_BIT isn't set, don't write any value.
1286 gpu_write_query_result_cond(cmd_buffer
, &b
, query_addr
, dest_addr
,
1287 1 /* available */, flags
, idx
, result
);
1288 if (flags
& VK_QUERY_RESULT_PARTIAL_BIT
) {
1289 gpu_write_query_result_cond(cmd_buffer
, &b
, query_addr
, dest_addr
,
1290 0 /* unavailable */, flags
, idx
, gen_mi_imm(0));
1293 #else /* GEN_GEN < 8 && !GEN_IS_HASWELL */
1294 gpu_write_query_result(&b
, dest_addr
, flags
, idx
++, result
);
      case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
         uint32_t statistics = pool->pipeline_statistics;
         while (statistics) {
            uint32_t stat = u_bit_scan(&statistics);

            result = compute_query_result(&b, anv_address_add(query_addr,
                                                              idx * 16 + 8));

            /* WaDividePSInvocationCountBy4:HSW,BDW */
            if ((cmd_buffer->device->info.gen == 8 ||
                 cmd_buffer->device->info.is_haswell) &&
                (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) {
               result = gen_mi_ushr32_imm(&b, result, 2);
            }

            gpu_write_query_result(&b, dest_addr, flags, idx++, result);
         }
         assert(idx == util_bitcount(pool->pipeline_statistics));
         break;
      }
      case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
         result = compute_query_result(&b, anv_address_add(query_addr, 8));
         gpu_write_query_result(&b, dest_addr, flags, idx++, result);
         result = compute_query_result(&b, anv_address_add(query_addr, 24));
         gpu_write_query_result(&b, dest_addr, flags, idx++, result);
         break;

      case VK_QUERY_TYPE_TIMESTAMP:
         result = gen_mi_mem64(anv_address_add(query_addr, 8));
         gpu_write_query_result(&b, dest_addr, flags, idx++, result);
         break;

      case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
         unreachable("Copy KHR performance query results not implemented");
         break;

      default:
         unreachable("unhandled query type");
      }
      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         gpu_write_query_result(&b, dest_addr, flags, idx,
                                gen_mi_mem64(query_addr));
      }

      dest_addr = anv_address_add(dest_addr, destStride);
   }
}

#else
void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    VkBuffer                                    destBuffer,
    VkDeviceSize                                destOffset,
    VkDeviceSize                                destStride,
    VkQueryResultFlags                          flags)
{
   anv_finishme("Queries not yet supported on Ivy Bridge");
}

#endif /* GEN_GEN > 7 || GEN_IS_HASWELL */