/*
 * Copyright © 2016 Red Hat Inc.
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */
#include "tu_private.h"

#include "adreno_pm4.xml.h"
#include "adreno_common.xml.h"

#include "nir/nir_builder.h"
#include "util/os_time.h"

#define NSEC_PER_SEC 1000000000ull
#define WAIT_TIMEOUT 5
struct PACKED query_slot {
   uint64_t available;
};

struct PACKED occlusion_slot_value {
   /* Seems sample counters are placed to be 16-byte aligned
    * even though this query needs an 8-byte slot. */
   uint64_t value;
   uint64_t _padding;
};

struct PACKED occlusion_query_slot {
   struct query_slot common;
   uint64_t result;

   struct occlusion_slot_value begin;
   struct occlusion_slot_value end;
};

struct PACKED timestamp_query_slot {
   struct query_slot common;
   uint64_t result;
};

struct PACKED primitive_slot_value {
   uint64_t values[2];
};

struct PACKED primitive_query_slot {
   struct query_slot common;
   /* The result of transform feedback queries is two integer values:
    *   results[0] is the count of primitives written,
    *   results[1] is the count of primitives generated.
    * Also, a result for each stream is stored in the following 4 slots
    * respectively. */
   uint64_t results[2];

   /* Primitive counters also need to be 16-byte aligned. */
   uint64_t _padding;

   struct primitive_slot_value begin[4];
   struct primitive_slot_value end[4];
};
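
/* For reference (a sketch derived from the PACKED layouts above): in a
 * primitive_query_slot, results[] starts at offset 8, _padding at 24, and
 * begin[]/end[] at offsets 32 and 96 respectively, so the per-stream
 * counters keep the 16-byte alignment noted above and the whole slot is
 * 160 bytes. */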
/* Returns the IOVA of a given uint64_t field in a given slot of a query
 * pool. */
#define query_iova(type, pool, query, field)                          \
   pool->bo.iova + pool->stride * (query) + offsetof(type, field)

#define occlusion_query_iova(pool, query, field)                      \
   query_iova(struct occlusion_query_slot, pool, query, field)

#define primitive_query_iova(pool, query, field, i)                   \
   query_iova(struct primitive_query_slot, pool, query, field) +      \
   offsetof(struct primitive_slot_value, values[i])

#define query_available_iova(pool, query)                             \
   query_iova(struct query_slot, pool, query, available)

#define query_result_iova(pool, query, i)                             \
   pool->bo.iova + pool->stride * (query) +                           \
   sizeof(struct query_slot) + sizeof(uint64_t) * i

#define query_result_addr(pool, query, i)                             \
   pool->bo.map + pool->stride * query +                              \
   sizeof(struct query_slot) + sizeof(uint64_t) * i

#define query_is_available(slot) slot->available
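
/* For illustration: occlusion_query_iova(pool, 2, end) expands to
 * pool->bo.iova + pool->stride * 2 + offsetof(struct occlusion_query_slot,
 * end), while query_result_iova(pool, 2, 0) skips just past the common
 * header of slot 2 to its first result word. */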
/*
 * Returns a pointer to a given slot in a query pool.
 */
static void* slot_address(struct tu_query_pool *pool, uint32_t query)
{
   return (char*)pool->bo.map + query * pool->stride;
}
VkResult
tu_CreateQueryPool(VkDevice _device,
                   const VkQueryPoolCreateInfo *pCreateInfo,
                   const VkAllocationCallbacks *pAllocator,
                   VkQueryPool *pQueryPool)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
   assert(pCreateInfo->queryCount > 0);

   uint32_t slot_size;
   switch (pCreateInfo->queryType) {
   case VK_QUERY_TYPE_OCCLUSION:
      slot_size = sizeof(struct occlusion_query_slot);
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
      slot_size = sizeof(struct timestamp_query_slot);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      slot_size = sizeof(struct primitive_query_slot);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   struct tu_query_pool *pool =
      vk_object_alloc(&device->vk, pAllocator, sizeof(*pool),
                      VK_OBJECT_TYPE_QUERY_POOL);
   if (!pool)
      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

   VkResult result = tu_bo_init_new(device, &pool->bo,
                                    pCreateInfo->queryCount * slot_size);
   if (result != VK_SUCCESS) {
      vk_object_free(&device->vk, pAllocator, pool);
      return result;
   }

   result = tu_bo_map(device, &pool->bo);
   if (result != VK_SUCCESS) {
      tu_bo_finish(device, &pool->bo);
      vk_object_free(&device->vk, pAllocator, pool);
      return result;
   }

   /* Initialize all query statuses to unavailable */
   memset(pool->bo.map, 0, pool->bo.size);

   pool->type = pCreateInfo->queryType;
   pool->stride = slot_size;
   pool->size = pCreateInfo->queryCount;
   pool->pipeline_statistics = pCreateInfo->pipelineStatistics;
   *pQueryPool = tu_query_pool_to_handle(pool);

   return VK_SUCCESS;
}
void
tu_DestroyQueryPool(VkDevice _device,
                    VkQueryPool _pool,
                    const VkAllocationCallbacks *pAllocator)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   TU_FROM_HANDLE(tu_query_pool, pool, _pool);

   if (!pool)
      return;

   tu_bo_finish(device, &pool->bo);
   vk_object_free(&device->vk, pAllocator, pool);
}
static uint32_t
get_result_count(struct tu_query_pool *pool)
{
   switch (pool->type) {
   /* Occlusion and timestamp queries write one integer value */
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
      return 1;
   /* Transform feedback queries write two integer values */
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      return 2;
   default:
      assert(!"Invalid query type");
      return 0;
   }
}
/* Wait on the availability status of a query up until a timeout. */
static VkResult
wait_for_available(struct tu_device *device, struct tu_query_pool *pool,
                   uint32_t query)
{
   /* TODO: Use the MSM_IOVA_WAIT ioctl to wait on the available bit in a
    * scheduler friendly way instead of busy polling once the patch has landed
    * upstream. */
   struct query_slot *slot = slot_address(pool, query);
   uint64_t abs_timeout = os_time_get_absolute_timeout(
         WAIT_TIMEOUT * NSEC_PER_SEC);
   while (os_time_get_nano() < abs_timeout) {
      if (query_is_available(slot))
         return VK_SUCCESS;
   }
   return vk_error(device->instance, VK_TIMEOUT);
}
/* Writes a query value to a buffer from the CPU. */
static void
write_query_value_cpu(char* base,
                      uint32_t offset,
                      uint64_t value,
                      VkQueryResultFlags flags)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      *(uint64_t*)(base + (offset * sizeof(uint64_t))) = value;
   } else {
      *(uint32_t*)(base + (offset * sizeof(uint32_t))) = value;
   }
}
static VkResult
get_query_pool_results(struct tu_device *device,
                       struct tu_query_pool *pool,
                       uint32_t firstQuery,
                       uint32_t queryCount,
                       size_t dataSize,
                       void *pData,
                       VkDeviceSize stride,
                       VkQueryResultFlags flags)
{
   assert(dataSize >= stride * queryCount);

   char *result_base = pData;
   VkResult result = VK_SUCCESS;
   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;
      struct query_slot *slot = slot_address(pool, query);
      bool available = query_is_available(slot);
      uint32_t result_count = get_result_count(pool);

      if ((flags & VK_QUERY_RESULT_WAIT_BIT) && !available) {
         VkResult wait_result = wait_for_available(device, pool, query);
         if (wait_result != VK_SUCCESS)
            return wait_result;
         available = true;
      } else if (!(flags & VK_QUERY_RESULT_PARTIAL_BIT) && !available) {
         /* From the Vulkan 1.1.130 spec:
          *
          *    If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
          *    both not set then no result values are written to pData for
          *    queries that are in the unavailable state at the time of the
          *    call, and vkGetQueryPoolResults returns VK_NOT_READY. However,
          *    availability state is still written to pData for those queries
          *    if VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set.
          */
         result = VK_NOT_READY;
         if (!(flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)) {
            result_base += stride;
            continue;
         }
      }

      for (uint32_t k = 0; k < result_count; k++) {
         if (available) {
            uint64_t *result = query_result_addr(pool, query, k);
            write_query_value_cpu(result_base, k, *result, flags);
         } else if (flags & VK_QUERY_RESULT_PARTIAL_BIT)
            /* From the Vulkan 1.1.130 spec:
             *
             *    If VK_QUERY_RESULT_PARTIAL_BIT is set, VK_QUERY_RESULT_WAIT_BIT
             *    is not set, and the query’s status is unavailable, an
             *    intermediate result value between zero and the final result
             *    value is written to pData for that query.
             *
             * Just return 0 here for simplicity since it's a valid result.
             */
            write_query_value_cpu(result_base, k, 0, flags);
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
         /* From the Vulkan 1.1.130 spec:
          *
          *    If VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set, the final
          *    integer value written for each query is non-zero if the query’s
          *    status was available or zero if the status was unavailable.
          */
         write_query_value_cpu(result_base, result_count, available, flags);

      result_base += stride;
   }
   return result;
}
VkResult
tu_GetQueryPoolResults(VkDevice _device,
                       VkQueryPool queryPool,
                       uint32_t firstQuery,
                       uint32_t queryCount,
                       size_t dataSize,
                       void *pData,
                       VkDeviceSize stride,
                       VkQueryResultFlags flags)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(firstQuery + queryCount <= pool->size);

   if (tu_device_is_lost(device))
      return VK_ERROR_DEVICE_LOST;

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      return get_query_pool_results(device, pool, firstQuery, queryCount,
                                    dataSize, pData, stride, flags);
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }
   return VK_SUCCESS;
}
/* Copies a query value from one buffer to another from the GPU. */
static void
copy_query_value_gpu(struct tu_cmd_buffer *cmdbuf,
                     struct tu_cs *cs,
                     uint64_t src_iova,
                     uint64_t base_write_iova,
                     uint32_t offset,
                     VkQueryResultFlags flags) {
   uint32_t element_size = flags & VK_QUERY_RESULT_64_BIT ?
         sizeof(uint64_t) : sizeof(uint32_t);
   uint64_t write_iova = base_write_iova + (offset * element_size);

   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
   uint32_t mem_to_mem_flags = flags & VK_QUERY_RESULT_64_BIT ?
         CP_MEM_TO_MEM_0_DOUBLE : 0;
   tu_cs_emit(cs, mem_to_mem_flags);
   tu_cs_emit_qw(cs, write_iova);
   tu_cs_emit_qw(cs, src_iova);
}
static void
emit_copy_query_pool_results(struct tu_cmd_buffer *cmdbuf,
                             struct tu_cs *cs,
                             struct tu_query_pool *pool,
                             uint32_t firstQuery,
                             uint32_t queryCount,
                             struct tu_buffer *buffer,
                             VkDeviceSize dstOffset,
                             VkDeviceSize stride,
                             VkQueryResultFlags flags)
{
   /* From the Vulkan 1.1.130 spec:
    *
    *    vkCmdCopyQueryPoolResults is guaranteed to see the effect of previous
    *    uses of vkCmdResetQueryPool in the same queue, without any additional
    *    synchronization.
    *
    * To ensure that previous writes to the available bit are coherent, first
    * wait for all writes to complete.
    */
   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;
      uint64_t available_iova = query_available_iova(pool, query);
      uint64_t buffer_iova = tu_buffer_iova(buffer) + dstOffset + i * stride;
      uint32_t result_count = get_result_count(pool);

      /* Wait for the available bit to be set if executed with the
       * VK_QUERY_RESULT_WAIT_BIT flag. */
      if (flags & VK_QUERY_RESULT_WAIT_BIT) {
         tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
         tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
                        CP_WAIT_REG_MEM_0_POLL_MEMORY);
         tu_cs_emit_qw(cs, available_iova);
         tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0x1));
         tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
         tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));
      }

      for (uint32_t k = 0; k < result_count; k++) {
         uint64_t result_iova = query_result_iova(pool, query, k);

         if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
            /* Unconditionally copying the bo->result into the buffer here is
             * valid because we only set bo->result on vkCmdEndQuery. Thus, even
             * if the query is unavailable, this will copy the correct partial
             * value of 0. */
            copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
                                 k /* offset */, flags);
         } else {
            /* Conditionally copy bo->result into the buffer based on whether the
             * query is available.
             *
             * NOTE: For the conditional packets to be executed, CP_COND_EXEC
             * tests that ADDR0 != 0 and ADDR1 < REF. The packet here simply tests
             * that 0 < available < 2, aka available == 1.
             */
            tu_cs_reserve(cs, 7 + 6);
            tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6);
            tu_cs_emit_qw(cs, available_iova);
            tu_cs_emit_qw(cs, available_iova);
            tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2));
            tu_cs_emit(cs, 6); /* Cond execute the next 6 DWORDS */

            /* Start of conditional execution */
            copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
                                 k /* offset */, flags);
            /* End of conditional execution */
         }
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         copy_query_value_gpu(cmdbuf, cs, available_iova, buffer_iova,
                              result_count /* offset */, flags);
      }
   }

   tu_bo_list_add(&cmdbuf->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
}
void
tu_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,
                           VkQueryPool queryPool,
                           uint32_t firstQuery,
                           uint32_t queryCount,
                           VkBuffer dstBuffer,
                           VkDeviceSize dstOffset,
                           VkDeviceSize stride,
                           VkQueryResultFlags flags)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
   struct tu_cs *cs = &cmdbuf->cs;
   assert(firstQuery + queryCount <= pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      return emit_copy_query_pool_results(cmdbuf, cs, pool, firstQuery,
                                          queryCount, buffer, dstOffset,
                                          stride, flags);
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }
}
static void
emit_reset_query_pool(struct tu_cmd_buffer *cmdbuf,
                      struct tu_query_pool *pool,
                      uint32_t firstQuery,
                      uint32_t queryCount)
{
   struct tu_cs *cs = &cmdbuf->cs;

   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;

      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
      tu_cs_emit_qw(cs, query_available_iova(pool, query));
      tu_cs_emit_qw(cs, 0x0);

      for (uint32_t k = 0; k < get_result_count(pool); k++) {
         tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
         tu_cs_emit_qw(cs, query_result_iova(pool, query, k));
         tu_cs_emit_qw(cs, 0x0);
      }
   }
}
void
tu_CmdResetQueryPool(VkCommandBuffer commandBuffer,
                     VkQueryPool queryPool,
                     uint32_t firstQuery,
                     uint32_t queryCount)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);

   switch (pool->type) {
   case VK_QUERY_TYPE_TIMESTAMP:
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      emit_reset_query_pool(cmdbuf, pool, firstQuery, queryCount);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}
static void
emit_begin_occlusion_query(struct tu_cmd_buffer *cmdbuf,
                           struct tu_query_pool *pool,
                           uint32_t query)
{
   /* From the Vulkan 1.1.130 spec:
    *
    *    A query must begin and end inside the same subpass of a render pass
    *    instance, or must both begin and end outside of a render pass
    *    instance.
    *
    * Unlike on an immediate-mode renderer, Turnip renders all tiles on
    * vkCmdEndRenderPass, not individually on each vkCmdDraw*. As such, if a
    * query begins/ends inside the same subpass of a render pass, we need to
    * record the packets on the secondary draw command stream. cmdbuf->draw_cs
    * is then run on every tile during render, so we just need to accumulate
    * sample counts in slot->result to compute the query result.
    */
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;

   uint64_t begin_iova = occlusion_query_iova(pool, query, begin);

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_ADDR_LO(begin_iova));

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, ZPASS_DONE);
}
static void
emit_begin_xfb_query(struct tu_cmd_buffer *cmdbuf,
                     struct tu_query_pool *pool,
                     uint32_t query,
                     uint32_t stream_id)
{
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
   uint64_t begin_iova = primitive_query_iova(pool, query, begin[0], 0);

   tu_cs_emit_regs(cs, A6XX_VPC_SO_STREAM_COUNTS_LO(begin_iova));
   tu6_emit_event_write(cmdbuf, cs, WRITE_PRIMITIVE_COUNTS);
}
void
tu_CmdBeginQuery(VkCommandBuffer commandBuffer,
                 VkQueryPool queryPool,
                 uint32_t query,
                 VkQueryControlFlags flags)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      /* In freedreno, there is no implementation difference between
       * GL_SAMPLES_PASSED and GL_ANY_SAMPLES_PASSED, so we can similarly
       * ignore the VK_QUERY_CONTROL_PRECISE_BIT flag here.
       */
      emit_begin_occlusion_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      emit_begin_xfb_query(cmdbuf, pool, query, 0);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TIMESTAMP:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}
void
tu_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer,
                           VkQueryPool queryPool,
                           uint32_t query,
                           VkQueryControlFlags flags,
                           uint32_t index)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      emit_begin_xfb_query(cmdbuf, pool, query, index);
      break;
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}
static void
emit_end_occlusion_query(struct tu_cmd_buffer *cmdbuf,
                         struct tu_query_pool *pool,
                         uint32_t query)
{
   /* Ending an occlusion query happens in a few steps:
    *    1) Set the slot->end to UINT64_MAX.
    *    2) Set up the SAMPLE_COUNT registers and trigger a CP_EVENT_WRITE to
    *       write the current sample count value into slot->end.
    *    3) Since (2) is asynchronous, wait until slot->end is not equal to
    *       UINT64_MAX before continuing via CP_WAIT_REG_MEM.
    *    4) Accumulate the results of the query (slot->end - slot->begin) into
    *       slot->result.
    *    5) If vkCmdEndQuery is *not* called from within the scope of a render
    *       pass, set the slot's available bit since the query is now done.
    *    6) If vkCmdEndQuery *is* called from within the scope of a render
    *       pass, we cannot mark as available yet since the commands in
    *       draw_cs are not run until vkCmdEndRenderPass.
    */
   const struct tu_render_pass *pass = cmdbuf->state.pass;
   struct tu_cs *cs = pass ? &cmdbuf->draw_cs : &cmdbuf->cs;

   uint64_t available_iova = query_available_iova(pool, query);
   uint64_t begin_iova = occlusion_query_iova(pool, query, begin);
   uint64_t end_iova = occlusion_query_iova(pool, query, end);
   uint64_t result_iova = query_result_iova(pool, query, 0);
   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit_qw(cs, 0xffffffffffffffffull);

   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_ADDR_LO(end_iova));

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, ZPASS_DONE);

   tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
   tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_NE) |
                  CP_WAIT_REG_MEM_0_POLL_MEMORY);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0xffffffff));
   tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
   tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));

   /* result (dst) = result (srcA) + end (srcB) - begin (srcC) */
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
   tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C);
   tu_cs_emit_qw(cs, result_iova);
   tu_cs_emit_qw(cs, result_iova);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit_qw(cs, begin_iova);

   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   if (pass)
      /* Technically, queries should be tracked per-subpass, but here we track
       * at the render pass level to simplify the code a bit. This is safe
       * because the only commands that use the available bit are
       * vkCmdCopyQueryPoolResults and vkCmdResetQueryPool, both of which
       * cannot be invoked from inside a render pass scope.
       */
      cs = &cmdbuf->draw_epilogue_cs;

   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, available_iova);
   tu_cs_emit_qw(cs, 0x1);
}
static void
emit_end_xfb_query(struct tu_cmd_buffer *cmdbuf,
                   struct tu_query_pool *pool,
                   uint32_t query,
                   uint32_t stream_id)
{
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;

   uint64_t end_iova = primitive_query_iova(pool, query, end[0], 0);
   uint64_t result_written_iova = query_result_iova(pool, query, 0);
   uint64_t result_generated_iova = query_result_iova(pool, query, 1);
   uint64_t begin_written_iova = primitive_query_iova(pool, query,
                                                      begin[stream_id], 0);
   uint64_t begin_generated_iova = primitive_query_iova(pool, query,
                                                        begin[stream_id], 1);
   uint64_t end_written_iova = primitive_query_iova(pool, query,
                                                    end[stream_id], 0);
   uint64_t end_generated_iova = primitive_query_iova(pool, query,
                                                      end[stream_id], 1);
   uint64_t available_iova = query_available_iova(pool, query);

   tu_cs_emit_regs(cs, A6XX_VPC_SO_STREAM_COUNTS_LO(end_iova));
   tu6_emit_event_write(cmdbuf, cs, WRITE_PRIMITIVE_COUNTS);

   tu6_emit_event_write(cmdbuf, cs, CACHE_FLUSH_TS);

   /* Set the count of written primitives */
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
   tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C |
                  CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES | 0x80000000);
   tu_cs_emit_qw(cs, result_written_iova);
   tu_cs_emit_qw(cs, result_written_iova);
   tu_cs_emit_qw(cs, end_written_iova);
   tu_cs_emit_qw(cs, begin_written_iova);

   tu6_emit_event_write(cmdbuf, cs, CACHE_FLUSH_TS);

   /* Set the count of generated primitives */
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
   tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C |
                  CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES | 0x80000000);
   tu_cs_emit_qw(cs, result_generated_iova);
   tu_cs_emit_qw(cs, result_generated_iova);
   tu_cs_emit_qw(cs, end_generated_iova);
   tu_cs_emit_qw(cs, begin_generated_iova);

   /* Set the availability to 1 */
   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, available_iova);
   tu_cs_emit_qw(cs, 0x1);
}
/* Implement this bit of spec text from section 17.2 "Query Operation":
 *
 *    If queries are used while executing a render pass instance that has
 *    multiview enabled, the query uses N consecutive query indices in the
 *    query pool (starting at query) where N is the number of bits set in the
 *    view mask in the subpass the query is used in. How the numerical
 *    results of the query are distributed among the queries is
 *    implementation-dependent. For example, some implementations may write
 *    each view’s results to a distinct query, while other implementations
 *    may write the total result to the first query and write zero to the
 *    other queries. However, the sum of the results in all the queries must
 *    accurately reflect the total result of the query summed over all views.
 *    Applications can sum the results from all the queries to compute the
 *    total result.
 *
 * Since we execute all views at once, we write zero to the other queries.
 * Furthermore, because queries must be reset before use, and we set the
 * result to 0 in vkCmdResetQueryPool(), we just need to mark them as
 * available.
 */
static void
handle_multiview_queries(struct tu_cmd_buffer *cmd,
                         struct tu_query_pool *pool,
                         uint32_t query)
{
   if (!cmd->state.pass || !cmd->state.subpass->multiview_mask)
      return;

   unsigned views = util_bitcount(cmd->state.subpass->multiview_mask);
   struct tu_cs *cs = &cmd->draw_epilogue_cs;

   for (uint32_t i = 1; i < views; i++) {
      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
      tu_cs_emit_qw(cs, query_available_iova(pool, query + i));
      tu_cs_emit_qw(cs, 0x1);
   }
}
void
tu_CmdEndQuery(VkCommandBuffer commandBuffer,
               VkQueryPool queryPool,
               uint32_t query)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_end_occlusion_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      emit_end_xfb_query(cmdbuf, pool, query, 0);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TIMESTAMP:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   handle_multiview_queries(cmdbuf, pool, query);

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}
void
tu_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer,
                         VkQueryPool queryPool,
                         uint32_t query,
                         uint32_t index)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      emit_end_xfb_query(cmdbuf, pool, query, index);
      break;
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}
void
tu_CmdWriteTimestamp(VkCommandBuffer commandBuffer,
                     VkPipelineStageFlagBits pipelineStage,
                     VkQueryPool queryPool,
                     uint32_t query)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);

   tu_bo_list_add(&cmd->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);

   /* Inside a render pass, just write the timestamp multiple times so that
    * the user gets the last one if we use GMEM. There isn't really much
    * better we can do, and this seems to be what the blob does too.
    */
   struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs;

   /* Stages that will already have been executed by the time the CP executes
    * the REG_TO_MEM. DrawIndirect parameters are read by the CP, so the draw
    * indirect stage counts as top-of-pipe too.
    */
   VkPipelineStageFlags top_of_pipe_flags =
      VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT |
      VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT;

   if (pipelineStage & ~top_of_pipe_flags) {
      /* Execute a WFI so that all commands complete. Note that CP_REG_TO_MEM
       * does CP_WAIT_FOR_ME internally, which will wait for the WFI to
       * complete.
       *
       * Stalling the CP like this is really unfortunate, but I don't think
       * there's a better solution that allows all 48 bits of precision
       * because CP_EVENT_WRITE doesn't support 64-bit timestamps.
       */
      tu_cs_emit_wfi(cs);
   }

   tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
   tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_CP_ALWAYS_ON_COUNTER_LO) |
                  CP_REG_TO_MEM_0_CNT(2) |
                  CP_REG_TO_MEM_0_64B);
   tu_cs_emit_qw(cs, query_result_iova(pool, query, 0));

   /* Only flag availability once the entire renderpass is done, similar to
    * the begin/end path.
    */
   cs = cmd->state.pass ? &cmd->draw_epilogue_cs : &cmd->cs;

   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, query_available_iova(pool, query));
   tu_cs_emit_qw(cs, 0x1);

   /* From the spec for vkCmdWriteTimestamp:
    *
    *    If vkCmdWriteTimestamp is called while executing a render pass
    *    instance that has multiview enabled, the timestamp uses N consecutive
    *    query indices in the query pool (starting at query) where N is the
    *    number of bits set in the view mask of the subpass the command is
    *    executed in. The resulting query values are determined by an
    *    implementation-dependent choice of one of the following behaviors:
    *
    *    -   The first query is a timestamp value and (if more than one bit is
    *        set in the view mask) zero is written to the remaining queries.
    *        If two timestamps are written in the same subpass, the sum of the
    *        execution time of all views between those commands is the
    *        difference between the first query written by each command.
    *
    *    -   All N queries are timestamp values. If two timestamps are written
    *        in the same subpass, the sum of the execution time of all views
    *        between those commands is the sum of the difference between
    *        corresponding queries written by each command. The difference
    *        between corresponding queries may be the execution time of a
    *        single view.
    *
    * We execute all views in the same draw call, so we implement the first
    * option, the same as regular queries.
    */
   handle_multiview_queries(cmd, pool, query);
}