/*
 * Copyright 2016 Red Hat Inc.
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */
#include "tu_private.h"

#include "registers/adreno_pm4.xml.h"
#include "registers/adreno_common.xml.h"
#include "registers/a6xx.xml.h"

#include "nir/nir_builder.h"
#include "util/os_time.h"

#include "tu_cs.h"

#define NSEC_PER_SEC 1000000000ull
#define WAIT_TIMEOUT 5
/* Depending on the query type, there might be 2 integer values.
 * e.g. VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT
 *   values[0] : primitives written, values[1] : primitives generated
 */
struct PACKED slot_value {
   uint64_t values[2];
};

struct PACKED query_slot {
   struct slot_value available; /* 0 when unavailable, 1 when available */
   struct slot_value result;
};

struct PACKED occlusion_query_slot {
   struct query_slot common;
   struct slot_value begin;
   struct slot_value end;
};
/* Returns the IOVA of a given uint64_t field in a given slot of a query
 * pool. */
#define query_iova(type, pool, query, field, value_index)            \
   pool->bo.iova + pool->stride * query + offsetof(type, field) +    \
         offsetof(struct slot_value, values[value_index])

#define occlusion_query_iova(pool, query, field)                     \
   query_iova(struct occlusion_query_slot, pool, query, field, 0)

#define query_available_iova(pool, query)                            \
   query_iova(struct query_slot, pool, query, available, 0)

#define query_result_iova(pool, query, i)                            \
   query_iova(struct query_slot, pool, query, result, i)

#define query_is_available(slot) slot->available.values[0]
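
/* Putting the macros together: slot "query" starts at bo.iova + query *
 * pool->stride, and each field is a struct slot_value, so an occlusion slot
 * is laid out as [available][result][begin][end]. For example,
 * query_result_iova(pool, q, 0) resolves to
 *    bo.iova + q * stride + offsetof(struct query_slot, result)
 * which also addresses occlusion_query_slot::common.result, since common is
 * the first member of struct occlusion_query_slot.
 */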

/*
 * Returns a pointer to a given slot in a query pool.
 */
static void* slot_address(struct tu_query_pool *pool, uint32_t query)
{
   return (char*)pool->bo.map + query * pool->stride;
}

VkResult
tu_CreateQueryPool(VkDevice _device,
                   const VkQueryPoolCreateInfo *pCreateInfo,
                   const VkAllocationCallbacks *pAllocator,
                   VkQueryPool *pQueryPool)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
   assert(pCreateInfo->queryCount > 0);

   uint32_t slot_size;
   switch (pCreateInfo->queryType) {
   case VK_QUERY_TYPE_OCCLUSION:
      slot_size = sizeof(struct occlusion_query_slot);
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
      slot_size = sizeof(struct query_slot);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   struct tu_query_pool *pool =
         vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
                   VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);

   if (!pool)
      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

   VkResult result = tu_bo_init_new(device, &pool->bo,
         pCreateInfo->queryCount * slot_size);
   if (result != VK_SUCCESS) {
      vk_free2(&device->alloc, pAllocator, pool);
      return result;
   }

   result = tu_bo_map(device, &pool->bo);
   if (result != VK_SUCCESS) {
      tu_bo_finish(device, &pool->bo);
      vk_free2(&device->alloc, pAllocator, pool);
      return result;
   }

   /* Initialize all query statuses to unavailable */
   memset(pool->bo.map, 0, pool->bo.size);

   pool->type = pCreateInfo->queryType;
   pool->stride = slot_size;
   pool->size = pCreateInfo->queryCount;
   pool->pipeline_statistics = pCreateInfo->pipelineStatistics;
   *pQueryPool = tu_query_pool_to_handle(pool);

   return VK_SUCCESS;
}

void
tu_DestroyQueryPool(VkDevice _device,
                    VkQueryPool _pool,
                    const VkAllocationCallbacks *pAllocator)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   TU_FROM_HANDLE(tu_query_pool, pool, _pool);

   if (!pool)
      return;

   tu_bo_finish(device, &pool->bo);
   vk_free2(&device->alloc, pAllocator, pool);
}

static uint32_t
get_result_count(struct tu_query_pool *pool)
{
   switch (pool->type) {
   /* Occlusion and timestamp queries write one integer value */
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
      return 1;
   /* Transform feedback queries write two integer values */
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      return 2;
   default:
      assert(!"Invalid query type");
      return 0;
   }
}
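
/* The result count above determines the per-query layout that
 * get_query_pool_results and emit_copy_query_pool_results produce:
 * result_count integers of the size selected by VK_QUERY_RESULT_64_BIT,
 * optionally followed by one more integer holding the availability status
 * when VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set. For example, a 64-bit
 * transform feedback stream query read with the availability bit occupies
 *    { primitives written, primitives generated, available } = 24 bytes
 * per query in the destination buffer.
 */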

/* Wait on the availability status of a query up until a timeout. */
static VkResult
wait_for_available(struct tu_device *device, struct tu_query_pool *pool,
                   uint32_t query)
{
   /* TODO: Use the MSM_IOVA_WAIT ioctl to wait on the available bit in a
    * scheduler friendly way instead of busy polling once the patch has
    * landed. */
   struct query_slot *slot = slot_address(pool, query);
   uint64_t abs_timeout = os_time_get_absolute_timeout(
         WAIT_TIMEOUT * NSEC_PER_SEC);
   while(os_time_get_nano() < abs_timeout) {
      if (query_is_available(slot))
         return VK_SUCCESS;
   }
   return vk_error(device->instance, VK_TIMEOUT);
}

/* Writes a query value to a buffer from the CPU. */
static void
write_query_value_cpu(char* base,
                      uint32_t offset,
                      uint64_t value,
                      VkQueryResultFlags flags)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      *(uint64_t*)(base + (offset * sizeof(uint64_t))) = value;
   } else {
      *(uint32_t*)(base + (offset * sizeof(uint32_t))) = value;
   }
}

static VkResult
get_query_pool_results(struct tu_device *device,
                       struct tu_query_pool *pool,
                       uint32_t firstQuery,
                       uint32_t queryCount,
                       size_t dataSize,
                       void *pData,
                       VkDeviceSize stride,
                       VkQueryResultFlags flags)
{
   assert(dataSize >= stride * queryCount);

   char *result_base = pData;
   VkResult result = VK_SUCCESS;
   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;
      struct query_slot *slot = slot_address(pool, query);
      bool available = query_is_available(slot);
      uint32_t result_count = get_result_count(pool);

      if ((flags & VK_QUERY_RESULT_WAIT_BIT) && !available) {
         VkResult wait_result = wait_for_available(device, pool, query);
         if (wait_result != VK_SUCCESS)
            return wait_result;
         available = true;
      } else if (!(flags & VK_QUERY_RESULT_PARTIAL_BIT) && !available) {
         /* From the Vulkan 1.1.130 spec:
          *
          *    If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
          *    both not set then no result values are written to pData for
          *    queries that are in the unavailable state at the time of the
          *    call, and vkGetQueryPoolResults returns VK_NOT_READY. However,
          *    availability state is still written to pData for those queries
          *    if VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set.
          */
         result = VK_NOT_READY;
         if (!(flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)) {
            result_base += stride;
            continue;
         }
      }

      for (uint32_t k = 0; k < result_count; k++) {
         if (available)
            write_query_value_cpu(result_base, k, slot->result.values[k], flags);
         else if (flags & VK_QUERY_RESULT_PARTIAL_BIT)
            /* From the Vulkan 1.1.130 spec:
             *
             *    If VK_QUERY_RESULT_PARTIAL_BIT is set, VK_QUERY_RESULT_WAIT_BIT
             *    is not set, and the query’s status is unavailable, an
             *    intermediate result value between zero and the final result
             *    value is written to pData for that query.
             *
             * Just return 0 here for simplicity since it's a valid result.
             */
            write_query_value_cpu(result_base, k, 0, flags);
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
         /* From the Vulkan 1.1.130 spec:
          *
          *    If VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set, the final
          *    integer value written for each query is non-zero if the query’s
          *    status was available or zero if the status was unavailable.
          */
         write_query_value_cpu(result_base, result_count, available, flags);

      result_base += stride;
   }
   return result;
}

VkResult
tu_GetQueryPoolResults(VkDevice _device,
                       VkQueryPool queryPool,
                       uint32_t firstQuery,
                       uint32_t queryCount,
                       size_t dataSize,
                       void *pData,
                       VkDeviceSize stride,
                       VkQueryResultFlags flags)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(firstQuery + queryCount <= pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
      return get_query_pool_results(device, pool, firstQuery, queryCount,
                                    dataSize, pData, stride, flags);
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }
   return VK_SUCCESS;
}

/* Copies a query value from one buffer to another from the GPU. */
static void
copy_query_value_gpu(struct tu_cmd_buffer *cmdbuf,
                     struct tu_cs *cs,
                     uint64_t src_iova,
                     uint64_t base_write_iova,
                     uint32_t offset,
                     VkQueryResultFlags flags) {
   uint32_t element_size = flags & VK_QUERY_RESULT_64_BIT ?
         sizeof(uint64_t) : sizeof(uint32_t);
   uint64_t write_iova = base_write_iova + (offset * element_size);
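
   /* CP_MEM_TO_MEM below carries 5 payload dwords: one dword of flags
    * followed by the destination and source addresses as two qwords each.
    * CP_MEM_TO_MEM_0_DOUBLE selects a 64-bit copy, matching the element
    * size chosen for VK_QUERY_RESULT_64_BIT above.
    */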
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
   uint32_t mem_to_mem_flags = flags & VK_QUERY_RESULT_64_BIT ?
         CP_MEM_TO_MEM_0_DOUBLE : 0;
   tu_cs_emit(cs, mem_to_mem_flags);
   tu_cs_emit_qw(cs, write_iova);
   tu_cs_emit_qw(cs, src_iova);
}

static void
emit_copy_query_pool_results(struct tu_cmd_buffer *cmdbuf,
                             struct tu_cs *cs,
                             struct tu_query_pool *pool,
                             uint32_t firstQuery,
                             uint32_t queryCount,
                             struct tu_buffer *buffer,
                             VkDeviceSize dstOffset,
                             VkDeviceSize stride,
                             VkQueryResultFlags flags)
{
   /* From the Vulkan 1.1.130 spec:
    *
    *    vkCmdCopyQueryPoolResults is guaranteed to see the effect of previous
    *    uses of vkCmdResetQueryPool in the same queue, without any additional
    *    synchronization.
    *
    * To ensure that previous writes to the available bit are coherent, first
    * wait for all writes to complete.
    */
   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;
      uint64_t available_iova = query_available_iova(pool, query);
      uint64_t buffer_iova = tu_buffer_iova(buffer) + dstOffset + i * stride;
      uint32_t result_count = get_result_count(pool);

      /* Wait for the available bit to be set if executed with the
       * VK_QUERY_RESULT_WAIT_BIT flag. */
      if (flags & VK_QUERY_RESULT_WAIT_BIT) {
         tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
         tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
                        CP_WAIT_REG_MEM_0_POLL_MEMORY);
         tu_cs_emit_qw(cs, available_iova);
         tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0x1));
         tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
         tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));
      }

      for (uint32_t k = 0; k < result_count; k++) {
         uint64_t result_iova = query_result_iova(pool, query, k);

         if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
            /* Unconditionally copying the bo->result into the buffer here is
             * valid because we only set bo->result on vkCmdEndQuery. Thus, even
             * if the query is unavailable, this will copy the correct partial
             * value. */
            copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
                                 k /* offset */, flags);
         } else {
            /* Conditionally copy bo->result into the buffer based on whether
             * the query is available.
             *
             * NOTE: For the conditional packets to be executed, CP_COND_EXEC
             * tests that ADDR0 != 0 and ADDR1 < REF. The packet here simply
             * tests that 0 < available < 2, aka available == 1.
             */
            tu_cs_reserve(cs, 7 + 6);
            tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6);
            tu_cs_emit_qw(cs, available_iova);
            tu_cs_emit_qw(cs, available_iova);
            tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2));
            tu_cs_emit(cs, 6); /* Cond execute the next 6 DWORDS */

            /* Start of conditional execution */
            copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
                                 k /* offset */, flags);
            /* End of conditional execution */
         }
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         copy_query_value_gpu(cmdbuf, cs, available_iova, buffer_iova,
                              result_count /* offset */, flags);
      }
   }

   tu_bo_list_add(&cmdbuf->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
}

void
tu_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,
                           VkQueryPool queryPool,
                           uint32_t firstQuery,
                           uint32_t queryCount,
                           VkBuffer dstBuffer,
                           VkDeviceSize dstOffset,
                           VkDeviceSize stride,
                           VkQueryResultFlags flags)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
   struct tu_cs *cs = &cmdbuf->cs;
   assert(firstQuery + queryCount <= pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
      return emit_copy_query_pool_results(cmdbuf, cs, pool, firstQuery,
                                          queryCount, buffer, dstOffset,
                                          stride, flags);
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }
}

static void
emit_reset_query_pool(struct tu_cmd_buffer *cmdbuf,
                      struct tu_query_pool *pool,
                      uint32_t firstQuery,
                      uint32_t queryCount)
{
   struct tu_cs *cs = &cmdbuf->cs;

   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;

      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
      tu_cs_emit_qw(cs, query_available_iova(pool, query));
      tu_cs_emit_qw(cs, 0x0);

      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
      tu_cs_emit_qw(cs, query_result_iova(pool, query, 0));
      tu_cs_emit_qw(cs, 0x0);
      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
      tu_cs_emit_qw(cs, query_result_iova(pool, query, 1));
      tu_cs_emit_qw(cs, 0x0);
   }
}

void
tu_CmdResetQueryPool(VkCommandBuffer commandBuffer,
                     VkQueryPool queryPool,
                     uint32_t firstQuery,
                     uint32_t queryCount)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);

   switch (pool->type) {
   case VK_QUERY_TYPE_TIMESTAMP:
   case VK_QUERY_TYPE_OCCLUSION:
      emit_reset_query_pool(cmdbuf, pool, firstQuery, queryCount);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}

static void
emit_begin_occlusion_query(struct tu_cmd_buffer *cmdbuf,
                           struct tu_query_pool *pool,
                           uint32_t query)
{
   /* From the Vulkan 1.1.130 spec:
    *
    *    A query must begin and end inside the same subpass of a render pass
    *    instance, or must both begin and end outside of a render pass
    *    instance.
    *
    * Unlike on an immediate-mode renderer, Turnip renders all tiles on
    * vkCmdEndRenderPass, not individually on each vkCmdDraw*. As such, if a
    * query begins/ends inside the same subpass of a render pass, we need to
    * record the packets on the secondary draw command stream. cmdbuf->draw_cs
    * is then run on every tile during render, so we just need to accumulate
    * sample counts in slot->result to compute the query result.
    */
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;

   uint64_t begin_iova = occlusion_query_iova(pool, query, begin);
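
   /* Snapshot the current sample counter into slot->begin: point the
    * RB_SAMPLE_COUNT copy destination at begin_iova and trigger a ZPASS_DONE
    * event, which asynchronously writes the counter out to memory. The
    * matching snapshot into slot->end happens in emit_end_occlusion_query,
    * and slot->result accumulates end - begin across tiles.
    */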
   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_ADDR_LO(begin_iova));

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, ZPASS_DONE);
}

void
tu_CmdBeginQuery(VkCommandBuffer commandBuffer,
                 VkQueryPool queryPool,
                 uint32_t query,
                 VkQueryControlFlags flags)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      /* In freedreno, there is no implementation difference between
       * GL_SAMPLES_PASSED and GL_ANY_SAMPLES_PASSED, so we can similarly
       * ignore the VK_QUERY_CONTROL_PRECISE_BIT flag here.
       */
      emit_begin_occlusion_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TIMESTAMP:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}

static void
emit_end_occlusion_query(struct tu_cmd_buffer *cmdbuf,
                         struct tu_query_pool *pool,
                         uint32_t query)
{
   /* Ending an occlusion query happens in a few steps:
    *    1) Set the slot->end to UINT64_MAX.
    *    2) Set up the SAMPLE_COUNT registers and trigger a CP_EVENT_WRITE to
    *       write the current sample count value into slot->end.
    *    3) Since (2) is asynchronous, wait until slot->end is not equal to
    *       UINT64_MAX before continuing via CP_WAIT_REG_MEM.
    *    4) Accumulate the results of the query (slot->end - slot->begin) into
    *       slot->result.
    *    5) If vkCmdEndQuery is *not* called from within the scope of a render
    *       pass, set the slot's available bit since the query is now done.
    *    6) If vkCmdEndQuery *is* called from within the scope of a render
    *       pass, we cannot mark as available yet since the commands in
    *       draw_cs are not run until vkCmdEndRenderPass.
    */
   const struct tu_render_pass *pass = cmdbuf->state.pass;
   struct tu_cs *cs = pass ? &cmdbuf->draw_cs : &cmdbuf->cs;

   uint64_t available_iova = query_available_iova(pool, query);
   uint64_t begin_iova = occlusion_query_iova(pool, query, begin);
   uint64_t end_iova = occlusion_query_iova(pool, query, end);
   uint64_t result_iova = query_result_iova(pool, query, 0);
   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit_qw(cs, 0xffffffffffffffffull);

   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_ADDR_LO(end_iova));

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, ZPASS_DONE);

   tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
   tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_NE) |
                  CP_WAIT_REG_MEM_0_POLL_MEMORY);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0xffffffff));
   tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
   tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));

   /* result (dst) = result (srcA) + end (srcB) - begin (srcC) */
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
   tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C);
   tu_cs_emit_qw(cs, result_iova);
   tu_cs_emit_qw(cs, result_iova);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit_qw(cs, begin_iova);

   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   if (pass)
      /* Technically, queries should be tracked per-subpass, but here we track
       * at the render pass level to simplify the code a bit. This is safe
       * because the only commands that use the available bit are
       * vkCmdCopyQueryPoolResults and vkCmdResetQueryPool, both of which
       * cannot be invoked from inside a render pass scope.
       */
      cs = &cmdbuf->draw_epilogue_cs;

   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, available_iova);
   tu_cs_emit_qw(cs, 0x1);
}

void
tu_CmdEndQuery(VkCommandBuffer commandBuffer,
               VkQueryPool queryPool,
               uint32_t query)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_end_occlusion_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TIMESTAMP:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}

void
tu_CmdWriteTimestamp(VkCommandBuffer commandBuffer,
                     VkPipelineStageFlagBits pipelineStage,
                     VkQueryPool queryPool,
                     uint32_t query)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   struct tu_cs *cs = cmd->state.pass ? &cmd->draw_epilogue_cs : &cmd->cs;

   /* WFI to get more accurate timestamp */
   tu_cs_emit_wfi(cs);

   tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
   tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_CP_ALWAYS_ON_COUNTER_LO) |
                  CP_REG_TO_MEM_0_CNT(2) |
                  CP_REG_TO_MEM_0_64B);
   tu_cs_emit_qw(cs, query_result_iova(pool, query, 0));

   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, query_available_iova(pool, query));
   tu_cs_emit_qw(cs, 0x1);

   if (cmd->state.pass) {
      /* TODO: to have useful in-renderpass timestamps:
       * for the sysmem path, we can just emit the timestamp in draw_cs,
       * for the gmem renderpass, we do something with accumulate,
       * but I'm not sure that would follow the spec
       */
      tu_finishme("CmdWriteTimestamp in renderpass not accurate");
   }
}