/*
 * Copyright 2016 Red Hat Inc.
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include "tu_private.h"

#include <assert.h>
#include <string.h>

#include "registers/adreno_pm4.xml.h"
#include "registers/adreno_common.xml.h"
#include "registers/a6xx.xml.h"

#include "nir/nir_builder.h"
#include "util/os_time.h"

#include "tu_cs.h"

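/* Timeout for CPU-side polling on query availability: WAIT_TIMEOUT is
 * expressed in seconds and converted to nanoseconds with NSEC_PER_SEC.
 */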
#define NSEC_PER_SEC 1000000000ull
#define WAIT_TIMEOUT 5

/* It seems like sample counts need to be copied over to 16-byte aligned
 * memory. */
struct PACKED slot_value
{
   uint64_t value;
   uint64_t __padding;
};

struct PACKED query_slot
{
   struct slot_value available; /* 0 when unavailable, 1 when available */
   struct slot_value result;
};

struct PACKED occlusion_query_slot
{
   struct query_slot common;
   struct slot_value begin;
   struct slot_value end;
};

/* Returns the IOVA of a given uint64_t field in a given slot of a query
 * pool. */
#define query_iova(type, pool, query, field)                     \
   pool->bo.iova + pool->stride * query + offsetof(type, field) + \
   offsetof(struct slot_value, value)

#define occlusion_query_iova(pool, query, field)                 \
   query_iova(struct occlusion_query_slot, pool, query, field)

#define query_available_iova(pool, query)                        \
   query_iova(struct query_slot, pool, query, available)

#define query_result_iova(pool, query)                           \
   query_iova(struct query_slot, pool, query, result)

#define query_is_available(slot) slot->available.value

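/* Illustrative example (not part of the original source): with the 16-byte
 * slot_value layout above, query_result_iova(pool, 0) evaluates to
 * pool->bo.iova + 16, i.e. the value field of the result slot_value that
 * follows the availability slot_value of the first query_slot.
 */
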
/*
 * Returns a pointer to a given slot in a query pool.
 */
static void* slot_address(struct tu_query_pool *pool, uint32_t query)
{
   return (char*)pool->bo.map + query * pool->stride;
}

VkResult
tu_CreateQueryPool(VkDevice _device,
                   const VkQueryPoolCreateInfo *pCreateInfo,
                   const VkAllocationCallbacks *pAllocator,
                   VkQueryPool *pQueryPool)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
   assert(pCreateInfo->queryCount > 0);

   uint32_t slot_size;
   switch (pCreateInfo->queryType) {
   case VK_QUERY_TYPE_OCCLUSION:
      slot_size = sizeof(struct occlusion_query_slot);
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
      slot_size = sizeof(struct query_slot);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   struct tu_query_pool *pool =
      vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
                VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (!pool)
      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

   VkResult result = tu_bo_init_new(device, &pool->bo,
                                    pCreateInfo->queryCount * slot_size);
   if (result != VK_SUCCESS) {
      vk_free2(&device->alloc, pAllocator, pool);
      return vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
   }

   result = tu_bo_map(device, &pool->bo);
   if (result != VK_SUCCESS) {
      tu_bo_finish(device, &pool->bo);
      vk_free2(&device->alloc, pAllocator, pool);
      return vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
   }

   /* Initialize all query statuses to unavailable */
   memset(pool->bo.map, 0, pool->bo.size);

   pool->type = pCreateInfo->queryType;
   pool->stride = slot_size;
   pool->size = pCreateInfo->queryCount;
   pool->pipeline_statistics = pCreateInfo->pipelineStatistics;
   *pQueryPool = tu_query_pool_to_handle(pool);

   return VK_SUCCESS;
}

void
tu_DestroyQueryPool(VkDevice _device,
                    VkQueryPool _pool,
                    const VkAllocationCallbacks *pAllocator)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   TU_FROM_HANDLE(tu_query_pool, pool, _pool);

   if (!pool)
      return;

   tu_bo_finish(device, &pool->bo);
   vk_free2(&device->alloc, pAllocator, pool);
}

/* Wait on the availability status of a query up until a timeout. */
static VkResult
wait_for_available(struct tu_device *device, struct tu_query_pool *pool,
                   uint32_t query)
{
   /* TODO: Use the MSM_IOVA_WAIT ioctl to wait on the available bit in a
    * scheduler friendly way instead of busy polling once the patch has landed
    * upstream. */
   struct query_slot *slot = slot_address(pool, query);
   uint64_t abs_timeout = os_time_get_absolute_timeout(
         WAIT_TIMEOUT * NSEC_PER_SEC);
   while(os_time_get_nano() < abs_timeout) {
      if (query_is_available(slot))
         return VK_SUCCESS;
   }
   return vk_error(device->instance, VK_TIMEOUT);
}

/* Writes a query value to a buffer from the CPU. */
static void
write_query_value_cpu(char* base,
                      uint32_t offset,
                      uint64_t value,
                      VkQueryResultFlags flags)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      *(uint64_t*)(base + (offset * sizeof(uint64_t))) = value;
   } else {
      *(uint32_t*)(base + (offset * sizeof(uint32_t))) = value;
   }
}

static VkResult
get_query_pool_results(struct tu_device *device,
                       struct tu_query_pool *pool,
                       uint32_t firstQuery,
                       uint32_t queryCount,
                       size_t dataSize,
                       void *pData,
                       VkDeviceSize stride,
                       VkQueryResultFlags flags)
{
   assert(dataSize >= stride * queryCount);

   char *result_base = pData;
   VkResult result = VK_SUCCESS;
   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;
      struct query_slot *slot = slot_address(pool, query);
      bool available = query_is_available(slot);

      if ((flags & VK_QUERY_RESULT_WAIT_BIT) && !available) {
         VkResult wait_result = wait_for_available(device, pool, query);
         if (wait_result != VK_SUCCESS)
            return wait_result;
         available = true;
      } else if (!(flags & VK_QUERY_RESULT_PARTIAL_BIT) && !available) {
         /* From the Vulkan 1.1.130 spec:
          *
          *    If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
          *    both not set then no result values are written to pData for
          *    queries that are in the unavailable state at the time of the
          *    call, and vkGetQueryPoolResults returns VK_NOT_READY. However,
          *    availability state is still written to pData for those queries
          *    if VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set.
          */
         result = VK_NOT_READY;
         if (!(flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)) {
            result_base += stride;
            continue;
         }
      }

      if (available)
         write_query_value_cpu(result_base, 0, slot->result.value, flags);
      else if (flags & VK_QUERY_RESULT_PARTIAL_BIT)
         /* From the Vulkan 1.1.130 spec:
          *
          *    If VK_QUERY_RESULT_PARTIAL_BIT is set, VK_QUERY_RESULT_WAIT_BIT
          *    is not set, and the query's status is unavailable, an
          *    intermediate result value between zero and the final result
          *    value is written to pData for that query.
          *
          * Just return 0 here for simplicity since it's a valid result.
          */
         write_query_value_cpu(result_base, 0, 0, flags);

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
         /* From the Vulkan 1.1.130 spec:
          *
          *    If VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set, the final
          *    integer value written for each query is non-zero if the query's
          *    status was available or zero if the status was unavailable.
          */
         write_query_value_cpu(result_base, 1, available, flags);

      result_base += stride;
   }
   return result;
}

VkResult
tu_GetQueryPoolResults(VkDevice _device,
                       VkQueryPool queryPool,
                       uint32_t firstQuery,
                       uint32_t queryCount,
                       size_t dataSize,
                       void *pData,
                       VkDeviceSize stride,
                       VkQueryResultFlags flags)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(firstQuery + queryCount <= pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
      return get_query_pool_results(device, pool, firstQuery, queryCount,
                                    dataSize, pData, stride, flags);
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }
   return VK_SUCCESS;
}

/* Copies a query value from one buffer to another from the GPU. */
static void
copy_query_value_gpu(struct tu_cmd_buffer *cmdbuf,
                     struct tu_cs *cs,
                     uint64_t src_iova,
                     uint64_t base_write_iova,
                     uint32_t offset,
                     VkQueryResultFlags flags) {
   uint32_t element_size = flags & VK_QUERY_RESULT_64_BIT ?
         sizeof(uint64_t) : sizeof(uint32_t);
   uint64_t write_iova = base_write_iova + (offset * element_size);

   /* CP_MEM_TO_MEM takes a flags dword followed by the destination and
    * source addresses as qwords (5 dwords total). */
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
   uint32_t mem_to_mem_flags = flags & VK_QUERY_RESULT_64_BIT ?
         CP_MEM_TO_MEM_0_DOUBLE : 0;
   tu_cs_emit(cs, mem_to_mem_flags);
   tu_cs_emit_qw(cs, write_iova);
   tu_cs_emit_qw(cs, src_iova);
}

static void
emit_copy_query_pool_results(struct tu_cmd_buffer *cmdbuf,
                             struct tu_cs *cs,
                             struct tu_query_pool *pool,
                             uint32_t firstQuery,
                             uint32_t queryCount,
                             struct tu_buffer *buffer,
                             VkDeviceSize dstOffset,
                             VkDeviceSize stride,
                             VkQueryResultFlags flags)
{
   /* From the Vulkan 1.1.130 spec:
    *
    *    vkCmdCopyQueryPoolResults is guaranteed to see the effect of previous
    *    uses of vkCmdResetQueryPool in the same queue, without any additional
    *    synchronization.
    *
    * To ensure that previous writes to the available bit are coherent, first
    * wait for all writes to complete.
    */
   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;
      uint64_t available_iova = query_available_iova(pool, query);
      uint64_t result_iova = query_result_iova(pool, query);
      uint64_t buffer_iova = tu_buffer_iova(buffer) + dstOffset + i * stride;

      /* Wait for the available bit to be set if executed with the
       * VK_QUERY_RESULT_WAIT_BIT flag. */
      if (flags & VK_QUERY_RESULT_WAIT_BIT) {
         tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
         tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
                        CP_WAIT_REG_MEM_0_POLL_MEMORY);
         tu_cs_emit_qw(cs, available_iova);
         tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0x1));
         tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
         tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));
      }

      if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
         /* Unconditionally copying the bo->result into the buffer here is
          * valid because we only set bo->result on vkCmdEndQuery. Thus, even
          * if the query is unavailable, this will copy the correct partial
          * value of 0. */
         copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
                              0 /* offset */, flags);
      } else {
         /* Conditionally copy bo->result into the buffer based on whether the
          * query is available.
          *
          * NOTE: For the conditional packets to be executed, CP_COND_EXEC
          * tests that ADDR0 != 0 and ADDR1 < REF. The packet here simply tests
          * that 0 < available < 2, aka available == 1.
          */
         tu_cs_reserve(cs, 7 + 6);
         tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6);
         tu_cs_emit_qw(cs, available_iova);
         tu_cs_emit_qw(cs, available_iova);
         tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2));
         tu_cs_emit(cs, 6); /* Cond execute the next 6 DWORDS */

         /* Start of conditional execution */
         copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
                              0 /* offset */, flags);
         /* End of conditional execution */
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         copy_query_value_gpu(cmdbuf, cs, available_iova, buffer_iova,
                              1 /* offset */, flags);
      }
   }

   tu_bo_list_add(&cmdbuf->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
}

void
tu_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,
                           VkQueryPool queryPool,
                           uint32_t firstQuery,
                           uint32_t queryCount,
                           VkBuffer dstBuffer,
                           VkDeviceSize dstOffset,
                           VkDeviceSize stride,
                           VkQueryResultFlags flags)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
   struct tu_cs *cs = &cmdbuf->cs;
   assert(firstQuery + queryCount <= pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
      return emit_copy_query_pool_results(cmdbuf, cs, pool, firstQuery,
                                          queryCount, buffer, dstOffset,
                                          stride, flags);
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }
}

static void
emit_reset_query_pool(struct tu_cmd_buffer *cmdbuf,
                      struct tu_query_pool *pool,
                      uint32_t firstQuery,
                      uint32_t queryCount)
{
   struct tu_cs *cs = &cmdbuf->cs;

   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;

      /* Clear the availability bit for this query. */
      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
      tu_cs_emit_qw(cs, query_available_iova(pool, query));
      tu_cs_emit_qw(cs, 0x0);

      /* Clear any previously accumulated result. */
      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
      tu_cs_emit_qw(cs, query_result_iova(pool, query));
      tu_cs_emit_qw(cs, 0x0);
   }
}

void
tu_CmdResetQueryPool(VkCommandBuffer commandBuffer,
                     VkQueryPool queryPool,
                     uint32_t firstQuery,
                     uint32_t queryCount)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);

   switch (pool->type) {
   case VK_QUERY_TYPE_TIMESTAMP:
   case VK_QUERY_TYPE_OCCLUSION:
      emit_reset_query_pool(cmdbuf, pool, firstQuery, queryCount);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}

static void
emit_begin_occlusion_query(struct tu_cmd_buffer *cmdbuf,
                           struct tu_query_pool *pool,
                           uint32_t query)
{
   /* From the Vulkan 1.1.130 spec:
    *
    *    A query must begin and end inside the same subpass of a render pass
    *    instance, or must both begin and end outside of a render pass
    *    instance.
    *
    * Unlike on an immediate-mode renderer, Turnip renders all tiles on
    * vkCmdEndRenderPass, not individually on each vkCmdDraw*. As such, if a
    * query begins/ends inside the same subpass of a render pass, we need to
    * record the packets on the secondary draw command stream. cmdbuf->draw_cs
    * is then run on every tile during render, so we just need to accumulate
    * sample counts in slot->result to compute the query result.
    */
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;

   uint64_t begin_iova = occlusion_query_iova(pool, query, begin);

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_ADDR_LO(begin_iova));
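
   /* With RB_SAMPLE_COUNT_CONTROL.copy set, the ZPASS_DONE event below makes
    * the hardware write the current sample counter to the address programmed
    * above, i.e. slot->begin. */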
   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, ZPASS_DONE);
}

void
tu_CmdBeginQuery(VkCommandBuffer commandBuffer,
                 VkQueryPool queryPool,
                 uint32_t query,
                 VkQueryControlFlags flags)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      /* In freedreno, there is no implementation difference between
       * GL_SAMPLES_PASSED and GL_ANY_SAMPLES_PASSED, so we can similarly
       * ignore the VK_QUERY_CONTROL_PRECISE_BIT flag here.
       */
      emit_begin_occlusion_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TIMESTAMP:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}

static void
emit_end_occlusion_query(struct tu_cmd_buffer *cmdbuf,
                         struct tu_query_pool *pool,
                         uint32_t query)
{
   /* Ending an occlusion query happens in a few steps:
    *    1) Set the slot->end to UINT64_MAX.
    *    2) Set up the SAMPLE_COUNT registers and trigger a CP_EVENT_WRITE to
    *       write the current sample count value into slot->end.
    *    3) Since (2) is asynchronous, wait until slot->end is not equal to
    *       UINT64_MAX before continuing via CP_WAIT_REG_MEM.
    *    4) Accumulate the results of the query (slot->end - slot->begin) into
    *       slot->result.
    *    5) If vkCmdEndQuery is *not* called from within the scope of a render
    *       pass, set the slot's available bit since the query is now done.
    *    6) If vkCmdEndQuery *is* called from within the scope of a render
    *       pass, we cannot mark as available yet since the commands in
    *       draw_cs are not run until vkCmdEndRenderPass.
    */
   const struct tu_render_pass *pass = cmdbuf->state.pass;
   struct tu_cs *cs = pass ? &cmdbuf->draw_cs : &cmdbuf->cs;

   uint64_t available_iova = query_available_iova(pool, query);
   uint64_t begin_iova = occlusion_query_iova(pool, query, begin);
   uint64_t end_iova = occlusion_query_iova(pool, query, end);
   uint64_t result_iova = query_result_iova(pool, query);
   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit_qw(cs, 0xffffffffffffffffull);

   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_ADDR_LO(end_iova));

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, ZPASS_DONE);

   tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
   tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_NE) |
                  CP_WAIT_REG_MEM_0_POLL_MEMORY);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0xffffffff));
   tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
   tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));

   /* result (dst) = result (srcA) + end (srcB) - begin (srcC) */
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
   tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C);
   tu_cs_emit_qw(cs, result_iova);
   tu_cs_emit_qw(cs, result_iova);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit_qw(cs, begin_iova);

   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   if (pass)
      /* Technically, queries should be tracked per-subpass, but here we track
       * at the render pass level to simplify the code a bit. This is safe
       * because the only commands that use the available bit are
       * vkCmdCopyQueryPoolResults and vkCmdResetQueryPool, both of which
       * cannot be invoked from inside a render pass scope.
       */
      cs = &cmdbuf->draw_epilogue_cs;
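
   /* Mark the query as available (steps 5/6 above). When inside a render
    * pass this lands in draw_epilogue_cs, which only runs once all tiles
    * have been rendered. */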
   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, available_iova);
   tu_cs_emit_qw(cs, 0x1);
}

void
tu_CmdEndQuery(VkCommandBuffer commandBuffer,
               VkQueryPool queryPool,
               uint32_t query)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_end_occlusion_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TIMESTAMP:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}

void
tu_CmdWriteTimestamp(VkCommandBuffer commandBuffer,
                     VkPipelineStageFlagBits pipelineStage,
                     VkQueryPool queryPool,
                     uint32_t query)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   struct tu_cs *cs = cmd->state.pass ? &cmd->draw_epilogue_cs : &cmd->cs;

   /* WFI to get more accurate timestamp */
   tu_cs_emit_wfi(cs);
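
   /* Copy the 64-bit CP always-on counter, which serves as the timestamp
    * source, into the query's result slot. */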
   tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
   tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_CP_ALWAYS_ON_COUNTER_LO) |
                  CP_REG_TO_MEM_0_CNT(2) |
                  CP_REG_TO_MEM_0_64B);
   tu_cs_emit_qw(cs, query_result_iova(pool, query));

   /* Mark the timestamp query as available. */
   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, query_available_iova(pool, query));
   tu_cs_emit_qw(cs, 0x1);

   if (cmd->state.pass) {
      /* TODO: to have useful in-renderpass timestamps:
       * for sysmem path, we can just emit the timestamp in draw_cs,
       * for gmem renderpass, we do something with accumulate,
       * but I'm not sure that would follow the spec
       */
      tu_finishme("CmdWriteTimestamp in renderpass not accurate");
   }
}