/*
 * Copyright 2016 Red Hat Inc.
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */
#include "tu_private.h"

#include "registers/adreno_pm4.xml.h"
#include "registers/adreno_common.xml.h"
#include "registers/a6xx.xml.h"

#include "nir/nir_builder.h"
#include "util/os_time.h"
#define NSEC_PER_SEC 1000000000ull
#define WAIT_TIMEOUT 5
/* Depending on the query type, there might be 2 integer values.
 * eg. VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT
 * values[0] : primitives written, values[1] : primitives generated
 */
struct PACKED slot_value {
   uint64_t values[2];
};
struct PACKED query_slot {
   struct slot_value available; /* 0 when unavailable, 1 when available */
   struct slot_value result;
};
struct PACKED occlusion_query_slot {
   struct query_slot common;
   struct slot_value begin;
   struct slot_value end;
};
/* The result of transform feedback queries is two integer values:
 *   common.result.values[0] is the count of primitives written,
 *   common.result.values[1] is the count of primitives generated.
 * In addition, a begin/end counter pair is stored for each of the 4 streams.
 */
struct PACKED primitive_query_slot {
   struct query_slot common;
   struct slot_value begin[4];
   struct slot_value end[4];
};
/* Returns the IOVA of a given uint64_t field in a given slot of a query
 * pool. */
#define query_iova(type, pool, query, field, value_index)                 \
   pool->bo.iova + pool->stride * query + offsetof(type, field) +         \
   offsetof(struct slot_value, values[value_index])
#define occlusion_query_iova(pool, query, field)                          \
   query_iova(struct occlusion_query_slot, pool, query, field, 0)

#define primitive_query_iova(pool, query, field, i)                       \
   query_iova(struct primitive_query_slot, pool, query, field, i)

#define query_available_iova(pool, query)                                 \
   query_iova(struct query_slot, pool, query, available, 0)

#define query_result_iova(pool, query, i)                                 \
   query_iova(struct query_slot, pool, query, result, i)
#define query_is_available(slot) slot->available.values[0]
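
/* As a concrete example of the IOVA math above, query_result_iova(pool, 3, 1)
 * expands to
 *    pool->bo.iova + pool->stride * 3
 *                  + offsetof(struct query_slot, result)
 *                  + offsetof(struct slot_value, values[1])
 * i.e. the GPU address of the second result value of the fourth slot.
 */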
/*
 * Returns a pointer to a given slot in a query pool.
 */
static void* slot_address(struct tu_query_pool *pool, uint32_t query)
{
   return (char*)pool->bo.map + query * pool->stride;
}
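
/* Creates a query pool backed by a single GPU buffer object holding one slot
 * per query. The BO is mapped up front so that vkGetQueryPoolResults can read
 * availability and results directly from the CPU. */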
VkResult
tu_CreateQueryPool(VkDevice _device,
                   const VkQueryPoolCreateInfo *pCreateInfo,
                   const VkAllocationCallbacks *pAllocator,
                   VkQueryPool *pQueryPool)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
   assert(pCreateInfo->queryCount > 0);

   uint32_t slot_size;
   switch (pCreateInfo->queryType) {
   case VK_QUERY_TYPE_OCCLUSION:
      slot_size = sizeof(struct occlusion_query_slot);
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
      slot_size = sizeof(struct query_slot);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      slot_size = sizeof(struct primitive_query_slot);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   struct tu_query_pool *pool =
      vk_object_alloc(&device->vk, pAllocator, sizeof(*pool),
                      VK_OBJECT_TYPE_QUERY_POOL);
   if (!pool)
      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

   VkResult result = tu_bo_init_new(device, &pool->bo,
                                    pCreateInfo->queryCount * slot_size);
   if (result != VK_SUCCESS) {
      vk_object_free(&device->vk, pAllocator, pool);
      return result;
   }

   result = tu_bo_map(device, &pool->bo);
   if (result != VK_SUCCESS) {
      tu_bo_finish(device, &pool->bo);
      vk_object_free(&device->vk, pAllocator, pool);
      return result;
   }

   /* Initialize all query statuses to unavailable */
   memset(pool->bo.map, 0, pool->bo.size);

   pool->type = pCreateInfo->queryType;
   pool->stride = slot_size;
   pool->size = pCreateInfo->queryCount;
   pool->pipeline_statistics = pCreateInfo->pipelineStatistics;
   *pQueryPool = tu_query_pool_to_handle(pool);

   return VK_SUCCESS;
}
void
tu_DestroyQueryPool(VkDevice _device,
                    VkQueryPool _pool,
                    const VkAllocationCallbacks *pAllocator)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   TU_FROM_HANDLE(tu_query_pool, pool, _pool);

   if (!pool)
      return;

   tu_bo_finish(device, &pool->bo);
   vk_object_free(&device->vk, pAllocator, pool);
}
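
/* Returns how many integer values a query of the pool's type writes, not
 * counting the final availability value that
 * VK_QUERY_RESULT_WITH_AVAILABILITY_BIT may append. */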
static uint32_t
get_result_count(struct tu_query_pool *pool)
{
   switch (pool->type) {
   /* Occlusion and timestamp queries write one integer value */
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
      return 1;
   /* Transform feedback queries write two integer values */
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      return 2;
   default:
      assert(!"Invalid query type");
      return 0;
   }
}
/* Wait on the availability status of a query up until a timeout. */
static VkResult
wait_for_available(struct tu_device *device, struct tu_query_pool *pool,
                   uint32_t query)
{
   /* TODO: Use the MSM_IOVA_WAIT ioctl to wait on the available bit in a
    * scheduler friendly way instead of busy polling once the patch has landed
    * upstream. */
   struct query_slot *slot = slot_address(pool, query);
   uint64_t abs_timeout = os_time_get_absolute_timeout(
         WAIT_TIMEOUT * NSEC_PER_SEC);
   while (os_time_get_nano() < abs_timeout) {
      if (query_is_available(slot))
         return VK_SUCCESS;
   }
   return vk_error(device->instance, VK_TIMEOUT);
}
/* Writes a query value to a buffer from the CPU. */
static void
write_query_value_cpu(char* base,
                      uint32_t offset,
                      uint64_t value,
                      VkQueryResultFlags flags)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      *(uint64_t*)(base + (offset * sizeof(uint64_t))) = value;
   } else {
      *(uint32_t*)(base + (offset * sizeof(uint32_t))) = value;
   }
}
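
/* CPU path of vkGetQueryPoolResults: for each query, resolve availability
 * (optionally waiting for it), then write the result values and, if
 * requested, the availability value into the caller's buffer. */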
static VkResult
get_query_pool_results(struct tu_device *device,
                       struct tu_query_pool *pool,
                       uint32_t firstQuery,
                       uint32_t queryCount,
                       size_t dataSize,
                       void *pData,
                       VkDeviceSize stride,
                       VkQueryResultFlags flags)
{
   assert(dataSize >= stride * queryCount);

   char *result_base = pData;
   VkResult result = VK_SUCCESS;
   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;
      struct query_slot *slot = slot_address(pool, query);
      bool available = query_is_available(slot);
      uint32_t result_count = get_result_count(pool);

      if ((flags & VK_QUERY_RESULT_WAIT_BIT) && !available) {
         VkResult wait_result = wait_for_available(device, pool, query);
         if (wait_result != VK_SUCCESS)
            return wait_result;
         available = true;
      } else if (!(flags & VK_QUERY_RESULT_PARTIAL_BIT) && !available) {
         /* From the Vulkan 1.1.130 spec:
          *
          *    If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
          *    both not set then no result values are written to pData for
          *    queries that are in the unavailable state at the time of the
          *    call, and vkGetQueryPoolResults returns VK_NOT_READY. However,
          *    availability state is still written to pData for those queries
          *    if VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set.
          */
         result = VK_NOT_READY;
         if (!(flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)) {
            result_base += stride;
            continue;
         }
      }

      for (uint32_t k = 0; k < result_count; k++) {
         if (available)
            write_query_value_cpu(result_base, k, slot->result.values[k], flags);
         else if (flags & VK_QUERY_RESULT_PARTIAL_BIT)
            /* From the Vulkan 1.1.130 spec:
             *
             *    If VK_QUERY_RESULT_PARTIAL_BIT is set, VK_QUERY_RESULT_WAIT_BIT
             *    is not set, and the query’s status is unavailable, an
             *    intermediate result value between zero and the final result
             *    value is written to pData for that query.
             *
             * Just return 0 here for simplicity since it's a valid result.
             */
            write_query_value_cpu(result_base, k, 0, flags);
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
         /* From the Vulkan 1.1.130 spec:
          *
          *    If VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set, the final
          *    integer value written for each query is non-zero if the query’s
          *    status was available or zero if the status was unavailable.
          */
         write_query_value_cpu(result_base, result_count, available, flags);

      result_base += stride;
   }
   return result;
}
VkResult
tu_GetQueryPoolResults(VkDevice _device,
                       VkQueryPool queryPool,
                       uint32_t firstQuery,
                       uint32_t queryCount,
                       size_t dataSize,
                       void *pData,
                       VkDeviceSize stride,
                       VkQueryResultFlags flags)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(firstQuery + queryCount <= pool->size);

   if (tu_device_is_lost(device))
      return VK_ERROR_DEVICE_LOST;

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      return get_query_pool_results(device, pool, firstQuery, queryCount,
                                    dataSize, pData, stride, flags);
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }
   return VK_SUCCESS;
}
/* Copies a query value from one buffer to another from the GPU. */
static void
copy_query_value_gpu(struct tu_cmd_buffer *cmdbuf,
                     struct tu_cs *cs,
                     uint64_t src_iova,
                     uint64_t base_write_iova,
                     uint32_t offset,
                     VkQueryResultFlags flags) {
   uint32_t element_size = flags & VK_QUERY_RESULT_64_BIT ?
         sizeof(uint64_t) : sizeof(uint32_t);
   uint64_t write_iova = base_write_iova + (offset * element_size);
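
   /* The CP_MEM_TO_MEM emitted below carries 5 payload dwords: a flags word
    * (DOUBLE selects 64-bit element copies), then the 64-bit destination IOVA
    * followed by the 64-bit source IOVA. */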
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
   uint32_t mem_to_mem_flags = flags & VK_QUERY_RESULT_64_BIT ?
         CP_MEM_TO_MEM_0_DOUBLE : 0;
   tu_cs_emit(cs, mem_to_mem_flags);
   tu_cs_emit_qw(cs, write_iova);
   tu_cs_emit_qw(cs, src_iova);
}
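
/* GPU-side counterpart of get_query_pool_results(): emits command stream
 * packets that copy query results from the pool BO into a user buffer,
 * honoring the WAIT/PARTIAL/WITH_AVAILABILITY result flags. */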
static void
emit_copy_query_pool_results(struct tu_cmd_buffer *cmdbuf,
                             struct tu_cs *cs,
                             struct tu_query_pool *pool,
                             uint32_t firstQuery,
                             uint32_t queryCount,
                             struct tu_buffer *buffer,
                             VkDeviceSize dstOffset,
                             VkDeviceSize stride,
                             VkQueryResultFlags flags)
{
   /* From the Vulkan 1.1.130 spec:
    *
    *    vkCmdCopyQueryPoolResults is guaranteed to see the effect of previous
    *    uses of vkCmdResetQueryPool in the same queue, without any additional
    *    synchronization.
    *
    * To ensure that previous writes to the available bit are coherent, first
    * wait for all writes to complete.
    */
   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;
      uint64_t available_iova = query_available_iova(pool, query);
      uint64_t buffer_iova = tu_buffer_iova(buffer) + dstOffset + i * stride;
      uint32_t result_count = get_result_count(pool);

      /* Wait for the available bit to be set if executed with the
       * VK_QUERY_RESULT_WAIT_BIT flag. */
      if (flags & VK_QUERY_RESULT_WAIT_BIT) {
         tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
         tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
                        CP_WAIT_REG_MEM_0_POLL_MEMORY);
         tu_cs_emit_qw(cs, available_iova);
         tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0x1));
         tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
         tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));
      }

      for (uint32_t k = 0; k < result_count; k++) {
         uint64_t result_iova = query_result_iova(pool, query, k);

         if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
            /* Unconditionally copying the bo->result into the buffer here is
             * valid because we only set bo->result on vkCmdEndQuery. Thus, even
             * if the query is unavailable, this will copy the correct partial
             * value.
             */
            copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
                                 k /* offset */, flags);
         } else {
            /* Conditionally copy bo->result into the buffer based on whether the
             * query is available.
             *
             * NOTE: For the conditional packets to be executed, CP_COND_EXEC
             * tests that ADDR0 != 0 and ADDR1 < REF. The packet here simply tests
             * that 0 < available < 2, aka available == 1.
             */
            tu_cs_reserve(cs, 7 + 6);
            tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6);
            tu_cs_emit_qw(cs, available_iova);
            tu_cs_emit_qw(cs, available_iova);
            tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2));
            tu_cs_emit(cs, 6); /* Cond execute the next 6 DWORDS */

            /* Start of conditional execution */
            copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
                                 k /* offset */, flags);
            /* End of conditional execution */
         }
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         copy_query_value_gpu(cmdbuf, cs, available_iova, buffer_iova,
                              result_count /* offset */, flags);
      }
   }

   tu_bo_list_add(&cmdbuf->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
}
void
tu_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,
                           VkQueryPool queryPool,
                           uint32_t firstQuery,
                           uint32_t queryCount,
                           VkBuffer dstBuffer,
                           VkDeviceSize dstOffset,
                           VkDeviceSize stride,
                           VkQueryResultFlags flags)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
   struct tu_cs *cs = &cmdbuf->cs;
   assert(firstQuery + queryCount <= pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      return emit_copy_query_pool_results(cmdbuf, cs, pool, firstQuery,
                                          queryCount, buffer, dstOffset,
                                          stride, flags);
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }
}
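
/* Resets queries by zeroing the availability word and both result values of
 * each slot with CP_MEM_WRITE packets on the GPU. */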
static void
emit_reset_query_pool(struct tu_cmd_buffer *cmdbuf,
                      struct tu_query_pool *pool,
                      uint32_t firstQuery,
                      uint32_t queryCount)
{
   struct tu_cs *cs = &cmdbuf->cs;

   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;

      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
      tu_cs_emit_qw(cs, query_available_iova(pool, query));
      tu_cs_emit_qw(cs, 0x0);

      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
      tu_cs_emit_qw(cs, query_result_iova(pool, query, 0));
      tu_cs_emit_qw(cs, 0x0);
      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
      tu_cs_emit_qw(cs, query_result_iova(pool, query, 1));
      tu_cs_emit_qw(cs, 0x0);
   }
}
void
tu_CmdResetQueryPool(VkCommandBuffer commandBuffer,
                     VkQueryPool queryPool,
                     uint32_t firstQuery,
                     uint32_t queryCount)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);

   switch (pool->type) {
   case VK_QUERY_TYPE_TIMESTAMP:
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      emit_reset_query_pool(cmdbuf, pool, firstQuery, queryCount);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}
static void
emit_begin_occlusion_query(struct tu_cmd_buffer *cmdbuf,
                           struct tu_query_pool *pool,
                           uint32_t query)
{
   /* From the Vulkan 1.1.130 spec:
    *
    *    A query must begin and end inside the same subpass of a render pass
    *    instance, or must both begin and end outside of a render pass
    *    instance.
    *
    * Unlike on an immediate-mode renderer, Turnip renders all tiles on
    * vkCmdEndRenderPass, not individually on each vkCmdDraw*. As such, if a
    * query begins/ends inside the same subpass of a render pass, we need to
    * record the packets on the secondary draw command stream. cmdbuf->draw_cs
    * is then run on every tile during render, so we just need to accumulate
    * sample counts in slot->result to compute the query result.
    */
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;

   uint64_t begin_iova = occlusion_query_iova(pool, query, begin);

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_ADDR_LO(begin_iova));

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, ZPASS_DONE);
}
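
/* Begins a transform feedback query: VPC_SO_STREAM_COUNTS_LO points the
 * hardware at the slot's begin counters, and the WRITE_PRIMITIVE_COUNTS event
 * then snapshots the written/generated primitive counts there (the begin[4]
 * array leaves room for all four streams). */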
static void
emit_begin_xfb_query(struct tu_cmd_buffer *cmdbuf,
                     struct tu_query_pool *pool,
                     uint32_t query,
                     uint32_t stream_id)
{
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
   uint64_t begin_iova = primitive_query_iova(pool, query, begin[0], 0);

   tu_cs_emit_regs(cs, A6XX_VPC_SO_STREAM_COUNTS_LO(begin_iova));
   tu6_emit_event_write(cmdbuf, cs, WRITE_PRIMITIVE_COUNTS);
}
void
tu_CmdBeginQuery(VkCommandBuffer commandBuffer,
                 VkQueryPool queryPool,
                 uint32_t query,
                 VkQueryControlFlags flags)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      /* In freedreno, there is no implementation difference between
       * GL_SAMPLES_PASSED and GL_ANY_SAMPLES_PASSED, so we can similarly
       * ignore the VK_QUERY_CONTROL_PRECISE_BIT flag here.
       */
      emit_begin_occlusion_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      emit_begin_xfb_query(cmdbuf, pool, query, 0);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TIMESTAMP:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}
void
tu_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer,
                           VkQueryPool queryPool,
                           uint32_t query,
                           VkQueryControlFlags flags,
                           uint32_t index)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      emit_begin_xfb_query(cmdbuf, pool, query, index);
      break;
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}
static void
emit_end_occlusion_query(struct tu_cmd_buffer *cmdbuf,
                         struct tu_query_pool *pool,
                         uint32_t query)
{
   /* Ending an occlusion query happens in a few steps:
    *    1) Set the slot->end to UINT64_MAX.
    *    2) Set up the SAMPLE_COUNT registers and trigger a CP_EVENT_WRITE to
    *       write the current sample count value into slot->end.
    *    3) Since (2) is asynchronous, wait until slot->end is not equal to
    *       UINT64_MAX before continuing via CP_WAIT_REG_MEM.
    *    4) Accumulate the results of the query (slot->end - slot->begin) into
    *       slot->result.
    *    5) If vkCmdEndQuery is *not* called from within the scope of a render
    *       pass, set the slot's available bit since the query is now done.
    *    6) If vkCmdEndQuery *is* called from within the scope of a render
    *       pass, we cannot mark as available yet since the commands in
    *       draw_cs are not run until vkCmdEndRenderPass.
    */
   const struct tu_render_pass *pass = cmdbuf->state.pass;
   struct tu_cs *cs = pass ? &cmdbuf->draw_cs : &cmdbuf->cs;

   uint64_t available_iova = query_available_iova(pool, query);
   uint64_t begin_iova = occlusion_query_iova(pool, query, begin);
   uint64_t end_iova = occlusion_query_iova(pool, query, end);
   uint64_t result_iova = query_result_iova(pool, query, 0);
   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit_qw(cs, 0xffffffffffffffffull);

   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_ADDR_LO(end_iova));

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, ZPASS_DONE);

   tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
   tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_NE) |
                  CP_WAIT_REG_MEM_0_POLL_MEMORY);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0xffffffff));
   tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
   tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));

   /* result (dst) = result (srcA) + end (srcB) - begin (srcC) */
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
   tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C);
   tu_cs_emit_qw(cs, result_iova);
   tu_cs_emit_qw(cs, result_iova);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit_qw(cs, begin_iova);
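
   /* Presumably needed so that the accumulated result has actually landed in
    * memory before the availability write below (or a later
    * vkCmdCopyQueryPoolResults) can observe it. */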
   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   if (pass)
      /* Technically, queries should be tracked per-subpass, but here we track
       * at the render pass level to simplify the code a bit. This is safe
       * because the only commands that use the available bit are
       * vkCmdCopyQueryPoolResults and vkCmdResetQueryPool, both of which
       * cannot be invoked from inside a render pass scope.
       */
      cs = &cmdbuf->draw_epilogue_cs;

   /* Set the availability to 1 */
   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, available_iova);
   tu_cs_emit_qw(cs, 0x1);
}
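
/* Ends a transform feedback query: snapshot the end primitive counts, then
 * accumulate (end - begin) for the requested stream into the written and
 * generated result values with CP_MEM_TO_MEM, and finally mark the slot
 * available. */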
static void
emit_end_xfb_query(struct tu_cmd_buffer *cmdbuf,
                   struct tu_query_pool *pool,
                   uint32_t query,
                   uint32_t stream_id)
{
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;

   uint64_t end_iova = primitive_query_iova(pool, query, end[0], 0);
   uint64_t result_written_iova = query_result_iova(pool, query, 0);
   uint64_t result_generated_iova = query_result_iova(pool, query, 1);
   uint64_t begin_written_iova = primitive_query_iova(pool, query, begin[stream_id], 0);
   uint64_t begin_generated_iova = primitive_query_iova(pool, query, begin[stream_id], 1);
   uint64_t end_written_iova = primitive_query_iova(pool, query, end[stream_id], 0);
   uint64_t end_generated_iova = primitive_query_iova(pool, query, end[stream_id], 1);
   uint64_t available_iova = query_available_iova(pool, query);

   tu_cs_emit_regs(cs, A6XX_VPC_SO_STREAM_COUNTS_LO(end_iova));
   tu6_emit_event_write(cmdbuf, cs, WRITE_PRIMITIVE_COUNTS);

   tu_cs_emit_wfi(cs);
   tu6_emit_event_write(cmdbuf, cs, CACHE_FLUSH_TS);

   /* Set the count of written primitives */
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
   tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C |
                  CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES | 0x80000000);
   tu_cs_emit_qw(cs, result_written_iova);
   tu_cs_emit_qw(cs, result_written_iova);
   tu_cs_emit_qw(cs, end_written_iova);
   tu_cs_emit_qw(cs, begin_written_iova);

   tu6_emit_event_write(cmdbuf, cs, CACHE_FLUSH_TS);

   /* Set the count of generated primitives */
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
   tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C |
                  CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES | 0x80000000);
   tu_cs_emit_qw(cs, result_generated_iova);
   tu_cs_emit_qw(cs, result_generated_iova);
   tu_cs_emit_qw(cs, end_generated_iova);
   tu_cs_emit_qw(cs, begin_generated_iova);

   /* Set the availability to 1 */
   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, available_iova);
   tu_cs_emit_qw(cs, 0x1);
}
void
tu_CmdEndQuery(VkCommandBuffer commandBuffer,
               VkQueryPool queryPool,
               uint32_t query)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_end_occlusion_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      emit_end_xfb_query(cmdbuf, pool, query, 0);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TIMESTAMP:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}
void
tu_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer,
                         VkQueryPool queryPool,
                         uint32_t query,
                         uint32_t index)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      emit_end_xfb_query(cmdbuf, pool, query, index);
      break;
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}
void
tu_CmdWriteTimestamp(VkCommandBuffer commandBuffer,
                     VkPipelineStageFlagBits pipelineStage,
                     VkQueryPool queryPool,
                     uint32_t query)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   struct tu_cs *cs = cmd->state.pass ? &cmd->draw_epilogue_cs : &cmd->cs;

   /* WFI to get more accurate timestamp */
   tu_cs_emit_wfi(cs);

   tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
   tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_CP_ALWAYS_ON_COUNTER_LO) |
                  CP_REG_TO_MEM_0_CNT(2) |
                  CP_REG_TO_MEM_0_64B);
   tu_cs_emit_qw(cs, query_result_iova(pool, query, 0));

   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, query_available_iova(pool, query));
   tu_cs_emit_qw(cs, 0x1);

   if (cmd->state.pass) {
      /* TODO: to have useful in-renderpass timestamps:
       * for sysmem path, we can just emit the timestamp in draw_cs,
       * for gmem renderpass, we do something with accumulate,
       * but I'm not sure that would follow the spec
       */
      tu_finishme("CmdWriteTimestamp in renderpass not accurate");
   }
}