/*
 * Copyright © 2016 Red Hat Inc.
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */
#include "tu_private.h"

#include <assert.h>
#include <stdbool.h>
#include <string.h>

#include "adreno_pm4.xml.h"
#include "adreno_common.xml.h"

#include "nir/nir_builder.h"
#include "util/os_time.h"
#define NSEC_PER_SEC 1000000000ull
#define WAIT_TIMEOUT 5
#define STAT_COUNT ((REG_A6XX_RBBM_PRIMCTR_10_LO - REG_A6XX_RBBM_PRIMCTR_0_LO) / 2 + 1)
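
/* STAT_COUNT is the number of 64-bit RBBM_PRIMCTR counters: the registers
 * run from RBBM_PRIMCTR_0_LO to RBBM_PRIMCTR_10_LO as LO/HI dword pairs, so
 * dividing the register offset span by two and adding one gives the count.
 */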
struct PACKED query_slot {
   uint64_t available; /* 0 when unavailable, 1 when available */
};

struct PACKED occlusion_slot_value {
   /* Seems sample counters are placed to be 16-byte aligned
    * even though this query needs an 8-byte slot. */
   uint64_t value;
   uint64_t _padding;
};

struct PACKED occlusion_query_slot {
   struct query_slot common;
   uint64_t result;

   struct occlusion_slot_value begin;
   struct occlusion_slot_value end;
};
struct PACKED timestamp_query_slot {
   struct query_slot common;
   uint64_t result;
};
struct PACKED primitive_slot_value {
   uint64_t values[2];
};
struct PACKED pipeline_stat_query_slot {
   struct query_slot common;
   uint64_t results[STAT_COUNT];

   uint64_t begin[STAT_COUNT];
   uint64_t end[STAT_COUNT];
};
struct PACKED primitive_query_slot {
   struct query_slot common;
   /* The result of transform feedback queries is two integer values:
    *   results[0] is the count of primitives written,
    *   results[1] is the count of primitives generated.
    * Also a result for each of the 4 streams is stored in the respective
    * begin/end slot.
    */
   uint64_t results[2];

   /* Primitive counters also need to be 16-byte aligned. */
   uint64_t _padding;

   struct primitive_slot_value begin[4];
   struct primitive_slot_value end[4];
};
/* Returns the IOVA of a given uint64_t field in a given slot of a query
 * pool. */
#define query_iova(type, pool, query, field)                      \
   pool->bo.iova + pool->stride * (query) + offsetof(type, field)

#define occlusion_query_iova(pool, query, field)                  \
   query_iova(struct occlusion_query_slot, pool, query, field)

#define pipeline_stat_query_iova(pool, query, field)              \
   pool->bo.iova + pool->stride * (query) +                       \
   offsetof(struct pipeline_stat_query_slot, field)

#define primitive_query_iova(pool, query, field, i)               \
   query_iova(struct primitive_query_slot, pool, query, field) +  \
   offsetof(struct primitive_slot_value, values[i])

#define query_available_iova(pool, query)                         \
   query_iova(struct query_slot, pool, query, available)

#define query_result_iova(pool, query, i)                         \
   pool->bo.iova + pool->stride * (query) +                       \
   sizeof(struct query_slot) + sizeof(uint64_t) * (i)

#define query_result_addr(pool, query, i)                         \
   pool->bo.map + pool->stride * (query) +                        \
   sizeof(struct query_slot) + sizeof(uint64_t) * (i)

#define query_is_available(slot) slot->available
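
/* Note: query_result_iova/query_result_addr rely on every slot type laying
 * out its result value(s) as uint64_t's immediately after the common
 * query_slot header, which matches the struct definitions above. */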
/*
 * Returns a pointer to a given slot in a query pool.
 */
static void* slot_address(struct tu_query_pool *pool, uint32_t query)
{
   return (char*)pool->bo.map + query * pool->stride;
}
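
/* A query pool is backed by a single BO holding one fixed-size slot per
 * query; slot N lives at offset N * pool->stride. */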
VkResult
tu_CreateQueryPool(VkDevice _device,
                   const VkQueryPoolCreateInfo *pCreateInfo,
                   const VkAllocationCallbacks *pAllocator,
                   VkQueryPool *pQueryPool)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
   assert(pCreateInfo->queryCount > 0);

   uint32_t slot_size;
   switch (pCreateInfo->queryType) {
   case VK_QUERY_TYPE_OCCLUSION:
      slot_size = sizeof(struct occlusion_query_slot);
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
      slot_size = sizeof(struct timestamp_query_slot);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      slot_size = sizeof(struct primitive_query_slot);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      slot_size = sizeof(struct pipeline_stat_query_slot);
      break;
   default:
      assert(!"Invalid query type");
   }

   struct tu_query_pool *pool =
      vk_object_alloc(&device->vk, pAllocator, sizeof(*pool),
                      VK_OBJECT_TYPE_QUERY_POOL);
   if (!pool)
      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

   VkResult result = tu_bo_init_new(device, &pool->bo,
         pCreateInfo->queryCount * slot_size);
   if (result != VK_SUCCESS) {
      vk_object_free(&device->vk, pAllocator, pool);
      return result;
   }

   result = tu_bo_map(device, &pool->bo);
   if (result != VK_SUCCESS) {
      tu_bo_finish(device, &pool->bo);
      vk_object_free(&device->vk, pAllocator, pool);
      return result;
   }

   /* Initialize all query statuses to unavailable */
   memset(pool->bo.map, 0, pool->bo.size);

   pool->type = pCreateInfo->queryType;
   pool->stride = slot_size;
   pool->size = pCreateInfo->queryCount;
   pool->pipeline_statistics = pCreateInfo->pipelineStatistics;
   *pQueryPool = tu_query_pool_to_handle(pool);

   return VK_SUCCESS;
}
void
tu_DestroyQueryPool(VkDevice _device,
                    VkQueryPool _pool,
                    const VkAllocationCallbacks *pAllocator)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   TU_FROM_HANDLE(tu_query_pool, pool, _pool);

   if (!pool)
      return;

   tu_bo_finish(device, &pool->bo);
   vk_object_free(&device->vk, pAllocator, pool);
}
static uint32_t
get_result_count(struct tu_query_pool *pool)
{
   switch (pool->type) {
   /* Occlusion and timestamp queries write one integer value */
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
      return 1;
   /* Transform feedback queries write two integer values */
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      return 2;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      return util_bitcount(pool->pipeline_statistics);
   default:
      assert(!"Invalid query type");
      return 0;
   }
}
static int
statistics_index(uint32_t *statistics)
{
   uint32_t stat;
   stat = u_bit_scan(statistics);

   switch (1 << stat) {
   case VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT:
   case VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT:
      return 0;
   case VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT:
      return 1;
   case VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT:
      return 2;
   case VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT:
      return 4;
   case VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT:
      return 5;
   case VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT:
      return 6;
   case VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT:
      return 7;
   case VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT:
      return 8;
   case VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT:
      return 9;
   case VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT:
      return 10;
   default:
      assert(!"Invalid pipeline statistic");
   }

   return 0;
}
/* Wait on the availability status of a query up until a timeout. */
static VkResult
wait_for_available(struct tu_device *device, struct tu_query_pool *pool,
                   uint32_t query)
{
   /* TODO: Use the MSM_IOVA_WAIT ioctl to wait on the available bit in a
    * scheduler friendly way instead of busy polling once the patch has landed
    * upstream. */
   struct query_slot *slot = slot_address(pool, query);
   uint64_t abs_timeout = os_time_get_absolute_timeout(
         WAIT_TIMEOUT * NSEC_PER_SEC);
   while(os_time_get_nano() < abs_timeout) {
      if (query_is_available(slot))
         return VK_SUCCESS;
   }
   return vk_error(device->instance, VK_TIMEOUT);
}
/* Writes a query value to a buffer from the CPU. */
static void
write_query_value_cpu(char* base,
                      uint32_t offset,
                      uint64_t value,
                      VkQueryResultFlags flags)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      *(uint64_t*)(base + (offset * sizeof(uint64_t))) = value;
   } else {
      *(uint32_t*)(base + (offset * sizeof(uint32_t))) = value;
   }
}
static VkResult
get_query_pool_results(struct tu_device *device,
                       struct tu_query_pool *pool,
                       uint32_t firstQuery,
                       uint32_t queryCount,
                       size_t dataSize,
                       void *pData,
                       VkDeviceSize stride,
                       VkQueryResultFlags flags)
{
   assert(dataSize >= stride * queryCount);

   char *result_base = pData;
   VkResult result = VK_SUCCESS;
   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;
      struct query_slot *slot = slot_address(pool, query);
      bool available = query_is_available(slot);
      uint32_t result_count = get_result_count(pool);
      uint32_t statistics = pool->pipeline_statistics;

      if ((flags & VK_QUERY_RESULT_WAIT_BIT) && !available) {
         VkResult wait_result = wait_for_available(device, pool, query);
         if (wait_result != VK_SUCCESS)
            return wait_result;
         available = true;
      } else if (!(flags & VK_QUERY_RESULT_PARTIAL_BIT) && !available) {
         /* From the Vulkan 1.1.130 spec:
          *
          *    If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
          *    both not set then no result values are written to pData for
          *    queries that are in the unavailable state at the time of the
          *    call, and vkGetQueryPoolResults returns VK_NOT_READY. However,
          *    availability state is still written to pData for those queries
          *    if VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set.
          */
         result = VK_NOT_READY;
         if (!(flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)) {
            result_base += stride;
            continue;
         }
      }

      for (uint32_t k = 0; k < result_count; k++) {
         if (available) {
            uint64_t *result;

            if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
               uint32_t stat_idx = statistics_index(&statistics);
               result = query_result_addr(pool, query, stat_idx);
            } else {
               result = query_result_addr(pool, query, k);
            }

            write_query_value_cpu(result_base, k, *result, flags);
         } else if (flags & VK_QUERY_RESULT_PARTIAL_BIT)
            /* From the Vulkan 1.1.130 spec:
             *
             *    If VK_QUERY_RESULT_PARTIAL_BIT is set, VK_QUERY_RESULT_WAIT_BIT
             *    is not set, and the query's status is unavailable, an
             *    intermediate result value between zero and the final result
             *    value is written to pData for that query.
             *
             * Just return 0 here for simplicity since it's a valid result.
             */
            write_query_value_cpu(result_base, k, 0, flags);
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
         /* From the Vulkan 1.1.130 spec:
          *
          *    If VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set, the final
          *    integer value written for each query is non-zero if the query's
          *    status was available or zero if the status was unavailable.
          */
         write_query_value_cpu(result_base, result_count, available, flags);

      result_base += stride;
   }
   return result;
}
VkResult
tu_GetQueryPoolResults(VkDevice _device,
                       VkQueryPool queryPool,
                       uint32_t firstQuery,
                       uint32_t queryCount,
                       size_t dataSize,
                       void *pData,
                       VkDeviceSize stride,
                       VkQueryResultFlags flags)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(firstQuery + queryCount <= pool->size);

   if (tu_device_is_lost(device))
      return VK_ERROR_DEVICE_LOST;

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      return get_query_pool_results(device, pool, firstQuery, queryCount,
                                    dataSize, pData, stride, flags);
   default:
      assert(!"Invalid query type");
   }
   return VK_SUCCESS;
}
/* Copies a query value from one buffer to another from the GPU. */
static void
copy_query_value_gpu(struct tu_cmd_buffer *cmdbuf,
                     struct tu_cs *cs,
                     uint64_t src_iova,
                     uint64_t base_write_iova,
                     uint32_t offset,
                     VkQueryResultFlags flags) {
   uint32_t element_size = flags & VK_QUERY_RESULT_64_BIT ?
         sizeof(uint64_t) : sizeof(uint32_t);
   uint64_t write_iova = base_write_iova + (offset * element_size);

   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
   uint32_t mem_to_mem_flags = flags & VK_QUERY_RESULT_64_BIT ?
         CP_MEM_TO_MEM_0_DOUBLE : 0;
   tu_cs_emit(cs, mem_to_mem_flags);
   tu_cs_emit_qw(cs, write_iova);
   tu_cs_emit_qw(cs, src_iova);
}
static void
emit_copy_query_pool_results(struct tu_cmd_buffer *cmdbuf,
                             struct tu_cs *cs,
                             struct tu_query_pool *pool,
                             uint32_t firstQuery,
                             uint32_t queryCount,
                             struct tu_buffer *buffer,
                             VkDeviceSize dstOffset,
                             VkDeviceSize stride,
                             VkQueryResultFlags flags)
{
   /* From the Vulkan 1.1.130 spec:
    *
    *    vkCmdCopyQueryPoolResults is guaranteed to see the effect of previous
    *    uses of vkCmdResetQueryPool in the same queue, without any additional
    *    synchronization.
    *
    * To ensure that previous writes to the available bit are coherent, first
    * wait for all writes to complete.
    */
   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;
      uint64_t available_iova = query_available_iova(pool, query);
      uint64_t buffer_iova = tu_buffer_iova(buffer) + dstOffset + i * stride;
      uint32_t result_count = get_result_count(pool);
      uint32_t statistics = pool->pipeline_statistics;

      /* Wait for the available bit to be set if executed with the
       * VK_QUERY_RESULT_WAIT_BIT flag. */
      if (flags & VK_QUERY_RESULT_WAIT_BIT) {
         tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
         tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
                        CP_WAIT_REG_MEM_0_POLL_MEMORY);
         tu_cs_emit_qw(cs, available_iova);
         tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0x1));
         tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
         tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));
      }

      for (uint32_t k = 0; k < result_count; k++) {
         uint64_t result_iova;

         if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
            uint32_t stat_idx = statistics_index(&statistics);
            result_iova = query_result_iova(pool, query, stat_idx);
         } else {
            result_iova = query_result_iova(pool, query, k);
         }

         if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
            /* Unconditionally copying the bo->result into the buffer here is
             * valid because we only set bo->result on vkCmdEndQuery. Thus, even
             * if the query is unavailable, this will copy the correct partial
             * value of 0.
             */
            copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
                                 k /* offset */, flags);
         } else {
            /* Conditionally copy bo->result into the buffer based on whether the
             * query is available.
             *
             * NOTE: For the conditional packets to be executed, CP_COND_EXEC
             * tests that ADDR0 != 0 and ADDR1 < REF. The packet here simply tests
             * that 0 < available < 2, aka available == 1.
             */
            tu_cs_reserve(cs, 7 + 6);
            tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6);
            tu_cs_emit_qw(cs, available_iova);
            tu_cs_emit_qw(cs, available_iova);
            tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2));
            tu_cs_emit(cs, 6); /* Cond execute the next 6 DWORDS */

            /* Start of conditional execution */
            copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
                                 k /* offset */, flags);
            /* End of conditional execution */
         }
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         copy_query_value_gpu(cmdbuf, cs, available_iova, buffer_iova,
                              result_count /* offset */, flags);
      }
   }

   tu_bo_list_add(&cmdbuf->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
}
void
tu_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,
                           VkQueryPool queryPool,
                           uint32_t firstQuery,
                           uint32_t queryCount,
                           VkBuffer dstBuffer,
                           VkDeviceSize dstOffset,
                           VkDeviceSize stride,
                           VkQueryResultFlags flags)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
   struct tu_cs *cs = &cmdbuf->cs;
   assert(firstQuery + queryCount <= pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      return emit_copy_query_pool_results(cmdbuf, cs, pool, firstQuery,
                                          queryCount, buffer, dstOffset,
                                          stride, flags);
   default:
      assert(!"Invalid query type");
   }
}
static void
emit_reset_query_pool(struct tu_cmd_buffer *cmdbuf,
                      struct tu_query_pool *pool,
                      uint32_t firstQuery,
                      uint32_t queryCount)
{
   struct tu_cs *cs = &cmdbuf->cs;

   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;
      uint32_t statistics = pool->pipeline_statistics;

      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
      tu_cs_emit_qw(cs, query_available_iova(pool, query));
      tu_cs_emit_qw(cs, 0x0);

      for (uint32_t k = 0; k < get_result_count(pool); k++) {
         uint64_t result_iova;

         if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
            uint32_t stat_idx = statistics_index(&statistics);
            result_iova = query_result_iova(pool, query, stat_idx);
         } else {
            result_iova = query_result_iova(pool, query, k);
         }

         tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
         tu_cs_emit_qw(cs, result_iova);
         tu_cs_emit_qw(cs, 0x0);
      }
   }
}
void
tu_CmdResetQueryPool(VkCommandBuffer commandBuffer,
                     VkQueryPool queryPool,
                     uint32_t firstQuery,
                     uint32_t queryCount)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);

   switch (pool->type) {
   case VK_QUERY_TYPE_TIMESTAMP:
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      emit_reset_query_pool(cmdbuf, pool, firstQuery, queryCount);
      break;
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}
static void
emit_begin_occlusion_query(struct tu_cmd_buffer *cmdbuf,
                           struct tu_query_pool *pool,
                           uint32_t query)
{
   /* From the Vulkan 1.1.130 spec:
    *
    *    A query must begin and end inside the same subpass of a render pass
    *    instance, or must both begin and end outside of a render pass
    *    instance.
    *
    * Unlike on an immediate-mode renderer, Turnip renders all tiles on
    * vkCmdEndRenderPass, not individually on each vkCmdDraw*. As such, if a
    * query begins/ends inside the same subpass of a render pass, we need to
    * record the packets on the secondary draw command stream. cmdbuf->draw_cs
    * is then run on every tile during render, so we just need to accumulate
    * sample counts in slot->result to compute the query result.
    */
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;

   uint64_t begin_iova = occlusion_query_iova(pool, query, begin);

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_ADDR_LO(begin_iova));

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, ZPASS_DONE);
}
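
/* Begin a pipeline statistics query: kick the hardware primitive counters,
 * then snapshot all STAT_COUNT RBBM_PRIMCTR registers into the slot's begin
 * array with a single CP_REG_TO_MEM. */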
static void
emit_begin_stat_query(struct tu_cmd_buffer *cmdbuf,
                      struct tu_query_pool *pool,
                      uint32_t query)
{
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
   uint64_t begin_iova = pipeline_stat_query_iova(pool, query, begin);

   tu6_emit_event_write(cmdbuf, cs, START_PRIMITIVE_CTRS);
   tu6_emit_event_write(cmdbuf, cs, RST_PIX_CNT);
   tu6_emit_event_write(cmdbuf, cs, TILE_FLUSH);

   tu_cs_emit_wfi(cs);

   tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
   tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_RBBM_PRIMCTR_0_LO) |
                  CP_REG_TO_MEM_0_CNT(STAT_COUNT * 2) |
                  CP_REG_TO_MEM_0_64B);
   tu_cs_emit_qw(cs, begin_iova);
}
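
/* Begin a transform feedback query. WRITE_PRIMITIVE_COUNTS dumps the
 * written/generated counters for all four streams at once starting at the
 * programmed address, which is why only begin[0] is programmed here and
 * stream_id is not needed on the begin path. */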
static void
emit_begin_xfb_query(struct tu_cmd_buffer *cmdbuf,
                     struct tu_query_pool *pool,
                     uint32_t query,
                     uint32_t stream_id)
{
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
   uint64_t begin_iova = primitive_query_iova(pool, query, begin[0], 0);

   tu_cs_emit_regs(cs, A6XX_VPC_SO_STREAM_COUNTS_LO(begin_iova));
   tu6_emit_event_write(cmdbuf, cs, WRITE_PRIMITIVE_COUNTS);
}
void
tu_CmdBeginQuery(VkCommandBuffer commandBuffer,
                 VkQueryPool queryPool,
                 uint32_t query,
                 VkQueryControlFlags flags)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      /* In freedreno, there is no implementation difference between
       * GL_SAMPLES_PASSED and GL_ANY_SAMPLES_PASSED, so we can similarly
       * ignore the VK_QUERY_CONTROL_PRECISE_BIT flag here.
       */
      emit_begin_occlusion_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      emit_begin_xfb_query(cmdbuf, pool, query, 0);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      emit_begin_stat_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}
void
tu_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer,
                           VkQueryPool queryPool,
                           uint32_t query,
                           VkQueryControlFlags flags,
                           uint32_t index)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      emit_begin_xfb_query(cmdbuf, pool, query, index);
      break;
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}
static void
emit_end_occlusion_query(struct tu_cmd_buffer *cmdbuf,
                         struct tu_query_pool *pool,
                         uint32_t query)
{
   /* Ending an occlusion query happens in a few steps:
    *    1) Set the slot->end to UINT64_MAX.
    *    2) Set up the SAMPLE_COUNT registers and trigger a CP_EVENT_WRITE to
    *       write the current sample count value into slot->end.
    *    3) Since (2) is asynchronous, wait until slot->end is not equal to
    *       UINT64_MAX before continuing via CP_WAIT_REG_MEM.
    *    4) Accumulate the results of the query (slot->end - slot->begin) into
    *       slot->result.
    *    5) If vkCmdEndQuery is *not* called from within the scope of a render
    *       pass, set the slot's available bit since the query is now done.
    *    6) If vkCmdEndQuery *is* called from within the scope of a render
    *       pass, we cannot mark as available yet since the commands in
    *       draw_cs are not run until vkCmdEndRenderPass.
    */
   const struct tu_render_pass *pass = cmdbuf->state.pass;
   struct tu_cs *cs = pass ? &cmdbuf->draw_cs : &cmdbuf->cs;

   uint64_t available_iova = query_available_iova(pool, query);
   uint64_t begin_iova = occlusion_query_iova(pool, query, begin);
   uint64_t end_iova = occlusion_query_iova(pool, query, end);
   uint64_t result_iova = query_result_iova(pool, query, 0);
   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit_qw(cs, 0xffffffffffffffffull);

   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_ADDR_LO(end_iova));

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, ZPASS_DONE);

   tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
   tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_NE) |
                  CP_WAIT_REG_MEM_0_POLL_MEMORY);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0xffffffff));
   tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
   tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));

   /* result (dst) = result (srcA) + end (srcB) - begin (srcC) */
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
   tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C);
   tu_cs_emit_qw(cs, result_iova);
   tu_cs_emit_qw(cs, result_iova);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit_qw(cs, begin_iova);

   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   if (pass)
      /* Technically, queries should be tracked per-subpass, but here we track
       * at the render pass level to simplify the code a bit. This is safe
       * because the only commands that use the available bit are
       * vkCmdCopyQueryPoolResults and vkCmdResetQueryPool, both of which
       * cannot be invoked from inside a render pass scope.
       */
      cs = &cmdbuf->draw_epilogue_cs;

   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, available_iova);
   tu_cs_emit_qw(cs, 0x1);
}
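
/* End a pipeline statistics query: snapshot the counters again into the
 * slot's end array, then accumulate end[i] - begin[i] into results[i] with
 * one CP_MEM_TO_MEM per counter. */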
static void
emit_end_stat_query(struct tu_cmd_buffer *cmdbuf,
                    struct tu_query_pool *pool,
                    uint32_t query)
{
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
   uint64_t end_iova = pipeline_stat_query_iova(pool, query, end);
   uint64_t available_iova = query_available_iova(pool, query);
   uint64_t result_iova;
   uint64_t stat_start_iova;
   uint64_t stat_stop_iova;

   tu6_emit_event_write(cmdbuf, cs, STOP_PRIMITIVE_CTRS);
   tu6_emit_event_write(cmdbuf, cs, RST_VTX_CNT);
   tu6_emit_event_write(cmdbuf, cs, STAT_EVENT);

   tu_cs_emit_wfi(cs);

   tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
   tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_RBBM_PRIMCTR_0_LO) |
                  CP_REG_TO_MEM_0_CNT(STAT_COUNT * 2) |
                  CP_REG_TO_MEM_0_64B);
   tu_cs_emit_qw(cs, end_iova);

   for (int i = 0; i < STAT_COUNT; i++) {
      result_iova = query_result_iova(pool, query, i);
      stat_start_iova = pipeline_stat_query_iova(pool, query, begin[i]);
      stat_stop_iova = pipeline_stat_query_iova(pool, query, end[i]);

      tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
      tu_cs_emit(cs, CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES |
                     CP_MEM_TO_MEM_0_DOUBLE |
                     CP_MEM_TO_MEM_0_NEG_C);

      tu_cs_emit_qw(cs, result_iova);
      tu_cs_emit_qw(cs, result_iova);
      tu_cs_emit_qw(cs, stat_stop_iova);
      tu_cs_emit_qw(cs, stat_start_iova);
   }

   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   if (cmdbuf->state.pass)
      cs = &cmdbuf->draw_epilogue_cs;

   /* Set the availability to 1 */
   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, available_iova);
   tu_cs_emit_qw(cs, 0x1);
}
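
/* End a transform feedback query: dump the stream counters once more and
 * compute primitives written/generated for the requested stream as the
 * difference between the end and begin snapshots. */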
static void
emit_end_xfb_query(struct tu_cmd_buffer *cmdbuf,
                   struct tu_query_pool *pool,
                   uint32_t query,
                   uint32_t stream_id)
{
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;

   uint64_t end_iova = primitive_query_iova(pool, query, end[0], 0);
   uint64_t result_written_iova = query_result_iova(pool, query, 0);
   uint64_t result_generated_iova = query_result_iova(pool, query, 1);
   uint64_t begin_written_iova = primitive_query_iova(pool, query, begin[stream_id], 0);
   uint64_t begin_generated_iova = primitive_query_iova(pool, query, begin[stream_id], 1);
   uint64_t end_written_iova = primitive_query_iova(pool, query, end[stream_id], 0);
   uint64_t end_generated_iova = primitive_query_iova(pool, query, end[stream_id], 1);
   uint64_t available_iova = query_available_iova(pool, query);

   tu_cs_emit_regs(cs, A6XX_VPC_SO_STREAM_COUNTS_LO(end_iova));
   tu6_emit_event_write(cmdbuf, cs, WRITE_PRIMITIVE_COUNTS);

   tu_cs_emit_wfi(cs);
   tu6_emit_event_write(cmdbuf, cs, CACHE_FLUSH_TS);

   /* Set the count of written primitives */
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
   tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C |
                  CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES | 0x80000000);
   tu_cs_emit_qw(cs, result_written_iova);
   tu_cs_emit_qw(cs, result_written_iova);
   tu_cs_emit_qw(cs, end_written_iova);
   tu_cs_emit_qw(cs, begin_written_iova);

   tu6_emit_event_write(cmdbuf, cs, CACHE_FLUSH_TS);

   /* Set the count of generated primitives */
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
   tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C |
                  CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES | 0x80000000);
   tu_cs_emit_qw(cs, result_generated_iova);
   tu_cs_emit_qw(cs, result_generated_iova);
   tu_cs_emit_qw(cs, end_generated_iova);
   tu_cs_emit_qw(cs, begin_generated_iova);

   /* Set the availability to 1 */
   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, available_iova);
   tu_cs_emit_qw(cs, 0x1);
}
/* Implement this bit of spec text from section 17.2 "Query Operation":
 *
 *     If queries are used while executing a render pass instance that has
 *     multiview enabled, the query uses N consecutive query indices in the
 *     query pool (starting at query) where N is the number of bits set in the
 *     view mask in the subpass the query is used in. How the numerical
 *     results of the query are distributed among the queries is
 *     implementation-dependent. For example, some implementations may write
 *     each view's results to a distinct query, while other implementations
 *     may write the total result to the first query and write zero to the
 *     other queries. However, the sum of the results in all the queries must
 *     accurately reflect the total result of the query summed over all views.
 *     Applications can sum the results from all the queries to compute the
 *     total result.
 *
 * Since we execute all views at once, we write zero to the other queries.
 * Furthermore, because queries must be reset before use, and we set the
 * result to 0 in vkCmdResetQueryPool(), we just need to mark it as available.
 */
static void
handle_multiview_queries(struct tu_cmd_buffer *cmd,
                         struct tu_query_pool *pool,
                         uint32_t query)
{
   if (!cmd->state.pass || !cmd->state.subpass->multiview_mask)
      return;

   unsigned views = util_bitcount(cmd->state.subpass->multiview_mask);
   struct tu_cs *cs = &cmd->draw_epilogue_cs;

   for (uint32_t i = 1; i < views; i++) {
      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
      tu_cs_emit_qw(cs, query_available_iova(pool, query + i));
      tu_cs_emit_qw(cs, 0x1);
   }
}
void
tu_CmdEndQuery(VkCommandBuffer commandBuffer,
               VkQueryPool queryPool,
               uint32_t query)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_end_occlusion_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      emit_end_xfb_query(cmdbuf, pool, query, 0);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      emit_end_stat_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   handle_multiview_queries(cmdbuf, pool, query);

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}
void
tu_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer,
                         VkQueryPool queryPool,
                         uint32_t query,
                         uint32_t index)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      emit_end_xfb_query(cmdbuf, pool, query, index);
      break;
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}
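
/* Timestamp queries have no begin/end pair; the result is written directly
 * from the CP always-on counter with CP_REG_TO_MEM. */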
void
tu_CmdWriteTimestamp(VkCommandBuffer commandBuffer,
                     VkPipelineStageFlagBits pipelineStage,
                     VkQueryPool queryPool,
                     uint32_t query)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);

   tu_bo_list_add(&cmd->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);

   /* Inside a render pass, just write the timestamp multiple times so that
    * the user gets the last one if we use GMEM. There isn't really much
    * better we can do, and this seems to be what the blob does too.
    */
   struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs;

   /* Stages that will already have been executed by the time the CP executes
    * the REG_TO_MEM. DrawIndirect parameters are read by the CP, so the draw
    * indirect stage counts as top-of-pipe too.
    */
   VkPipelineStageFlags top_of_pipe_flags =
      VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT |
      VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT;

   if (pipelineStage & ~top_of_pipe_flags) {
      /* Execute a WFI so that all commands complete. Note that CP_REG_TO_MEM
       * does CP_WAIT_FOR_ME internally, which will wait for the WFI to
       * complete.
       *
       * Stalling the CP like this is really unfortunate, but I don't think
       * there's a better solution that allows all 48 bits of precision
       * because CP_EVENT_WRITE doesn't support 64-bit timestamps.
       */
      tu_cs_emit_wfi(cs);
   }

   tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
   tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_CP_ALWAYS_ON_COUNTER_LO) |
                  CP_REG_TO_MEM_0_CNT(2) |
                  CP_REG_TO_MEM_0_64B);
   tu_cs_emit_qw(cs, query_result_iova(pool, query, 0));

   /* Only flag availability once the entire renderpass is done, similar to
    * the begin/end path.
    */
   cs = cmd->state.pass ? &cmd->draw_epilogue_cs : &cmd->cs;

   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, query_available_iova(pool, query));
   tu_cs_emit_qw(cs, 0x1);

   /* From the spec for vkCmdWriteTimestamp:
    *
    *    If vkCmdWriteTimestamp is called while executing a render pass
    *    instance that has multiview enabled, the timestamp uses N consecutive
    *    query indices in the query pool (starting at query) where N is the
    *    number of bits set in the view mask of the subpass the command is
    *    executed in. The resulting query values are determined by an
    *    implementation-dependent choice of one of the following behaviors:
    *
    *    -   The first query is a timestamp value and (if more than one bit is
    *        set in the view mask) zero is written to the remaining queries.
    *        If two timestamps are written in the same subpass, the sum of the
    *        execution time of all views between those commands is the
    *        difference between the first query written by each command.
    *
    *    -   All N queries are timestamp values. If two timestamps are written
    *        in the same subpass, the sum of the execution time of all views
    *        between those commands is the sum of the difference between
    *        corresponding queries written by each command. The difference
    *        between corresponding queries may be the execution time of a
    *        single view.
    *
    * We execute all views in the same draw call, so we implement the first
    * option, the same as regular queries.
    */
   handle_multiview_queries(cmd, pool, query);
}