/*
 * Copyright 2016 Red Hat Inc.
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */
#include "tu_private.h"

#include <assert.h>
#include <stdbool.h>
#include <string.h>

#include "registers/adreno_pm4.xml.h"
#include "registers/adreno_common.xml.h"
#include "registers/a6xx.xml.h"

#include "nir/nir_builder.h"
#include "util/os_time.h"

#include "tu_cs.h"
#define NSEC_PER_SEC 1000000000ull
#define WAIT_TIMEOUT 5
/* It seems like sample counts need to be copied over to 16-byte aligned
 * addresses, so pad each value out to 16 bytes.
 */
struct PACKED slot_value
{
   uint64_t value;
   uint64_t __padding;
};

struct PACKED occlusion_query_slot
{
   struct slot_value available; /* 0 when unavailable, 1 when available */
   struct slot_value begin;
   struct slot_value end;
   struct slot_value result;
};
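/* With 16-byte slot_values, an occlusion query slot occupies 64 bytes:
 * available at offset 0x00, begin at 0x10, end at 0x20, result at 0x30.
 */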
/* Returns the IOVA of a given uint64_t field in a given slot of a query
 * pool. */
#define query_iova(type, pool, query, field)                      \
   pool->bo.iova + pool->stride * query + offsetof(type, field) + \
      offsetof(struct slot_value, value)

#define occlusion_query_iova(pool, query, field) \
   query_iova(struct occlusion_query_slot, pool, query, field)

#define query_is_available(type, slot) \
   ((type*)slot)->available.value

#define occlusion_query_is_available(slot) \
   query_is_available(struct occlusion_query_slot, slot)
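/* For example, occlusion_query_iova(pool, 2, end) evaluates to
 * pool->bo.iova + 64 * 2 + 0x20, the GPU address of slot 2's end counter.
 */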
/*
 * Returns a pointer to a given slot in a query pool.
 */
static void *
slot_address(struct tu_query_pool *pool, uint32_t query)
{
   return (char *) pool->bo.map + query * pool->stride;
}
VkResult
tu_CreateQueryPool(VkDevice _device,
                   const VkQueryPoolCreateInfo *pCreateInfo,
                   const VkAllocationCallbacks *pAllocator,
                   VkQueryPool *pQueryPool)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
   assert(pCreateInfo->queryCount > 0);

   uint32_t slot_size;
   switch (pCreateInfo->queryType) {
   case VK_QUERY_TYPE_OCCLUSION:
      slot_size = sizeof(struct occlusion_query_slot);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TIMESTAMP:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   struct tu_query_pool *pool =
      vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
                VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (!pool)
      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

   VkResult result = tu_bo_init_new(device, &pool->bo,
                                    pCreateInfo->queryCount * slot_size);
   if (result != VK_SUCCESS) {
      vk_free2(&device->alloc, pAllocator, pool);
      return result;
   }

   result = tu_bo_map(device, &pool->bo);
   if (result != VK_SUCCESS) {
      tu_bo_finish(device, &pool->bo);
      vk_free2(&device->alloc, pAllocator, pool);
      return result;
   }

   /* Initialize all query statuses to unavailable */
   memset(pool->bo.map, 0, pool->bo.size);

   pool->type = pCreateInfo->queryType;
   pool->stride = slot_size;
   pool->size = pCreateInfo->queryCount;
   pool->pipeline_statistics = pCreateInfo->pipelineStatistics;
   *pQueryPool = tu_query_pool_to_handle(pool);

   return VK_SUCCESS;
}
void
tu_DestroyQueryPool(VkDevice _device,
                    VkQueryPool _pool,
                    const VkAllocationCallbacks *pAllocator)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   TU_FROM_HANDLE(tu_query_pool, pool, _pool);

   if (!pool)
      return;

   tu_bo_finish(device, &pool->bo);
   vk_free2(&device->alloc, pAllocator, pool);
}
/* Wait on the availability status of a query up until a timeout. */
static VkResult
wait_for_available(struct tu_device *device, struct tu_query_pool *pool,
                   uint32_t query)
{
   /* TODO: Use the MSM_IOVA_WAIT ioctl to wait on the available bit in a
    * scheduler friendly way instead of busy polling once the patch has
    * landed.
    */
   struct occlusion_query_slot *slot = slot_address(pool, query);
   uint64_t abs_timeout = os_time_get_absolute_timeout(
         WAIT_TIMEOUT * NSEC_PER_SEC);
   while (os_time_get_nano() < abs_timeout) {
      if (occlusion_query_is_available(slot))
         return VK_SUCCESS;
   }
   return vk_error(device->instance, VK_TIMEOUT);
}
static VkResult
get_occlusion_query_pool_results(struct tu_device *device,
                                 struct tu_query_pool *pool,
                                 uint32_t firstQuery,
                                 uint32_t queryCount,
                                 size_t dataSize,
                                 void *pData,
                                 VkDeviceSize stride,
                                 VkQueryResultFlags flags)
{
   assert(dataSize >= stride * queryCount);

   char *query_result = pData;
   VkResult result = VK_SUCCESS;
   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;
      struct occlusion_query_slot *slot = slot_address(pool, query);
      bool available = occlusion_query_is_available(slot);
      if ((flags & VK_QUERY_RESULT_WAIT_BIT) && !available) {
         VkResult wait_result = wait_for_available(device, pool, query);
         if (wait_result != VK_SUCCESS)
            return wait_result;
         available = true;
      } else if (!(flags & VK_QUERY_RESULT_PARTIAL_BIT) && !available) {
         /* From the Vulkan 1.1.130 spec:
          *
          *    If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
          *    both not set then no result values are written to pData for
          *    queries that are in the unavailable state at the time of the
          *    call, and vkGetQueryPoolResults returns VK_NOT_READY. However,
          *    availability state is still written to pData for those queries
          *    if VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set.
          */
         result = VK_NOT_READY;
         if (!(flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)) {
            query_result += stride;
            continue;
         }
      }

      uint64_t value = 0;
      if (available) {
         value = slot->result.value;
      } else if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         /* From the Vulkan 1.1.130 spec:
          *
          *    If VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set, the final
          *    integer value written for each query is non-zero if the query's
          *    status was available or zero if the status was unavailable.
          */
         value = 0;
      } else if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
         /* From the Vulkan 1.1.130 spec:
          *
          *    If VK_QUERY_RESULT_PARTIAL_BIT is set, VK_QUERY_RESULT_WAIT_BIT
          *    is not set, and the query's status is unavailable, an
          *    intermediate result value between zero and the final result
          *    value is written to pData for that query.
          *
          * Just return 0 here for simplicity since it's a valid result.
          */
         value = 0;
      }

      if (flags & VK_QUERY_RESULT_64_BIT) {
         *(uint64_t *) query_result = value;
      } else {
         *(uint32_t *) query_result = value;
      }
      query_result += stride;
   }
   return result;
}
VkResult
tu_GetQueryPoolResults(VkDevice _device,
                       VkQueryPool queryPool,
                       uint32_t firstQuery,
                       uint32_t queryCount,
                       size_t dataSize,
                       void *pData,
                       VkDeviceSize stride,
                       VkQueryResultFlags flags)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(firstQuery + queryCount <= pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      return get_occlusion_query_pool_results(device, pool, firstQuery,
                                              queryCount, dataSize, pData,
                                              stride, flags);
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TIMESTAMP:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }
   return VK_SUCCESS;
}
void
tu_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,
                           VkQueryPool queryPool,
                           uint32_t firstQuery,
                           uint32_t queryCount,
                           VkBuffer dstBuffer,
                           VkDeviceSize dstOffset,
                           VkDeviceSize stride,
                           VkQueryResultFlags flags)
{
}
static void
emit_reset_occlusion_query_pool(struct tu_cmd_buffer *cmdbuf,
                                struct tu_query_pool *pool,
                                uint32_t firstQuery,
                                uint32_t queryCount)
{
   struct tu_cs *cs = &cmdbuf->cs;

   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;
      uint64_t available_iova = occlusion_query_iova(pool, query, available);
      uint64_t result_iova = occlusion_query_iova(pool, query, result);
      tu_cs_reserve_space(cmdbuf->device, cs, 11);
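      /* Zero both the availability bit and the accumulated result so a
       * reused slot reads as unavailable and restarts counting from 0.
       */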
      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
      tu_cs_emit_qw(cs, available_iova);
      tu_cs_emit_qw(cs, 0x0);

      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
      tu_cs_emit_qw(cs, result_iova);
      tu_cs_emit_qw(cs, 0x0);
   }
}
void
tu_CmdResetQueryPool(VkCommandBuffer commandBuffer,
                     VkQueryPool queryPool,
                     uint32_t firstQuery,
                     uint32_t queryCount)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_reset_occlusion_query_pool(cmdbuf, pool, firstQuery, queryCount);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TIMESTAMP:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}
static void
emit_begin_occlusion_query(struct tu_cmd_buffer *cmdbuf,
                           struct tu_query_pool *pool,
                           uint32_t query)
{
   /* From the Vulkan 1.1.130 spec:
    *
    *    A query must begin and end inside the same subpass of a render pass
    *    instance, or must both begin and end outside of a render pass
    *    instance.
    *
    * Unlike on an immediate-mode renderer, Turnip renders all tiles on
    * vkCmdEndRenderPass, not individually on each vkCmdDraw*. As such, if a
    * query begins/ends inside the same subpass of a render pass, we need to
    * record the packets on the secondary draw command stream. cmdbuf->draw_cs
    * is then run on every tile during render, so we just need to accumulate
    * sample counts in slot->result to compute the query result.
    */
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;

   uint64_t begin_iova = occlusion_query_iova(pool, query, begin);

   tu_cs_reserve_space(cmdbuf->device, cs, 7);
   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_ADDR_LO(begin_iova));
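   /* ZPASS_DONE makes the HW copy the current sample counter to the address
    * programmed above, recording this tile's count at query begin.
    */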
   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, ZPASS_DONE);
}
void
tu_CmdBeginQuery(VkCommandBuffer commandBuffer,
                 VkQueryPool queryPool,
                 uint32_t query,
                 VkQueryControlFlags flags)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      /* In freedreno, there is no implementation difference between
       * GL_SAMPLES_PASSED and GL_ANY_SAMPLES_PASSED, so we can similarly
       * ignore the VK_QUERY_CONTROL_PRECISE_BIT flag here.
       */
      emit_begin_occlusion_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TIMESTAMP:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}
static void
emit_end_occlusion_query(struct tu_cmd_buffer *cmdbuf,
                         struct tu_query_pool *pool,
                         uint32_t query)
{
   /* Ending an occlusion query happens in a few steps:
    *    1) Set the slot->end to UINT64_MAX.
    *    2) Set up the SAMPLE_COUNT registers and trigger a CP_EVENT_WRITE to
    *       write the current sample count value into slot->end.
    *    3) Since (2) is asynchronous, wait until slot->end is not equal to
    *       UINT64_MAX before continuing via CP_WAIT_REG_MEM.
    *    4) Accumulate the results of the query (slot->end - slot->begin)
    *       into slot->result.
    *    5) If vkCmdEndQuery is *not* called from within the scope of a render
    *       pass, set the slot's available bit since the query is now done.
    *    6) If vkCmdEndQuery *is* called from within the scope of a render
    *       pass, we cannot mark as available yet since the commands in
    *       draw_cs are not run until vkCmdEndRenderPass.
    */
   const struct tu_render_pass *pass = cmdbuf->state.pass;
   struct tu_cs *cs = pass ? &cmdbuf->draw_cs : &cmdbuf->cs;

   uint64_t available_iova = occlusion_query_iova(pool, query, available);
   uint64_t begin_iova = occlusion_query_iova(pool, query, begin);
   uint64_t end_iova = occlusion_query_iova(pool, query, end);
   uint64_t result_iova = occlusion_query_iova(pool, query, result);

   tu_cs_reserve_space(cmdbuf->device, cs, 31);
   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit_qw(cs, 0xffffffffffffffffull);
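   /* slot->end now holds UINT64_MAX as a sentinel. Flush that write, then
    * have ZPASS_DONE copy the real sample counter over it (steps 2 and 3).
    */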
   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_ADDR_LO(end_iova));

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, ZPASS_DONE);
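   /* Poll slot->end until it no longer reads back the UINT64_MAX sentinel,
    * i.e. until the asynchronous ZPASS_DONE write has landed.
    */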
   tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
   tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_NE) |
                  CP_WAIT_REG_MEM_0_POLL_MEMORY);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0xffffffff));
   tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
   tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));
   /* result (dst) = result (srcA) + end (srcB) - begin (srcC) */
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
   tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C);
   tu_cs_emit_qw(cs, result_iova);
   tu_cs_emit_qw(cs, result_iova);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit_qw(cs, begin_iova);
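   /* DOUBLE selects 64-bit operands and NEG_C subtracts srcC, so this one
    * packet accumulates this tile's (end - begin) delta into the running
    * result. Flush it before anything else reads the slot.
    */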
   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   if (pass)
      /* Technically, queries should be tracked per-subpass, but here we
       * track them at the render pass level to simplify the code a bit.
       * This is safe because the only commands that use the available bit
       * are vkCmdCopyQueryPoolResults and vkCmdResetQueryPool, both of
       * which cannot be invoked from inside a render pass scope.
       */
      cs = &cmdbuf->draw_epilogue_cs;

   /* Set the availability bit */
   tu_cs_reserve_space(cmdbuf->device, cs, 5);
   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, available_iova);
   tu_cs_emit_qw(cs, 0x1);
}
void
tu_CmdEndQuery(VkCommandBuffer commandBuffer,
               VkQueryPool queryPool,
               uint32_t query)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_end_occlusion_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TIMESTAMP:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}
void
tu_CmdWriteTimestamp(VkCommandBuffer commandBuffer,
                     VkPipelineStageFlagBits pipelineStage,
                     VkQueryPool queryPool,
                     uint32_t query)
{
}