turnip: implement timestamp query
[mesa.git] / src / freedreno / vulkan / tu_query.c
/*
 * Copyright 2016 Red Hat Inc.
 * Based on anv:
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include "tu_private.h"

#include <assert.h>
#include <fcntl.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>

#include "registers/adreno_pm4.xml.h"
#include "registers/adreno_common.xml.h"
#include "registers/a6xx.xml.h"

#include "nir/nir_builder.h"
#include "util/os_time.h"

#include "tu_cs.h"

#define NSEC_PER_SEC 1000000000ull
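
/* How long vkGetQueryPoolResults() with VK_QUERY_RESULT_WAIT_BIT busy-waits
 * for a query to become available, in seconds (see wait_for_available()).
 */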
#define WAIT_TIMEOUT 5

/* It seems like sample counts need to be copied over to 16-byte aligned
 * memory. */
struct PACKED slot_value {
   uint64_t value;
   uint64_t __padding;
};

struct PACKED query_slot {
   struct slot_value available; /* 0 when unavailable, 1 when available */
   struct slot_value result;
};

struct PACKED occlusion_query_slot {
   struct query_slot common;
   struct slot_value begin;
   struct slot_value end;
};
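
/* Each query occupies one slot in the pool's backing BO: pool->stride is the
 * size of the slot struct for the pool's query type, so slot N starts at
 * bo.iova + N * pool->stride (see query_iova() and slot_address() below).
 */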

/* Returns the IOVA of a given uint64_t field in a given slot of a query
 * pool. */
#define query_iova(type, pool, query, field) \
   pool->bo.iova + pool->stride * query + offsetof(type, field) + \
   offsetof(struct slot_value, value)

#define occlusion_query_iova(pool, query, field) \
   query_iova(struct occlusion_query_slot, pool, query, field)

#define query_available_iova(pool, query) \
   query_iova(struct query_slot, pool, query, available)

#define query_result_iova(pool, query) \
   query_iova(struct query_slot, pool, query, result)
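/* For example, query_result_iova(pool, 2) expands to
 *    pool->bo.iova + pool->stride * 2 +
 *    offsetof(struct query_slot, result) + offsetof(struct slot_value, value)
 */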

#define query_is_available(slot) slot->available.value

/*
 * Returns a pointer to a given slot in a query pool.
 */
static void* slot_address(struct tu_query_pool *pool, uint32_t query)
{
   return (char*)pool->bo.map + query * pool->stride;
}

VkResult
tu_CreateQueryPool(VkDevice _device,
                   const VkQueryPoolCreateInfo *pCreateInfo,
                   const VkAllocationCallbacks *pAllocator,
                   VkQueryPool *pQueryPool)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
   assert(pCreateInfo->queryCount > 0);

   uint32_t slot_size;
   switch (pCreateInfo->queryType) {
   case VK_QUERY_TYPE_OCCLUSION:
      slot_size = sizeof(struct occlusion_query_slot);
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
      slot_size = sizeof(struct query_slot);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   struct tu_query_pool *pool =
         vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
                   VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);

   if (!pool)
      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

   VkResult result = tu_bo_init_new(device, &pool->bo,
         pCreateInfo->queryCount * slot_size);
   if (result != VK_SUCCESS) {
      vk_free2(&device->alloc, pAllocator, pool);
      return result;
   }

   result = tu_bo_map(device, &pool->bo);
   if (result != VK_SUCCESS) {
      tu_bo_finish(device, &pool->bo);
      vk_free2(&device->alloc, pAllocator, pool);
      return result;
   }

   /* Initialize all query statuses to unavailable */
   memset(pool->bo.map, 0, pool->bo.size);

   pool->type = pCreateInfo->queryType;
   pool->stride = slot_size;
   pool->size = pCreateInfo->queryCount;
   pool->pipeline_statistics = pCreateInfo->pipelineStatistics;
   *pQueryPool = tu_query_pool_to_handle(pool);

   return VK_SUCCESS;
}

void
tu_DestroyQueryPool(VkDevice _device,
                    VkQueryPool _pool,
                    const VkAllocationCallbacks *pAllocator)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   TU_FROM_HANDLE(tu_query_pool, pool, _pool);

   if (!pool)
      return;

   tu_bo_finish(device, &pool->bo);
   vk_free2(&device->alloc, pAllocator, pool);
}

/* Wait on the availability status of a query up until a timeout. */
static VkResult
wait_for_available(struct tu_device *device, struct tu_query_pool *pool,
                   uint32_t query)
{
   /* TODO: Use the MSM_IOVA_WAIT ioctl to wait on the available bit in a
    * scheduler friendly way instead of busy polling once the patch has landed
    * upstream. */
   struct query_slot *slot = slot_address(pool, query);
   uint64_t abs_timeout = os_time_get_absolute_timeout(
         WAIT_TIMEOUT * NSEC_PER_SEC);
   while (os_time_get_nano() < abs_timeout) {
      if (query_is_available(slot))
         return VK_SUCCESS;
   }
   return vk_error(device->instance, VK_TIMEOUT);
}

/* Writes a query value to a buffer from the CPU. */
static void
write_query_value_cpu(char* base,
                      uint32_t offset,
                      uint64_t value,
                      VkQueryResultFlags flags)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      *(uint64_t*)(base + (offset * sizeof(uint64_t))) = value;
   } else {
      *(uint32_t*)(base + (offset * sizeof(uint32_t))) = value;
   }
}

static VkResult
get_query_pool_results(struct tu_device *device,
                       struct tu_query_pool *pool,
                       uint32_t firstQuery,
                       uint32_t queryCount,
                       size_t dataSize,
                       void *pData,
                       VkDeviceSize stride,
                       VkQueryResultFlags flags)
{
   assert(dataSize >= stride * queryCount);

   char *result_base = pData;
   VkResult result = VK_SUCCESS;
   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;
      struct query_slot *slot = slot_address(pool, query);
      bool available = query_is_available(slot);
      if ((flags & VK_QUERY_RESULT_WAIT_BIT) && !available) {
         VkResult wait_result = wait_for_available(device, pool, query);
         if (wait_result != VK_SUCCESS)
            return wait_result;
         available = true;
      } else if (!(flags & VK_QUERY_RESULT_PARTIAL_BIT) && !available) {
         /* From the Vulkan 1.1.130 spec:
          *
          *    If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
          *    both not set then no result values are written to pData for
          *    queries that are in the unavailable state at the time of the
          *    call, and vkGetQueryPoolResults returns VK_NOT_READY. However,
          *    availability state is still written to pData for those queries
          *    if VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set.
          */
         result = VK_NOT_READY;
         if (!(flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)) {
            result_base += stride;
            continue;
         }
      }

      if (available)
         write_query_value_cpu(result_base, 0, slot->result.value, flags);
      else if (flags & VK_QUERY_RESULT_PARTIAL_BIT)
         /* From the Vulkan 1.1.130 spec:
          *
          *    If VK_QUERY_RESULT_PARTIAL_BIT is set, VK_QUERY_RESULT_WAIT_BIT
          *    is not set, and the query’s status is unavailable, an
          *    intermediate result value between zero and the final result
          *    value is written to pData for that query.
          *
          * Just return 0 here for simplicity since it's a valid result.
          */
         write_query_value_cpu(result_base, 0, 0, flags);

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
         /* From the Vulkan 1.1.130 spec:
          *
          *    If VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set, the final
          *    integer value written for each query is non-zero if the query’s
          *    status was available or zero if the status was unavailable.
          */
         write_query_value_cpu(result_base, 1, available, flags);

      result_base += stride;
   }
   return result;
}

VkResult
tu_GetQueryPoolResults(VkDevice _device,
                       VkQueryPool queryPool,
                       uint32_t firstQuery,
                       uint32_t queryCount,
                       size_t dataSize,
                       void *pData,
                       VkDeviceSize stride,
                       VkQueryResultFlags flags)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(firstQuery + queryCount <= pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
      return get_query_pool_results(device, pool, firstQuery, queryCount,
                                    dataSize, pData, stride, flags);
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }
   return VK_SUCCESS;
}

/* Copies a query value from one buffer to another from the GPU. */
static void
copy_query_value_gpu(struct tu_cmd_buffer *cmdbuf,
                     struct tu_cs *cs,
                     uint64_t src_iova,
                     uint64_t base_write_iova,
                     uint32_t offset,
                     VkQueryResultFlags flags)
{
   uint32_t element_size = flags & VK_QUERY_RESULT_64_BIT ?
         sizeof(uint64_t) : sizeof(uint32_t);
   uint64_t write_iova = base_write_iova + (offset * element_size);

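   /* CP_MEM_TO_MEM copies a single value from src_iova to write_iova on the
    * GPU; the DOUBLE bit selects a 64-bit copy instead of a 32-bit one.
    */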
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
   uint32_t mem_to_mem_flags = flags & VK_QUERY_RESULT_64_BIT ?
         CP_MEM_TO_MEM_0_DOUBLE : 0;
   tu_cs_emit(cs, mem_to_mem_flags);
   tu_cs_emit_qw(cs, write_iova);
   tu_cs_emit_qw(cs, src_iova);
}

static void
emit_copy_query_pool_results(struct tu_cmd_buffer *cmdbuf,
                             struct tu_cs *cs,
                             struct tu_query_pool *pool,
                             uint32_t firstQuery,
                             uint32_t queryCount,
                             struct tu_buffer *buffer,
                             VkDeviceSize dstOffset,
                             VkDeviceSize stride,
                             VkQueryResultFlags flags)
{
   /* From the Vulkan 1.1.130 spec:
    *
    *    vkCmdCopyQueryPoolResults is guaranteed to see the effect of previous
    *    uses of vkCmdResetQueryPool in the same queue, without any additional
    *    synchronization.
    *
    * To ensure that previous writes to the available bit are coherent, first
    * wait for all writes to complete.
    */
   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;
      uint64_t available_iova = query_available_iova(pool, query);
      uint64_t result_iova = query_result_iova(pool, query);
      uint64_t buffer_iova = tu_buffer_iova(buffer) + dstOffset + i * stride;

      /* Wait for the available bit to be set if executed with the
       * VK_QUERY_RESULT_WAIT_BIT flag. */
      if (flags & VK_QUERY_RESULT_WAIT_BIT) {
         tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
         tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
                        CP_WAIT_REG_MEM_0_POLL_MEMORY);
         tu_cs_emit_qw(cs, available_iova);
         tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0x1));
         tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
         tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));
      }

      if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
         /* Unconditionally copying the bo->result into the buffer here is
          * valid because we only set bo->result on vkCmdEndQuery. Thus, even
          * if the query is unavailable, this will copy the correct partial
          * value of 0.
          */
         copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
                              0 /* offset */, flags);
      } else {
         /* Conditionally copy bo->result into the buffer based on whether the
          * query is available.
          *
          * NOTE: For the conditional packets to be executed, CP_COND_EXEC
          * tests that ADDR0 != 0 and ADDR1 < REF. The packet here simply tests
          * that 0 < available < 2, aka available == 1.
          */
         tu_cs_reserve(cs, 7 + 6);
         tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6);
         tu_cs_emit_qw(cs, available_iova);
         tu_cs_emit_qw(cs, available_iova);
         tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2));
         tu_cs_emit(cs, 6); /* Cond execute the next 6 DWORDS */

         /* Start of conditional execution */
         copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
                              0 /* offset */, flags);
         /* End of conditional execution */
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         copy_query_value_gpu(cmdbuf, cs, available_iova, buffer_iova,
                              1 /* offset */, flags);
      }
   }

   tu_bo_list_add(&cmdbuf->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
}

void
tu_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,
                           VkQueryPool queryPool,
                           uint32_t firstQuery,
                           uint32_t queryCount,
                           VkBuffer dstBuffer,
                           VkDeviceSize dstOffset,
                           VkDeviceSize stride,
                           VkQueryResultFlags flags)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
   struct tu_cs *cs = &cmdbuf->cs;
   assert(firstQuery + queryCount <= pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
      return emit_copy_query_pool_results(cmdbuf, cs, pool, firstQuery,
                  queryCount, buffer, dstOffset, stride, flags);
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }
}

static void
emit_reset_query_pool(struct tu_cmd_buffer *cmdbuf,
                      struct tu_query_pool *pool,
                      uint32_t firstQuery,
                      uint32_t queryCount)
{
   struct tu_cs *cs = &cmdbuf->cs;

   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;

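      /* Zero both the availability bit and the accumulated result for this
       * slot from the command stream, so the reset executes in queue order
       * on the GPU.
       */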
      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
      tu_cs_emit_qw(cs, query_available_iova(pool, query));
      tu_cs_emit_qw(cs, 0x0);

      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
      tu_cs_emit_qw(cs, query_result_iova(pool, query));
      tu_cs_emit_qw(cs, 0x0);
   }
}

void
tu_CmdResetQueryPool(VkCommandBuffer commandBuffer,
                     VkQueryPool queryPool,
                     uint32_t firstQuery,
                     uint32_t queryCount)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);

   switch (pool->type) {
   case VK_QUERY_TYPE_TIMESTAMP:
   case VK_QUERY_TYPE_OCCLUSION:
      emit_reset_query_pool(cmdbuf, pool, firstQuery, queryCount);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}

static void
emit_begin_occlusion_query(struct tu_cmd_buffer *cmdbuf,
                           struct tu_query_pool *pool,
                           uint32_t query)
{
   /* From the Vulkan 1.1.130 spec:
    *
    *    A query must begin and end inside the same subpass of a render pass
    *    instance, or must both begin and end outside of a render pass
    *    instance.
    *
    * Unlike on an immediate-mode renderer, Turnip renders all tiles on
    * vkCmdEndRenderPass, not individually on each vkCmdDraw*. As such, if a
    * query begins/ends inside the same subpass of a render pass, we need to
    * record the packets on the secondary draw command stream. cmdbuf->draw_cs
    * is then run on every tile during render, so we just need to accumulate
    * sample counts in slot->result to compute the query result.
    */
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;

   uint64_t begin_iova = occlusion_query_iova(pool, query, begin);

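   /* Point RB_SAMPLE_COUNT_ADDR at slot->begin and trigger a ZPASS_DONE event
    * so the hardware writes the current sample count there; the matching
    * snapshot into slot->end happens in emit_end_occlusion_query().
    */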
   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_ADDR_LO(begin_iova));

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, ZPASS_DONE);
}

void
tu_CmdBeginQuery(VkCommandBuffer commandBuffer,
                 VkQueryPool queryPool,
                 uint32_t query,
                 VkQueryControlFlags flags)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      /* In freedreno, there is no implementation difference between
       * GL_SAMPLES_PASSED and GL_ANY_SAMPLES_PASSED, so we can similarly
       * ignore the VK_QUERY_CONTROL_PRECISE_BIT flag here.
       */
      emit_begin_occlusion_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TIMESTAMP:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}

static void
emit_end_occlusion_query(struct tu_cmd_buffer *cmdbuf,
                         struct tu_query_pool *pool,
                         uint32_t query)
{
   /* Ending an occlusion query happens in a few steps:
    * 1) Set the slot->end to UINT64_MAX.
    * 2) Set up the SAMPLE_COUNT registers and trigger a CP_EVENT_WRITE to
    *    write the current sample count value into slot->end.
    * 3) Since (2) is asynchronous, wait until slot->end is not equal to
    *    UINT64_MAX before continuing via CP_WAIT_REG_MEM.
    * 4) Accumulate the results of the query (slot->end - slot->begin) into
    *    slot->result.
    * 5) If vkCmdEndQuery is *not* called from within the scope of a render
    *    pass, set the slot's available bit since the query is now done.
    * 6) If vkCmdEndQuery *is* called from within the scope of a render
    *    pass, we cannot mark as available yet since the commands in
    *    draw_cs are not run until vkCmdEndRenderPass.
    */
   const struct tu_render_pass *pass = cmdbuf->state.pass;
   struct tu_cs *cs = pass ? &cmdbuf->draw_cs : &cmdbuf->cs;

   uint64_t available_iova = query_available_iova(pool, query);
   uint64_t begin_iova = occlusion_query_iova(pool, query, begin);
   uint64_t end_iova = occlusion_query_iova(pool, query, end);
   uint64_t result_iova = query_result_iova(pool, query);
   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit_qw(cs, 0xffffffffffffffffull);

   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_ADDR_LO(end_iova));

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, ZPASS_DONE);

   tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
   tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_NE) |
                  CP_WAIT_REG_MEM_0_POLL_MEMORY);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0xffffffff));
   tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
   tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));

   /* result (dst) = result (srcA) + end (srcB) - begin (srcC) */
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
   tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C);
   tu_cs_emit_qw(cs, result_iova);
   tu_cs_emit_qw(cs, result_iova);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit_qw(cs, begin_iova);

   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   if (pass)
      /* Technically, queries should be tracked per-subpass, but here we track
       * at the render pass level to simplify the code a bit. This is safe
       * because the only commands that use the available bit are
       * vkCmdCopyQueryPoolResults and vkCmdResetQueryPool, both of which
       * cannot be invoked from inside a render pass scope.
       */
      cs = &cmdbuf->draw_epilogue_cs;

   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, available_iova);
   tu_cs_emit_qw(cs, 0x1);
}

void
tu_CmdEndQuery(VkCommandBuffer commandBuffer,
               VkQueryPool queryPool,
               uint32_t query)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_end_occlusion_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TIMESTAMP:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}

void
tu_CmdWriteTimestamp(VkCommandBuffer commandBuffer,
                     VkPipelineStageFlagBits pipelineStage,
                     VkQueryPool queryPool,
                     uint32_t query)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   struct tu_cs *cs = cmd->state.pass ? &cmd->draw_epilogue_cs : &cmd->cs;

   /* WFI to get more accurate timestamp */
   tu_cs_emit_wfi(cs);

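   /* Snapshot the free-running always-on counter into slot->result: CNT(2)
    * copies the LO/HI register pair starting at CP_ALWAYS_ON_COUNTER_LO,
    * i.e. a single 64-bit timestamp value.
    */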
   tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
   tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_CP_ALWAYS_ON_COUNTER_LO) |
                  CP_REG_TO_MEM_0_CNT(2) |
                  CP_REG_TO_MEM_0_64B);
   tu_cs_emit_qw(cs, query_result_iova(pool, query));

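   /* Mark the query available now that the timestamp has been written. */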
   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, query_available_iova(pool, query));
   tu_cs_emit_qw(cs, 0x1);

   if (cmd->state.pass) {
      /* TODO: to have useful in-renderpass timestamps:
       * for the sysmem path, we can just emit the timestamp in draw_cs,
       * for the gmem path, we would need to do something with accumulation,
       * but I'm not sure that would follow the spec
       */
      tu_finishme("CmdWriteTimestamp in renderpass not accurate");
   }
}