turnip: Fix vkCmdCopyQueryPoolResults with available flag
[mesa.git] / src / freedreno / vulkan / tu_query.c
/*
 * Copyright 2016 Red Hat Inc.
 * Based on anv:
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include "tu_private.h"

#include <assert.h>
#include <fcntl.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>

#include "registers/adreno_pm4.xml.h"
#include "registers/adreno_common.xml.h"
#include "registers/a6xx.xml.h"

#include "nir/nir_builder.h"
#include "util/os_time.h"

#include "tu_cs.h"

#define NSEC_PER_SEC 1000000000ull
#define WAIT_TIMEOUT 5
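/* WAIT_TIMEOUT is in seconds; wait_for_available() below turns it into an
 * absolute deadline in nanoseconds using NSEC_PER_SEC. */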

/* It seems like sample counts need to be copied over to 16-byte aligned
 * memory. */
struct PACKED slot_value {
   uint64_t value;
   uint64_t __padding;
};

struct PACKED occlusion_query_slot {
   struct slot_value available; /* 0 when unavailable, 1 when available */
   struct slot_value begin;
   struct slot_value end;
   struct slot_value result;
};
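/* begin/end hold the HW sample counter captured at vkCmdBeginQuery and
 * vkCmdEndQuery, and result accumulates (end - begin); this is how the query
 * value is built up across tiles (see emit_begin/end_occlusion_query). */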

/* Returns the IOVA of a given uint64_t field in a given slot of a query
 * pool. */
#define query_iova(type, pool, query, field)                      \
   pool->bo.iova + pool->stride * query + offsetof(type, field) + \
   offsetof(struct slot_value, value)

#define occlusion_query_iova(pool, query, field) \
   query_iova(struct occlusion_query_slot, pool, query, field)
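/* For example, the address of the "end" counter of occlusion query N is:
 *    pool->bo.iova + N * sizeof(struct occlusion_query_slot)
 *                  + offsetof(struct occlusion_query_slot, end)
 * (the final offsetof() term is 0, since "value" is the first field of
 * struct slot_value). */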

#define query_is_available(type, slot) \
   ((type*)slot)->available.value

#define occlusion_query_is_available(slot) \
   query_is_available(struct occlusion_query_slot, slot)

/*
 * Returns a pointer to a given slot in a query pool.
 */
static void* slot_address(struct tu_query_pool *pool, uint32_t query)
{
   return (char*)pool->bo.map + query * pool->stride;
}

VkResult
tu_CreateQueryPool(VkDevice _device,
                   const VkQueryPoolCreateInfo *pCreateInfo,
                   const VkAllocationCallbacks *pAllocator,
                   VkQueryPool *pQueryPool)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
   assert(pCreateInfo->queryCount > 0);

   uint32_t slot_size;
   switch (pCreateInfo->queryType) {
   case VK_QUERY_TYPE_OCCLUSION:
      slot_size = sizeof(struct occlusion_query_slot);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TIMESTAMP:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   struct tu_query_pool *pool =
      vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
                VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);

   if (!pool)
      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

   VkResult result = tu_bo_init_new(device, &pool->bo,
                                    pCreateInfo->queryCount * slot_size);
   if (result != VK_SUCCESS) {
      vk_free2(&device->alloc, pAllocator, pool);
      return result;
   }

   result = tu_bo_map(device, &pool->bo);
   if (result != VK_SUCCESS) {
      tu_bo_finish(device, &pool->bo);
      vk_free2(&device->alloc, pAllocator, pool);
      return result;
   }

   /* Initialize all query statuses to unavailable */
   memset(pool->bo.map, 0, pool->bo.size);

   pool->type = pCreateInfo->queryType;
   pool->stride = slot_size;
   pool->size = pCreateInfo->queryCount;
   pool->pipeline_statistics = pCreateInfo->pipelineStatistics;
   *pQueryPool = tu_query_pool_to_handle(pool);

   return VK_SUCCESS;
}

void
tu_DestroyQueryPool(VkDevice _device,
                    VkQueryPool _pool,
                    const VkAllocationCallbacks *pAllocator)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   TU_FROM_HANDLE(tu_query_pool, pool, _pool);

   if (!pool)
      return;

   tu_bo_finish(device, &pool->bo);
   vk_free2(&device->alloc, pAllocator, pool);
}

/* Wait on the availability status of a query up until a timeout. */
static VkResult
wait_for_available(struct tu_device *device, struct tu_query_pool *pool,
                   uint32_t query)
{
   /* TODO: Use the MSM_IOVA_WAIT ioctl to wait on the available bit in a
    * scheduler-friendly way instead of busy polling once the patch has
    * landed upstream. */
   struct occlusion_query_slot *slot = slot_address(pool, query);
   uint64_t abs_timeout = os_time_get_absolute_timeout(
         WAIT_TIMEOUT * NSEC_PER_SEC);
   while (os_time_get_nano() < abs_timeout) {
      if (occlusion_query_is_available(slot))
         return VK_SUCCESS;
   }
   return vk_error(device->instance, VK_TIMEOUT);
}

/* Writes a query value to a buffer from the CPU. */
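/* The offset is counted in result elements, so it is scaled by either 4 or 8
 * bytes depending on whether VK_QUERY_RESULT_64_BIT is requested. */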
static void
write_query_value_cpu(char* base,
                      uint32_t offset,
                      uint64_t value,
                      VkQueryResultFlags flags)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      *(uint64_t*)(base + (offset * sizeof(uint64_t))) = value;
   } else {
      *(uint32_t*)(base + (offset * sizeof(uint32_t))) = value;
   }
}

static VkResult
get_occlusion_query_pool_results(struct tu_device *device,
                                 struct tu_query_pool *pool,
                                 uint32_t firstQuery,
                                 uint32_t queryCount,
                                 size_t dataSize,
                                 void *pData,
                                 VkDeviceSize stride,
                                 VkQueryResultFlags flags)
{
   assert(dataSize >= stride * queryCount);

   char *result_base = pData;
   VkResult result = VK_SUCCESS;
   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;
      struct occlusion_query_slot *slot = slot_address(pool, query);
      bool available = occlusion_query_is_available(slot);
      if ((flags & VK_QUERY_RESULT_WAIT_BIT) && !available) {
         VkResult wait_result = wait_for_available(device, pool, query);
         if (wait_result != VK_SUCCESS)
            return wait_result;
         available = true;
      } else if (!(flags & VK_QUERY_RESULT_PARTIAL_BIT) && !available) {
         /* From the Vulkan 1.1.130 spec:
          *
          *    If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
          *    both not set then no result values are written to pData for
          *    queries that are in the unavailable state at the time of the
          *    call, and vkGetQueryPoolResults returns VK_NOT_READY. However,
          *    availability state is still written to pData for those queries
          *    if VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set.
          */
         result = VK_NOT_READY;
         if (!(flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)) {
            result_base += stride;
            continue;
         }
      }

      if (available)
         write_query_value_cpu(result_base, 0, slot->result.value, flags);
      else if (flags & VK_QUERY_RESULT_PARTIAL_BIT)
         /* From the Vulkan 1.1.130 spec:
          *
          *    If VK_QUERY_RESULT_PARTIAL_BIT is set, VK_QUERY_RESULT_WAIT_BIT
          *    is not set, and the query’s status is unavailable, an
          *    intermediate result value between zero and the final result
          *    value is written to pData for that query.
          *
          * Just return 0 here for simplicity since it's a valid result.
          */
         write_query_value_cpu(result_base, 0, 0, flags);

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
         /* From the Vulkan 1.1.130 spec:
          *
          *    If VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set, the final
          *    integer value written for each query is non-zero if the query’s
          *    status was available or zero if the status was unavailable.
          */
         write_query_value_cpu(result_base, 1, available, flags);

      result_base += stride;
   }
   return result;
}

VkResult
tu_GetQueryPoolResults(VkDevice _device,
                       VkQueryPool queryPool,
                       uint32_t firstQuery,
                       uint32_t queryCount,
                       size_t dataSize,
                       void *pData,
                       VkDeviceSize stride,
                       VkQueryResultFlags flags)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(firstQuery + queryCount <= pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION: {
      return get_occlusion_query_pool_results(device, pool, firstQuery,
               queryCount, dataSize, pData, stride, flags);
   }
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TIMESTAMP:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }
   return VK_SUCCESS;
}

/* Copies a query value from one buffer to another from the GPU. */
static void
copy_query_value_gpu(struct tu_cmd_buffer *cmdbuf,
                     struct tu_cs *cs,
                     uint64_t src_iova,
                     uint64_t base_write_iova,
                     uint32_t offset,
                     VkQueryResultFlags flags) {
   uint32_t element_size = flags & VK_QUERY_RESULT_64_BIT ?
         sizeof(uint64_t) : sizeof(uint32_t);
   uint64_t write_iova = base_write_iova + (offset * element_size);

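   /* CP_MEM_TO_MEM takes a flags dword followed by the destination and source
    * addresses (two dwords each), i.e. 5 payload dwords plus the packet
    * header; CP_MEM_TO_MEM_0_DOUBLE selects a 64-bit copy. */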
   tu_cs_reserve_space(cmdbuf->device, cs, 6);
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
   uint32_t mem_to_mem_flags = flags & VK_QUERY_RESULT_64_BIT ?
         CP_MEM_TO_MEM_0_DOUBLE : 0;
   tu_cs_emit(cs, mem_to_mem_flags);
   tu_cs_emit_qw(cs, write_iova);
   tu_cs_emit_qw(cs, src_iova);
}

static void
emit_copy_occlusion_query_pool_results(struct tu_cmd_buffer *cmdbuf,
                                       struct tu_cs *cs,
                                       struct tu_query_pool *pool,
                                       uint32_t firstQuery,
                                       uint32_t queryCount,
                                       struct tu_buffer *buffer,
                                       VkDeviceSize dstOffset,
                                       VkDeviceSize stride,
                                       VkQueryResultFlags flags)
{
   /* From the Vulkan 1.1.130 spec:
    *
    *    vkCmdCopyQueryPoolResults is guaranteed to see the effect of previous
    *    uses of vkCmdResetQueryPool in the same queue, without any additional
    *    synchronization.
    *
    * To ensure that previous writes to the available bit are coherent, first
    * wait for all writes to complete.
    */
   tu_cs_reserve_space(cmdbuf->device, cs, 1);
   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;
      uint64_t available_iova = occlusion_query_iova(pool, query, available);
      uint64_t result_iova = occlusion_query_iova(pool, query, result);
      uint64_t buffer_iova = tu_buffer_iova(buffer) + dstOffset + i * stride;
      /* Wait for the available bit to be set if executed with the
       * VK_QUERY_RESULT_WAIT_BIT flag. */
      if (flags & VK_QUERY_RESULT_WAIT_BIT) {
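         /* CP_WAIT_REG_MEM with FUNCTION=WRITE_EQ stalls the CP, polling the
          * memory at available_iova until it equals the REF value (1). */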
         tu_cs_reserve_space(cmdbuf->device, cs, 7);
         tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
         tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
                        CP_WAIT_REG_MEM_0_POLL_MEMORY);
         tu_cs_emit_qw(cs, available_iova);
         tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0x1));
         tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
         tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));
      }

      if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
         /* Unconditionally copying the bo->result into the buffer here is
          * valid because we only set bo->result on vkCmdEndQuery. Thus, even
          * if the query is unavailable, this will copy the correct partial
          * value of 0.
          */
         copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
                              0 /* offset */, flags);
      } else {
         /* Conditionally copy bo->result into the buffer based on whether the
          * query is available.
          *
          * NOTE: For the conditional packets to be executed, CP_COND_EXEC
          * tests that ADDR0 != 0 and ADDR1 < REF. The packet here simply tests
          * that 0 < available < 2, aka available == 1.
          */
         tu_cs_reserve_space(cmdbuf->device, cs, 7);
         tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6);
         tu_cs_emit_qw(cs, available_iova);
         tu_cs_emit_qw(cs, available_iova);
         tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2));
         tu_cs_emit(cs, 6); /* Cond execute the next 6 DWORDS */
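         /* The 6 conditionally executed dwords are exactly the CP_MEM_TO_MEM
          * packet (header plus 5 payload dwords) emitted by
          * copy_query_value_gpu() below. */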

         /* Start of conditional execution */
         copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
                              0 /* offset */, flags);
         /* End of conditional execution */
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         copy_query_value_gpu(cmdbuf, cs, available_iova, buffer_iova,
                              1 /* offset */, flags);
      }
   }

   tu_bo_list_add(&cmdbuf->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
}

void
tu_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,
                           VkQueryPool queryPool,
                           uint32_t firstQuery,
                           uint32_t queryCount,
                           VkBuffer dstBuffer,
                           VkDeviceSize dstOffset,
                           VkDeviceSize stride,
                           VkQueryResultFlags flags)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
   struct tu_cs *cs = &cmdbuf->cs;
   assert(firstQuery + queryCount <= pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION: {
      return emit_copy_occlusion_query_pool_results(cmdbuf, cs, pool,
               firstQuery, queryCount, buffer, dstOffset, stride, flags);
   }
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TIMESTAMP:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }
}

static void
emit_reset_occlusion_query_pool(struct tu_cmd_buffer *cmdbuf,
                                struct tu_query_pool *pool,
                                uint32_t firstQuery,
                                uint32_t queryCount)
{
   struct tu_cs *cs = &cmdbuf->cs;

   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;
      uint64_t available_iova = occlusion_query_iova(pool, query, available);
      uint64_t result_iova = occlusion_query_iova(pool, query, result);
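      /* Zero both the availability flag and the accumulated result on the GPU
       * timeline. Each CP_MEM_WRITE carries the destination address (2
       * dwords) followed by the 64-bit value to write. */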
      tu_cs_reserve_space(cmdbuf->device, cs, 11);
      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
      tu_cs_emit_qw(cs, available_iova);
      tu_cs_emit_qw(cs, 0x0);

      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
      tu_cs_emit_qw(cs, result_iova);
      tu_cs_emit_qw(cs, 0x0);
   }
}

void
tu_CmdResetQueryPool(VkCommandBuffer commandBuffer,
                     VkQueryPool queryPool,
                     uint32_t firstQuery,
                     uint32_t queryCount)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_reset_occlusion_query_pool(cmdbuf, pool, firstQuery, queryCount);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TIMESTAMP:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}

static void
emit_begin_occlusion_query(struct tu_cmd_buffer *cmdbuf,
                           struct tu_query_pool *pool,
                           uint32_t query)
{
   /* From the Vulkan 1.1.130 spec:
    *
    *    A query must begin and end inside the same subpass of a render pass
    *    instance, or must both begin and end outside of a render pass
    *    instance.
    *
    * Unlike on an immediate-mode renderer, Turnip renders all tiles on
    * vkCmdEndRenderPass, not individually on each vkCmdDraw*. As such, if a
    * query begins/ends inside the same subpass of a render pass, we need to
    * record the packets on the secondary draw command stream. cmdbuf->draw_cs
    * is then run on every tile during render, so we just need to accumulate
    * sample counts in slot->result to compute the query result.
    */
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;

   uint64_t begin_iova = occlusion_query_iova(pool, query, begin);

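   /* Point RB_SAMPLE_COUNT_ADDR at slot->begin and trigger a ZPASS_DONE event
    * so the hardware writes the current sample counter there; the end counter
    * is captured the same way in emit_end_occlusion_query(). */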
   tu_cs_reserve_space(cmdbuf->device, cs, 7);
   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_ADDR_LO(begin_iova));

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, ZPASS_DONE);
}

void
tu_CmdBeginQuery(VkCommandBuffer commandBuffer,
                 VkQueryPool queryPool,
                 uint32_t query,
                 VkQueryControlFlags flags)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      /* In freedreno, there is no implementation difference between
       * GL_SAMPLES_PASSED and GL_ANY_SAMPLES_PASSED, so we can similarly
       * ignore the VK_QUERY_CONTROL_PRECISE_BIT flag here.
       */
      emit_begin_occlusion_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TIMESTAMP:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}

static void
emit_end_occlusion_query(struct tu_cmd_buffer *cmdbuf,
                         struct tu_query_pool *pool,
                         uint32_t query)
{
   /* Ending an occlusion query happens in a few steps:
    *    1) Set the slot->end to UINT64_MAX.
    *    2) Set up the SAMPLE_COUNT registers and trigger a CP_EVENT_WRITE to
    *       write the current sample count value into slot->end.
    *    3) Since (2) is asynchronous, wait until slot->end is not equal to
    *       UINT64_MAX before continuing via CP_WAIT_REG_MEM.
    *    4) Accumulate the results of the query (slot->end - slot->begin) into
    *       slot->result.
    *    5) If vkCmdEndQuery is *not* called from within the scope of a render
    *       pass, set the slot's available bit since the query is now done.
    *    6) If vkCmdEndQuery *is* called from within the scope of a render
    *       pass, we cannot mark as available yet since the commands in
    *       draw_cs are not run until vkCmdEndRenderPass.
    */
   const struct tu_render_pass *pass = cmdbuf->state.pass;
   struct tu_cs *cs = pass ? &cmdbuf->draw_cs : &cmdbuf->cs;

   uint64_t available_iova = occlusion_query_iova(pool, query, available);
   uint64_t begin_iova = occlusion_query_iova(pool, query, begin);
   uint64_t end_iova = occlusion_query_iova(pool, query, end);
   uint64_t result_iova = occlusion_query_iova(pool, query, result);
   tu_cs_reserve_space(cmdbuf->device, cs, 31);
   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit_qw(cs, 0xffffffffffffffffull);

   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_ADDR_LO(end_iova));

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, ZPASS_DONE);

   tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
   tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_NE) |
                  CP_WAIT_REG_MEM_0_POLL_MEMORY);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0xffffffff));
   tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
   tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));

   /* result (dst) = result (srcA) + end (srcB) - begin (srcC) */
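   /* CP_MEM_TO_MEM_0_NEG_C negates srcC (begin) and CP_MEM_TO_MEM_0_DOUBLE
    * makes the operation use 64-bit operands. */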
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
   tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C);
   tu_cs_emit_qw(cs, result_iova);
   tu_cs_emit_qw(cs, result_iova);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit_qw(cs, begin_iova);

   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   if (pass)
      /* Technically, queries should be tracked per-subpass, but here we track
       * at the render pass level to simplify the code a bit. This is safe
       * because the only commands that use the available bit are
       * vkCmdCopyQueryPoolResults and vkCmdResetQueryPool, both of which
       * cannot be invoked from inside a render pass scope.
       */
      cs = &cmdbuf->draw_epilogue_cs;

   tu_cs_reserve_space(cmdbuf->device, cs, 5);
   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, available_iova);
   tu_cs_emit_qw(cs, 0x1);
}

void
tu_CmdEndQuery(VkCommandBuffer commandBuffer,
               VkQueryPool queryPool,
               uint32_t query)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_end_occlusion_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TIMESTAMP:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}

void
tu_CmdWriteTimestamp(VkCommandBuffer commandBuffer,
                     VkPipelineStageFlagBits pipelineStage,
                     VkQueryPool queryPool,
                     uint32_t query)
{
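   /* TODO: timestamp queries are not implemented yet; this is an empty stub
    * (VK_QUERY_TYPE_TIMESTAMP hits unreachable() in the functions above). */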
}