[mesa.git] src/freedreno/vulkan/tu_query.c (commit 6a106a40614e55b5cc1ed8e1cc044def14a4be8d)
/*
 * Copyright 2016 Red Hat Inc.
 * Based on anv:
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include "tu_private.h"

#include <assert.h>
#include <fcntl.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>

#include "registers/adreno_pm4.xml.h"
#include "registers/adreno_common.xml.h"
#include "registers/a6xx.xml.h"

#include "nir/nir_builder.h"
#include "util/os_time.h"

#include "tu_cs.h"

#define NSEC_PER_SEC 1000000000ull
#define WAIT_TIMEOUT 5

/* Depending on the query type, a slot may hold one or two integer values,
 * e.g. for VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
 * values[0]: primitives written, values[1]: primitives generated
 */
struct PACKED slot_value {
   uint64_t values[2];
};

struct PACKED query_slot {
   struct slot_value available; /* 0 when unavailable, 1 when available */
   struct slot_value result;
};

struct PACKED occlusion_query_slot {
   struct query_slot common;
   struct slot_value begin;
   struct slot_value end;
};

/* The result of transform feedback queries is two integer values:
 *   common.result.values[0] is the count of primitives written,
 *   common.result.values[1] is the count of primitives generated.
 * The begin/end counter snapshots are recorded separately for each of the
 * four streams, hence the arrays of four slot_values below.
 */
struct PACKED primitive_query_slot {
   struct query_slot common;
   struct slot_value begin[4];
   struct slot_value end[4];
};

/* Returns the IOVA of a given uint64_t field in a given slot of a query
 * pool. */
#define query_iova(type, pool, query, field, value_index)                \
   pool->bo.iova + pool->stride * query + offsetof(type, field) +        \
      offsetof(struct slot_value, values[value_index])
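/* For example, query_iova(struct query_slot, pool, q, result, 1) evaluates
 * to pool->bo.iova + q * pool->stride + offsetof(struct query_slot, result)
 * + sizeof(uint64_t), i.e. the GPU address of the second result value of
 * slot q.
 */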

#define occlusion_query_iova(pool, query, field)                         \
   query_iova(struct occlusion_query_slot, pool, query, field, 0)

#define primitive_query_iova(pool, query, field, i)                      \
   query_iova(struct primitive_query_slot, pool, query, field, i)

#define query_available_iova(pool, query)                                \
   query_iova(struct query_slot, pool, query, available, 0)

#define query_result_iova(pool, query, i)                                \
   query_iova(struct query_slot, pool, query, result, i)

#define query_is_available(slot) slot->available.values[0]

/*
 * Returns a pointer to a given slot in a query pool.
 */
static void* slot_address(struct tu_query_pool *pool, uint32_t query)
{
   return (char*)pool->bo.map + query * pool->stride;
}

VkResult
tu_CreateQueryPool(VkDevice _device,
                   const VkQueryPoolCreateInfo *pCreateInfo,
                   const VkAllocationCallbacks *pAllocator,
                   VkQueryPool *pQueryPool)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
   assert(pCreateInfo->queryCount > 0);

   uint32_t slot_size;
   switch (pCreateInfo->queryType) {
   case VK_QUERY_TYPE_OCCLUSION:
      slot_size = sizeof(struct occlusion_query_slot);
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
      slot_size = sizeof(struct query_slot);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      slot_size = sizeof(struct primitive_query_slot);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   struct tu_query_pool *pool =
         vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
                   VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);

   if (!pool)
      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

   VkResult result = tu_bo_init_new(device, &pool->bo,
                                    pCreateInfo->queryCount * slot_size);
   if (result != VK_SUCCESS) {
      vk_free2(&device->alloc, pAllocator, pool);
      return result;
   }

   result = tu_bo_map(device, &pool->bo);
   if (result != VK_SUCCESS) {
      tu_bo_finish(device, &pool->bo);
      vk_free2(&device->alloc, pAllocator, pool);
      return result;
   }

   /* Initialize all query statuses to unavailable */
   memset(pool->bo.map, 0, pool->bo.size);

   pool->type = pCreateInfo->queryType;
   pool->stride = slot_size;
   pool->size = pCreateInfo->queryCount;
   pool->pipeline_statistics = pCreateInfo->pipelineStatistics;
   *pQueryPool = tu_query_pool_to_handle(pool);

   return VK_SUCCESS;
}

void
tu_DestroyQueryPool(VkDevice _device,
                    VkQueryPool _pool,
                    const VkAllocationCallbacks *pAllocator)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   TU_FROM_HANDLE(tu_query_pool, pool, _pool);

   if (!pool)
      return;

   tu_bo_finish(device, &pool->bo);
   vk_free2(&device->alloc, pAllocator, pool);
}

static uint32_t
get_result_count(struct tu_query_pool *pool)
{
   switch (pool->type) {
   /* Occlusion and timestamp queries write one integer value */
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
      return 1;
   /* Transform feedback queries write two integer values */
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      return 2;
   default:
      assert(!"Invalid query type");
      return 0;
   }
}

/* Wait on the availability status of a query up until a timeout. */
static VkResult
wait_for_available(struct tu_device *device, struct tu_query_pool *pool,
                   uint32_t query)
{
   /* TODO: Use the MSM_IOVA_WAIT ioctl to wait on the available bit in a
    * scheduler friendly way instead of busy polling once the patch has landed
    * upstream. */
   struct query_slot *slot = slot_address(pool, query);
   uint64_t abs_timeout = os_time_get_absolute_timeout(
         WAIT_TIMEOUT * NSEC_PER_SEC);
   while (os_time_get_nano() < abs_timeout) {
      if (query_is_available(slot))
         return VK_SUCCESS;
   }
   return vk_error(device->instance, VK_TIMEOUT);
}

/* Writes a query value to a buffer from the CPU. */
static void
write_query_value_cpu(char* base,
                      uint32_t offset,
                      uint64_t value,
                      VkQueryResultFlags flags)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      *(uint64_t*)(base + (offset * sizeof(uint64_t))) = value;
   } else {
      *(uint32_t*)(base + (offset * sizeof(uint32_t))) = value;
   }
}

static VkResult
get_query_pool_results(struct tu_device *device,
                       struct tu_query_pool *pool,
                       uint32_t firstQuery,
                       uint32_t queryCount,
                       size_t dataSize,
                       void *pData,
                       VkDeviceSize stride,
                       VkQueryResultFlags flags)
{
   assert(dataSize >= stride * queryCount);

   char *result_base = pData;
   VkResult result = VK_SUCCESS;
   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;
      struct query_slot *slot = slot_address(pool, query);
      bool available = query_is_available(slot);
      uint32_t result_count = get_result_count(pool);

      if ((flags & VK_QUERY_RESULT_WAIT_BIT) && !available) {
         VkResult wait_result = wait_for_available(device, pool, query);
         if (wait_result != VK_SUCCESS)
            return wait_result;
         available = true;
      } else if (!(flags & VK_QUERY_RESULT_PARTIAL_BIT) && !available) {
         /* From the Vulkan 1.1.130 spec:
          *
          *    If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
          *    both not set then no result values are written to pData for
          *    queries that are in the unavailable state at the time of the
          *    call, and vkGetQueryPoolResults returns VK_NOT_READY. However,
          *    availability state is still written to pData for those queries
          *    if VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set.
          */
         result = VK_NOT_READY;
         if (!(flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)) {
            result_base += stride;
            continue;
         }
      }

      for (uint32_t k = 0; k < result_count; k++) {
         if (available)
            write_query_value_cpu(result_base, k, slot->result.values[k], flags);
         else if (flags & VK_QUERY_RESULT_PARTIAL_BIT)
            /* From the Vulkan 1.1.130 spec:
             *
             *    If VK_QUERY_RESULT_PARTIAL_BIT is set, VK_QUERY_RESULT_WAIT_BIT
             *    is not set, and the query’s status is unavailable, an
             *    intermediate result value between zero and the final result
             *    value is written to pData for that query.
             *
             * Just return 0 here for simplicity since it's a valid result.
             */
            write_query_value_cpu(result_base, k, 0, flags);
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
         /* From the Vulkan 1.1.130 spec:
          *
          *    If VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set, the final
          *    integer value written for each query is non-zero if the query’s
          *    status was available or zero if the status was unavailable.
          */
         write_query_value_cpu(result_base, result_count, available, flags);

      result_base += stride;
   }
   return result;
}

VkResult
tu_GetQueryPoolResults(VkDevice _device,
                       VkQueryPool queryPool,
                       uint32_t firstQuery,
                       uint32_t queryCount,
                       size_t dataSize,
                       void *pData,
                       VkDeviceSize stride,
                       VkQueryResultFlags flags)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(firstQuery + queryCount <= pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      return get_query_pool_results(device, pool, firstQuery, queryCount,
                                    dataSize, pData, stride, flags);
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }
   return VK_SUCCESS;
}

/* Copies a query value from one buffer to another from the GPU. */
static void
copy_query_value_gpu(struct tu_cmd_buffer *cmdbuf,
                     struct tu_cs *cs,
                     uint64_t src_iova,
                     uint64_t base_write_iova,
                     uint32_t offset,
                     VkQueryResultFlags flags)
{
   uint32_t element_size = flags & VK_QUERY_RESULT_64_BIT ?
         sizeof(uint64_t) : sizeof(uint32_t);
   uint64_t write_iova = base_write_iova + (offset * element_size);

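   /* CP_MEM_TO_MEM copies one 32-bit value from src_iova to write_iova; with
    * the DOUBLE bit set it operates on 64-bit values instead, which is what
    * VK_QUERY_RESULT_64_BIT requires.
    */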
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
   uint32_t mem_to_mem_flags = flags & VK_QUERY_RESULT_64_BIT ?
         CP_MEM_TO_MEM_0_DOUBLE : 0;
   tu_cs_emit(cs, mem_to_mem_flags);
   tu_cs_emit_qw(cs, write_iova);
   tu_cs_emit_qw(cs, src_iova);
}

static void
emit_copy_query_pool_results(struct tu_cmd_buffer *cmdbuf,
                             struct tu_cs *cs,
                             struct tu_query_pool *pool,
                             uint32_t firstQuery,
                             uint32_t queryCount,
                             struct tu_buffer *buffer,
                             VkDeviceSize dstOffset,
                             VkDeviceSize stride,
                             VkQueryResultFlags flags)
{
   /* From the Vulkan 1.1.130 spec:
    *
    *    vkCmdCopyQueryPoolResults is guaranteed to see the effect of previous
    *    uses of vkCmdResetQueryPool in the same queue, without any additional
    *    synchronization.
    *
    * To ensure that previous writes to the available bit are coherent, first
    * wait for all writes to complete.
    */
   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;
      uint64_t available_iova = query_available_iova(pool, query);
      uint64_t buffer_iova = tu_buffer_iova(buffer) + dstOffset + i * stride;
      uint32_t result_count = get_result_count(pool);

      /* Wait for the available bit to be set if executed with the
       * VK_QUERY_RESULT_WAIT_BIT flag. */
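      /* CP_WAIT_REG_MEM below stalls the CP until the value at
       * available_iova, ANDed with the mask, equals the reference value 1.
       */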
      if (flags & VK_QUERY_RESULT_WAIT_BIT) {
         tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
         tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
                        CP_WAIT_REG_MEM_0_POLL_MEMORY);
         tu_cs_emit_qw(cs, available_iova);
         tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0x1));
         tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
         tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));
      }

      for (uint32_t k = 0; k < result_count; k++) {
         uint64_t result_iova = query_result_iova(pool, query, k);

         if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
            /* Unconditionally copying the bo->result into the buffer here is
             * valid because we only set bo->result on vkCmdEndQuery. Thus, even
             * if the query is unavailable, this will copy the correct partial
             * value of 0.
             */
            copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
                                 k /* offset */, flags);
         } else {
            /* Conditionally copy bo->result into the buffer based on whether the
             * query is available.
             *
             * NOTE: For the conditional packets to be executed, CP_COND_EXEC
             * tests that ADDR0 != 0 and ADDR1 < REF. The packet here simply tests
             * that 0 < available < 2, aka available == 1.
             */
            tu_cs_reserve(cs, 7 + 6);
            tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6);
            tu_cs_emit_qw(cs, available_iova);
            tu_cs_emit_qw(cs, available_iova);
            tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2));
            tu_cs_emit(cs, 6); /* Cond execute the next 6 DWORDS */

            /* Start of conditional execution */
            copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
                                 k /* offset */, flags);
            /* End of conditional execution */
         }
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         copy_query_value_gpu(cmdbuf, cs, available_iova, buffer_iova,
                              result_count /* offset */, flags);
      }
   }

   tu_bo_list_add(&cmdbuf->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
}

void
tu_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,
                           VkQueryPool queryPool,
                           uint32_t firstQuery,
                           uint32_t queryCount,
                           VkBuffer dstBuffer,
                           VkDeviceSize dstOffset,
                           VkDeviceSize stride,
                           VkQueryResultFlags flags)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
   struct tu_cs *cs = &cmdbuf->cs;
   assert(firstQuery + queryCount <= pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      return emit_copy_query_pool_results(cmdbuf, cs, pool, firstQuery,
                                          queryCount, buffer, dstOffset, stride, flags);
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }
}

static void
emit_reset_query_pool(struct tu_cmd_buffer *cmdbuf,
                      struct tu_query_pool *pool,
                      uint32_t firstQuery,
                      uint32_t queryCount)
{
   struct tu_cs *cs = &cmdbuf->cs;

   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;

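      /* Clear the available bit, then zero both result values so that an
       * unavailable query reads back as 0.
       */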
      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
      tu_cs_emit_qw(cs, query_available_iova(pool, query));
      tu_cs_emit_qw(cs, 0x0);

      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
      tu_cs_emit_qw(cs, query_result_iova(pool, query, 0));
      tu_cs_emit_qw(cs, 0x0);
      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
      tu_cs_emit_qw(cs, query_result_iova(pool, query, 1));
      tu_cs_emit_qw(cs, 0x0);
   }
}

void
tu_CmdResetQueryPool(VkCommandBuffer commandBuffer,
                     VkQueryPool queryPool,
                     uint32_t firstQuery,
                     uint32_t queryCount)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);

   switch (pool->type) {
   case VK_QUERY_TYPE_TIMESTAMP:
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      emit_reset_query_pool(cmdbuf, pool, firstQuery, queryCount);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}

static void
emit_begin_occlusion_query(struct tu_cmd_buffer *cmdbuf,
                           struct tu_query_pool *pool,
                           uint32_t query)
{
   /* From the Vulkan 1.1.130 spec:
    *
    *    A query must begin and end inside the same subpass of a render pass
    *    instance, or must both begin and end outside of a render pass
    *    instance.
    *
    * Unlike on an immediate-mode renderer, Turnip renders all tiles on
    * vkCmdEndRenderPass, not individually on each vkCmdDraw*. As such, if a
    * query begins/ends inside the same subpass of a render pass, we need to
    * record the packets on the secondary draw command stream. cmdbuf->draw_cs
    * is then run on every tile during render, so we just need to accumulate
    * sample counts in slot->result to compute the query result.
    */
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;

   uint64_t begin_iova = occlusion_query_iova(pool, query, begin);

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_ADDR_LO(begin_iova));

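   /* ZPASS_DONE makes the RB dump the current sample counter to the address
    * programmed in RB_SAMPLE_COUNT_ADDR above, i.e. slot->begin.
    */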
   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, ZPASS_DONE);
}

static void
emit_begin_xfb_query(struct tu_cmd_buffer *cmdbuf,
                     struct tu_query_pool *pool,
                     uint32_t query,
                     uint32_t stream_id)
{
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
   uint64_t begin_iova = primitive_query_iova(pool, query, begin[0], 0);

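   /* WRITE_PRIMITIVE_COUNTS dumps the written/generated primitive counters
    * for all four streams starting at the address in VPC_SO_STREAM_COUNTS,
    * which is why begin[0] is used as the base regardless of stream_id.
    */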
   tu_cs_emit_regs(cs, A6XX_VPC_SO_STREAM_COUNTS_LO(begin_iova));
   tu6_emit_event_write(cmdbuf, cs, WRITE_PRIMITIVE_COUNTS);
}

void
tu_CmdBeginQuery(VkCommandBuffer commandBuffer,
                 VkQueryPool queryPool,
                 uint32_t query,
                 VkQueryControlFlags flags)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      /* In freedreno, there is no implementation difference between
       * GL_SAMPLES_PASSED and GL_ANY_SAMPLES_PASSED, so we can similarly
       * ignore the VK_QUERY_CONTROL_PRECISE_BIT flag here.
       */
      emit_begin_occlusion_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      emit_begin_xfb_query(cmdbuf, pool, query, 0);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TIMESTAMP:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}

void
tu_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer,
                           VkQueryPool queryPool,
                           uint32_t query,
                           VkQueryControlFlags flags,
                           uint32_t index)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      emit_begin_xfb_query(cmdbuf, pool, query, index);
      break;
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}

static void
emit_end_occlusion_query(struct tu_cmd_buffer *cmdbuf,
                         struct tu_query_pool *pool,
                         uint32_t query)
{
   /* Ending an occlusion query happens in a few steps:
    * 1) Set the slot->end to UINT64_MAX.
    * 2) Set up the SAMPLE_COUNT registers and trigger a CP_EVENT_WRITE to
    *    write the current sample count value into slot->end.
    * 3) Since (2) is asynchronous, wait until slot->end is not equal to
    *    UINT64_MAX before continuing via CP_WAIT_REG_MEM.
    * 4) Accumulate the results of the query (slot->end - slot->begin) into
    *    slot->result.
    * 5) If vkCmdEndQuery is *not* called from within the scope of a render
    *    pass, set the slot's available bit since the query is now done.
    * 6) If vkCmdEndQuery *is* called from within the scope of a render
    *    pass, we cannot mark as available yet since the commands in
    *    draw_cs are not run until vkCmdEndRenderPass.
    */
   const struct tu_render_pass *pass = cmdbuf->state.pass;
   struct tu_cs *cs = pass ? &cmdbuf->draw_cs : &cmdbuf->cs;

   uint64_t available_iova = query_available_iova(pool, query);
   uint64_t begin_iova = occlusion_query_iova(pool, query, begin);
   uint64_t end_iova = occlusion_query_iova(pool, query, end);
   uint64_t result_iova = query_result_iova(pool, query, 0);
   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit_qw(cs, 0xffffffffffffffffull);

   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_ADDR_LO(end_iova));

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, ZPASS_DONE);

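   /* Step (3) above: stall until the RB has replaced the UINT64_MAX sentinel
    * in slot->end with the real sample count.
    */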
   tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
   tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_NE) |
                  CP_WAIT_REG_MEM_0_POLL_MEMORY);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0xffffffff));
   tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
   tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));

   /* result (dst) = result (srcA) + end (srcB) - begin (srcC) */
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
   tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C);
   tu_cs_emit_qw(cs, result_iova);
   tu_cs_emit_qw(cs, result_iova);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit_qw(cs, begin_iova);

   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   if (pass)
      /* Technically, queries should be tracked per-subpass, but here we track
       * at the render pass level to simplify the code a bit. This is safe
       * because the only commands that use the available bit are
       * vkCmdCopyQueryPoolResults and vkCmdResetQueryPool, both of which
       * cannot be invoked from inside a render pass scope.
       */
      cs = &cmdbuf->draw_epilogue_cs;

   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, available_iova);
   tu_cs_emit_qw(cs, 0x1);
}

static void
emit_end_xfb_query(struct tu_cmd_buffer *cmdbuf,
                   struct tu_query_pool *pool,
                   uint32_t query,
                   uint32_t stream_id)
{
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;

   uint64_t end_iova = primitive_query_iova(pool, query, end[0], 0);
   uint64_t result_written_iova = query_result_iova(pool, query, 0);
   uint64_t result_generated_iova = query_result_iova(pool, query, 1);
   uint64_t begin_written_iova = primitive_query_iova(pool, query, begin[stream_id], 0);
   uint64_t begin_generated_iova = primitive_query_iova(pool, query, begin[stream_id], 1);
   uint64_t end_written_iova = primitive_query_iova(pool, query, end[stream_id], 0);
   uint64_t end_generated_iova = primitive_query_iova(pool, query, end[stream_id], 1);
   uint64_t available_iova = query_available_iova(pool, query);

   tu_cs_emit_regs(cs, A6XX_VPC_SO_STREAM_COUNTS_LO(end_iova));
   tu6_emit_event_write(cmdbuf, cs, WRITE_PRIMITIVE_COUNTS);

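   /* Wait for the counter writes above to land and flush them out to memory
    * before the CP reads them back for the subtractions below.
    */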
   tu_cs_emit_wfi(cs);
   tu6_emit_event_write(cmdbuf, cs, CACHE_FLUSH_TS);

   /* Set the count of written primitives */
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
   tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C |
                  CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES | 0x80000000);
   tu_cs_emit_qw(cs, result_written_iova);
   tu_cs_emit_qw(cs, result_written_iova);
   tu_cs_emit_qw(cs, end_written_iova);
   tu_cs_emit_qw(cs, begin_written_iova);

   tu6_emit_event_write(cmdbuf, cs, CACHE_FLUSH_TS);

   /* Set the count of generated primitives */
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
   tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C |
                  CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES | 0x80000000);
   tu_cs_emit_qw(cs, result_generated_iova);
   tu_cs_emit_qw(cs, result_generated_iova);
   tu_cs_emit_qw(cs, end_generated_iova);
   tu_cs_emit_qw(cs, begin_generated_iova);

   /* Set the availability to 1 */
   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, available_iova);
   tu_cs_emit_qw(cs, 0x1);
}

void
tu_CmdEndQuery(VkCommandBuffer commandBuffer,
               VkQueryPool queryPool,
               uint32_t query)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_end_occlusion_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      emit_end_xfb_query(cmdbuf, pool, query, 0);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TIMESTAMP:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}

void
tu_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer,
                         VkQueryPool queryPool,
                         uint32_t query,
                         uint32_t index)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      assert(index < 4);
      emit_end_xfb_query(cmdbuf, pool, query, index);
      break;
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}

void
tu_CmdWriteTimestamp(VkCommandBuffer commandBuffer,
                     VkPipelineStageFlagBits pipelineStage,
                     VkQueryPool queryPool,
                     uint32_t query)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   struct tu_cs *cs = cmd->state.pass ? &cmd->draw_epilogue_cs : &cmd->cs;

   /* WFI to get more accurate timestamp */
   tu_cs_emit_wfi(cs);

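   /* Copy the two 32-bit halves of the always-on counter (CNT(2) registers
    * starting at CP_ALWAYS_ON_COUNTER_LO) into the result slot as a single
    * 64-bit value.
    */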
   tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
   tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_CP_ALWAYS_ON_COUNTER_LO) |
                  CP_REG_TO_MEM_0_CNT(2) |
                  CP_REG_TO_MEM_0_64B);
   tu_cs_emit_qw(cs, query_result_iova(pool, query, 0));

   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, query_available_iova(pool, query));
   tu_cs_emit_qw(cs, 0x1);

   if (cmd->state.pass) {
      /* TODO: to have useful in-renderpass timestamps:
       * for the sysmem path, we can just emit the timestamp in draw_cs,
       * for the gmem path, we would have to accumulate somehow, but I'm not
       * sure that would follow the spec.
       */
      tu_finishme("CmdWriteTimestamp in renderpass not accurate");
   }
}