/* mesa: src/freedreno/vulkan/tu_query.c @ 8d91fe59c44badb5a63ae3531f50507332331e86 */
/*
 * Copyright 2016 Red Hat Inc.
 * Based on anv:
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include "tu_private.h"

#include <assert.h>
#include <fcntl.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>

#include "adreno_pm4.xml.h"
#include "adreno_common.xml.h"
#include "a6xx.xml.h"

#include "nir/nir_builder.h"
#include "util/os_time.h"

#include "tu_cs.h"

#define NSEC_PER_SEC 1000000000ull
#define WAIT_TIMEOUT 5

struct PACKED query_slot {
   uint64_t available;
};

struct PACKED occlusion_slot_value {
   /* The hardware seems to write sample counters at 16-byte-aligned
    * addresses even though this query only needs an 8-byte slot. */
   uint64_t value;
   uint64_t _padding;
};

struct PACKED occlusion_query_slot {
   struct query_slot common;
   uint64_t result;

   struct occlusion_slot_value begin;
   struct occlusion_slot_value end;
};

struct PACKED timestamp_query_slot {
   struct query_slot common;
   uint64_t result;
};

struct PACKED primitive_slot_value {
   uint64_t values[2];
};

struct PACKED primitive_query_slot {
   struct query_slot common;
   /* The result of a transform feedback query is two integer values:
    *    results[0] is the count of primitives written,
    *    results[1] is the count of primitives generated.
    * In addition, the begin/end counter snapshots below are stored per
    * stream, in four slots each.
    */
   uint64_t results[2];

   /* Primitive counters also need to be 16-byte aligned. */
   uint64_t _padding;

   struct primitive_slot_value begin[4];
   struct primitive_slot_value end[4];
};

/* Returns the IOVA of a given uint64_t field in a given slot of a query
 * pool. */
#define query_iova(type, pool, query, field) \
   pool->bo.iova + pool->stride * (query) + offsetof(type, field)

#define occlusion_query_iova(pool, query, field) \
   query_iova(struct occlusion_query_slot, pool, query, field)

#define primitive_query_iova(pool, query, field, i) \
   query_iova(struct primitive_query_slot, pool, query, field) + \
   offsetof(struct primitive_slot_value, values[i])

#define query_available_iova(pool, query) \
   query_iova(struct query_slot, pool, query, available)

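/* query_result_iova()/query_result_addr() below rely on the layout of the
 * slot structs above: every slot type stores its result value(s) as
 * uint64_t's immediately after the common query_slot header, so result i of
 * any slot sits at offset sizeof(struct query_slot) + sizeof(uint64_t) * i.
 *
 * For example, an occlusion slot is laid out as:
 *
 *    offset  0: common.available
 *    offset  8: result
 *    offset 16: begin (value + padding)
 *    offset 32: end   (value + padding)
 */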
#define query_result_iova(pool, query, i) \
   pool->bo.iova + pool->stride * (query) + \
   sizeof(struct query_slot) + sizeof(uint64_t) * i

#define query_result_addr(pool, query, i) \
   pool->bo.map + pool->stride * query + \
   sizeof(struct query_slot) + sizeof(uint64_t) * i

#define query_is_available(slot) slot->available

/*
 * Returns a pointer to a given slot in a query pool.
 */
static void* slot_address(struct tu_query_pool *pool, uint32_t query)
{
   return (char*)pool->bo.map + query * pool->stride;
}

VkResult
tu_CreateQueryPool(VkDevice _device,
                   const VkQueryPoolCreateInfo *pCreateInfo,
                   const VkAllocationCallbacks *pAllocator,
                   VkQueryPool *pQueryPool)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
   assert(pCreateInfo->queryCount > 0);

   uint32_t slot_size;
   switch (pCreateInfo->queryType) {
   case VK_QUERY_TYPE_OCCLUSION:
      slot_size = sizeof(struct occlusion_query_slot);
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
      slot_size = sizeof(struct timestamp_query_slot);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      slot_size = sizeof(struct primitive_query_slot);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   struct tu_query_pool *pool =
         vk_object_alloc(&device->vk, pAllocator, sizeof(*pool),
                         VK_OBJECT_TYPE_QUERY_POOL);
   if (!pool)
      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

   VkResult result = tu_bo_init_new(device, &pool->bo,
                                    pCreateInfo->queryCount * slot_size);
   if (result != VK_SUCCESS) {
      vk_object_free(&device->vk, pAllocator, pool);
      return result;
   }

   result = tu_bo_map(device, &pool->bo);
   if (result != VK_SUCCESS) {
      tu_bo_finish(device, &pool->bo);
      vk_object_free(&device->vk, pAllocator, pool);
      return result;
   }

   /* Initialize all query statuses to unavailable */
   memset(pool->bo.map, 0, pool->bo.size);

   pool->type = pCreateInfo->queryType;
   pool->stride = slot_size;
   pool->size = pCreateInfo->queryCount;
   pool->pipeline_statistics = pCreateInfo->pipelineStatistics;
   *pQueryPool = tu_query_pool_to_handle(pool);

   return VK_SUCCESS;
}

void
tu_DestroyQueryPool(VkDevice _device,
                    VkQueryPool _pool,
                    const VkAllocationCallbacks *pAllocator)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   TU_FROM_HANDLE(tu_query_pool, pool, _pool);

   if (!pool)
      return;

   tu_bo_finish(device, &pool->bo);
   vk_object_free(&device->vk, pAllocator, pool);
}

static uint32_t
get_result_count(struct tu_query_pool *pool)
{
   switch (pool->type) {
   /* Occlusion and timestamp queries write one integer value */
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
      return 1;
   /* Transform feedback queries write two integer values */
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      return 2;
   default:
      assert(!"Invalid query type");
      return 0;
   }
}

/* Wait on the availability status of a query until a timeout. */
static VkResult
wait_for_available(struct tu_device *device, struct tu_query_pool *pool,
                   uint32_t query)
{
   /* TODO: Use the MSM_IOVA_WAIT ioctl to wait on the available bit in a
    * scheduler friendly way instead of busy polling once the patch has landed
    * upstream. */
   struct query_slot *slot = slot_address(pool, query);
   uint64_t abs_timeout = os_time_get_absolute_timeout(
         WAIT_TIMEOUT * NSEC_PER_SEC);
   while (os_time_get_nano() < abs_timeout) {
      if (query_is_available(slot))
         return VK_SUCCESS;
   }
   return vk_error(device->instance, VK_TIMEOUT);
}

/* Writes a query value to a buffer from the CPU. */
static void
write_query_value_cpu(char* base,
                      uint32_t offset,
                      uint64_t value,
                      VkQueryResultFlags flags)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      *(uint64_t*)(base + (offset * sizeof(uint64_t))) = value;
   } else {
      *(uint32_t*)(base + (offset * sizeof(uint32_t))) = value;
   }
}

static VkResult
get_query_pool_results(struct tu_device *device,
                       struct tu_query_pool *pool,
                       uint32_t firstQuery,
                       uint32_t queryCount,
                       size_t dataSize,
                       void *pData,
                       VkDeviceSize stride,
                       VkQueryResultFlags flags)
{
   assert(dataSize >= stride * queryCount);

   char *result_base = pData;
   VkResult result = VK_SUCCESS;
   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;
      struct query_slot *slot = slot_address(pool, query);
      bool available = query_is_available(slot);
      uint32_t result_count = get_result_count(pool);

      if ((flags & VK_QUERY_RESULT_WAIT_BIT) && !available) {
         VkResult wait_result = wait_for_available(device, pool, query);
         if (wait_result != VK_SUCCESS)
            return wait_result;
         available = true;
      } else if (!(flags & VK_QUERY_RESULT_PARTIAL_BIT) && !available) {
         /* From the Vulkan 1.1.130 spec:
          *
          *    If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
          *    both not set then no result values are written to pData for
          *    queries that are in the unavailable state at the time of the
          *    call, and vkGetQueryPoolResults returns VK_NOT_READY. However,
          *    availability state is still written to pData for those queries
          *    if VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set.
          */
         result = VK_NOT_READY;
         if (!(flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)) {
            result_base += stride;
            continue;
         }
      }

      for (uint32_t k = 0; k < result_count; k++) {
         if (available) {
            uint64_t *result = query_result_addr(pool, query, k);
            write_query_value_cpu(result_base, k, *result, flags);
         } else if (flags & VK_QUERY_RESULT_PARTIAL_BIT)
            /* From the Vulkan 1.1.130 spec:
             *
             *    If VK_QUERY_RESULT_PARTIAL_BIT is set, VK_QUERY_RESULT_WAIT_BIT
             *    is not set, and the query’s status is unavailable, an
             *    intermediate result value between zero and the final result
             *    value is written to pData for that query.
             *
             * Just return 0 here for simplicity since it's a valid result.
             */
            write_query_value_cpu(result_base, k, 0, flags);
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
         /* From the Vulkan 1.1.130 spec:
          *
          *    If VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set, the final
          *    integer value written for each query is non-zero if the query’s
          *    status was available or zero if the status was unavailable.
          */
         write_query_value_cpu(result_base, result_count, available, flags);

      result_base += stride;
   }
   return result;
}

VkResult
tu_GetQueryPoolResults(VkDevice _device,
                       VkQueryPool queryPool,
                       uint32_t firstQuery,
                       uint32_t queryCount,
                       size_t dataSize,
                       void *pData,
                       VkDeviceSize stride,
                       VkQueryResultFlags flags)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(firstQuery + queryCount <= pool->size);

   if (tu_device_is_lost(device))
      return VK_ERROR_DEVICE_LOST;

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      return get_query_pool_results(device, pool, firstQuery, queryCount,
                                    dataSize, pData, stride, flags);
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }
   return VK_SUCCESS;
}

/* Copies a query value from one buffer to another from the GPU. */
static void
copy_query_value_gpu(struct tu_cmd_buffer *cmdbuf,
                     struct tu_cs *cs,
                     uint64_t src_iova,
                     uint64_t base_write_iova,
                     uint32_t offset,
                     VkQueryResultFlags flags)
{
   uint32_t element_size = flags & VK_QUERY_RESULT_64_BIT ?
         sizeof(uint64_t) : sizeof(uint32_t);
   uint64_t write_iova = base_write_iova + (offset * element_size);

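   /* CP_MEM_TO_MEM copies the value at src_iova into write_iova; the DOUBLE
    * bit selects a 64-bit copy instead of the default 32-bit one, matching
    * the element size requested by the flags.
    */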
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
   uint32_t mem_to_mem_flags = flags & VK_QUERY_RESULT_64_BIT ?
         CP_MEM_TO_MEM_0_DOUBLE : 0;
   tu_cs_emit(cs, mem_to_mem_flags);
   tu_cs_emit_qw(cs, write_iova);
   tu_cs_emit_qw(cs, src_iova);
}

static void
emit_copy_query_pool_results(struct tu_cmd_buffer *cmdbuf,
                             struct tu_cs *cs,
                             struct tu_query_pool *pool,
                             uint32_t firstQuery,
                             uint32_t queryCount,
                             struct tu_buffer *buffer,
                             VkDeviceSize dstOffset,
                             VkDeviceSize stride,
                             VkQueryResultFlags flags)
{
   /* From the Vulkan 1.1.130 spec:
    *
    *    vkCmdCopyQueryPoolResults is guaranteed to see the effect of previous
    *    uses of vkCmdResetQueryPool in the same queue, without any additional
    *    synchronization.
    *
    * To ensure that previous writes to the available bit are coherent, first
    * wait for all writes to complete.
    */
   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;
      uint64_t available_iova = query_available_iova(pool, query);
      uint64_t buffer_iova = tu_buffer_iova(buffer) + dstOffset + i * stride;
      uint32_t result_count = get_result_count(pool);

      /* Wait for the available bit to be set if executed with the
       * VK_QUERY_RESULT_WAIT_BIT flag. */
      if (flags & VK_QUERY_RESULT_WAIT_BIT) {
         tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
         tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
                        CP_WAIT_REG_MEM_0_POLL_MEMORY);
         tu_cs_emit_qw(cs, available_iova);
         tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0x1));
         tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
         tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));
      }

      for (uint32_t k = 0; k < result_count; k++) {
         uint64_t result_iova = query_result_iova(pool, query, k);

         if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
            /* Unconditionally copying the bo->result into the buffer here is
             * valid because we only set bo->result on vkCmdEndQuery. Thus, even
             * if the query is unavailable, this will copy the correct partial
             * value of 0.
             */
            copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
                                 k /* offset */, flags);
         } else {
            /* Conditionally copy bo->result into the buffer based on whether the
             * query is available.
             *
             * NOTE: For the conditional packets to be executed, CP_COND_EXEC
             * tests that ADDR0 != 0 and ADDR1 < REF. The packet here simply tests
             * that 0 < available < 2, aka available == 1.
             */
            tu_cs_reserve(cs, 7 + 6);
            tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6);
            tu_cs_emit_qw(cs, available_iova);
            tu_cs_emit_qw(cs, available_iova);
            tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2));
            tu_cs_emit(cs, 6); /* Cond execute the next 6 DWORDS */

            /* Start of conditional execution */
            copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
                                 k /* offset */, flags);
            /* End of conditional execution */
         }
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         copy_query_value_gpu(cmdbuf, cs, available_iova, buffer_iova,
                              result_count /* offset */, flags);
      }
   }

   tu_bo_list_add(&cmdbuf->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
}

void
tu_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,
                           VkQueryPool queryPool,
                           uint32_t firstQuery,
                           uint32_t queryCount,
                           VkBuffer dstBuffer,
                           VkDeviceSize dstOffset,
                           VkDeviceSize stride,
                           VkQueryResultFlags flags)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
   struct tu_cs *cs = &cmdbuf->cs;
   assert(firstQuery + queryCount <= pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      return emit_copy_query_pool_results(cmdbuf, cs, pool, firstQuery,
                                          queryCount, buffer, dstOffset,
                                          stride, flags);
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }
}

static void
emit_reset_query_pool(struct tu_cmd_buffer *cmdbuf,
                      struct tu_query_pool *pool,
                      uint32_t firstQuery,
                      uint32_t queryCount)
{
   struct tu_cs *cs = &cmdbuf->cs;

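   /* For each slot in the range, zero the availability bit and all result
    * values from the command stream with CP_MEM_WRITE.
    */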
   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;

      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
      tu_cs_emit_qw(cs, query_available_iova(pool, query));
      tu_cs_emit_qw(cs, 0x0);

      for (uint32_t k = 0; k < get_result_count(pool); k++) {
         tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
         tu_cs_emit_qw(cs, query_result_iova(pool, query, k));
         tu_cs_emit_qw(cs, 0x0);
      }
   }
}

void
tu_CmdResetQueryPool(VkCommandBuffer commandBuffer,
                     VkQueryPool queryPool,
                     uint32_t firstQuery,
                     uint32_t queryCount)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);

   switch (pool->type) {
   case VK_QUERY_TYPE_TIMESTAMP:
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      emit_reset_query_pool(cmdbuf, pool, firstQuery, queryCount);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}

static void
emit_begin_occlusion_query(struct tu_cmd_buffer *cmdbuf,
                           struct tu_query_pool *pool,
                           uint32_t query)
{
   /* From the Vulkan 1.1.130 spec:
    *
    *    A query must begin and end inside the same subpass of a render pass
    *    instance, or must both begin and end outside of a render pass
    *    instance.
    *
    * Unlike on an immediate-mode renderer, Turnip renders all tiles on
    * vkCmdEndRenderPass, not individually on each vkCmdDraw*. As such, if a
    * query begins/ends inside the same subpass of a render pass, we need to
    * record the packets on the secondary draw command stream. cmdbuf->draw_cs
    * is then run on every tile during render, so we just need to accumulate
    * sample counts in slot->result to compute the query result.
    */
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;

   uint64_t begin_iova = occlusion_query_iova(pool, query, begin);

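   /* With RB_SAMPLE_COUNT_CONTROL.copy set, the ZPASS_DONE event below makes
    * the hardware write the current sample counter to slot->begin; see the
    * matching sequence in emit_end_occlusion_query().
    */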
   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_ADDR_LO(begin_iova));

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, ZPASS_DONE);
}

static void
emit_begin_xfb_query(struct tu_cmd_buffer *cmdbuf,
                     struct tu_query_pool *pool,
                     uint32_t query,
                     uint32_t stream_id)
{
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
   uint64_t begin_iova = primitive_query_iova(pool, query, begin[0], 0);

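   /* Point VPC_SO_STREAM_COUNTS at slot->begin and ask the hardware to dump
    * the current primitive counters there. WRITE_PRIMITIVE_COUNTS appears to
    * write the written/generated pair for all four streams at once, which is
    * why begin[0] is used here regardless of stream_id.
    */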
   tu_cs_emit_regs(cs, A6XX_VPC_SO_STREAM_COUNTS_LO(begin_iova));
   tu6_emit_event_write(cmdbuf, cs, WRITE_PRIMITIVE_COUNTS);
}

void
tu_CmdBeginQuery(VkCommandBuffer commandBuffer,
                 VkQueryPool queryPool,
                 uint32_t query,
                 VkQueryControlFlags flags)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      /* In freedreno, there is no implementation difference between
       * GL_SAMPLES_PASSED and GL_ANY_SAMPLES_PASSED, so we can similarly
       * ignore the VK_QUERY_CONTROL_PRECISE_BIT flag here.
       */
      emit_begin_occlusion_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      emit_begin_xfb_query(cmdbuf, pool, query, 0);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TIMESTAMP:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}

void
tu_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer,
                           VkQueryPool queryPool,
                           uint32_t query,
                           VkQueryControlFlags flags,
                           uint32_t index)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      emit_begin_xfb_query(cmdbuf, pool, query, index);
      break;
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}

static void
emit_end_occlusion_query(struct tu_cmd_buffer *cmdbuf,
                         struct tu_query_pool *pool,
                         uint32_t query)
{
   /* Ending an occlusion query happens in a few steps:
    *    1) Set the slot->end to UINT64_MAX.
    *    2) Set up the SAMPLE_COUNT registers and trigger a CP_EVENT_WRITE to
    *       write the current sample count value into slot->end.
    *    3) Since (2) is asynchronous, wait until slot->end is not equal to
    *       UINT64_MAX before continuing via CP_WAIT_REG_MEM.
    *    4) Accumulate the results of the query (slot->end - slot->begin) into
    *       slot->result.
    *    5) If vkCmdEndQuery is *not* called from within the scope of a render
    *       pass, set the slot's available bit since the query is now done.
    *    6) If vkCmdEndQuery *is* called from within the scope of a render
    *       pass, we cannot mark as available yet since the commands in
    *       draw_cs are not run until vkCmdEndRenderPass.
    */
   const struct tu_render_pass *pass = cmdbuf->state.pass;
   struct tu_cs *cs = pass ? &cmdbuf->draw_cs : &cmdbuf->cs;

   uint64_t available_iova = query_available_iova(pool, query);
   uint64_t begin_iova = occlusion_query_iova(pool, query, begin);
   uint64_t end_iova = occlusion_query_iova(pool, query, end);
   uint64_t result_iova = query_result_iova(pool, query, 0);
   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit_qw(cs, 0xffffffffffffffffull);

   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_ADDR_LO(end_iova));

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, ZPASS_DONE);

   tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
   tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_NE) |
                  CP_WAIT_REG_MEM_0_POLL_MEMORY);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0xffffffff));
   tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
   tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));

   /* result (dst) = result (srcA) + end (srcB) - begin (srcC) */
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
   tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C);
   tu_cs_emit_qw(cs, result_iova);
   tu_cs_emit_qw(cs, result_iova);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit_qw(cs, begin_iova);

   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   if (pass)
      /* Technically, queries should be tracked per-subpass, but here we track
       * at the render pass level to simplify the code a bit. This is safe
       * because the only commands that use the available bit are
       * vkCmdCopyQueryPoolResults and vkCmdResetQueryPool, both of which
       * cannot be invoked from inside a render pass scope.
       */
      cs = &cmdbuf->draw_epilogue_cs;

   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, available_iova);
   tu_cs_emit_qw(cs, 0x1);
}

static void
emit_end_xfb_query(struct tu_cmd_buffer *cmdbuf,
                   struct tu_query_pool *pool,
                   uint32_t query,
                   uint32_t stream_id)
{
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;

   uint64_t end_iova = primitive_query_iova(pool, query, end[0], 0);
   uint64_t result_written_iova = query_result_iova(pool, query, 0);
   uint64_t result_generated_iova = query_result_iova(pool, query, 1);
   uint64_t begin_written_iova = primitive_query_iova(pool, query, begin[stream_id], 0);
   uint64_t begin_generated_iova = primitive_query_iova(pool, query, begin[stream_id], 1);
   uint64_t end_written_iova = primitive_query_iova(pool, query, end[stream_id], 0);
   uint64_t end_generated_iova = primitive_query_iova(pool, query, end[stream_id], 1);
   uint64_t available_iova = query_available_iova(pool, query);

   tu_cs_emit_regs(cs, A6XX_VPC_SO_STREAM_COUNTS_LO(end_iova));
   tu6_emit_event_write(cmdbuf, cs, WRITE_PRIMITIVE_COUNTS);

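   /* Presumably the WFI plus CACHE_FLUSH_TS ensure the counter writes
    * triggered above have landed before the CP_MEM_TO_MEM packets below read
    * them back.
    */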
   tu_cs_emit_wfi(cs);
   tu6_emit_event_write(cmdbuf, cs, CACHE_FLUSH_TS);

   /* Set the count of written primitives */
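   /* result_written (dst) = result_written (srcA) + end_written (srcB) -
    * begin_written (srcC), mirroring the accumulation done for occlusion
    * queries above.
    */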
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
   tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C |
                  CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES | 0x80000000);
   tu_cs_emit_qw(cs, result_written_iova);
   tu_cs_emit_qw(cs, result_written_iova);
   tu_cs_emit_qw(cs, end_written_iova);
   tu_cs_emit_qw(cs, begin_written_iova);

   tu6_emit_event_write(cmdbuf, cs, CACHE_FLUSH_TS);

   /* Set the count of generated primitives */
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
   tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C |
                  CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES | 0x80000000);
   tu_cs_emit_qw(cs, result_generated_iova);
   tu_cs_emit_qw(cs, result_generated_iova);
   tu_cs_emit_qw(cs, end_generated_iova);
   tu_cs_emit_qw(cs, begin_generated_iova);

   /* Set the availability to 1 */
   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, available_iova);
   tu_cs_emit_qw(cs, 0x1);
}

/* Implement this bit of spec text from section 17.2 "Query Operation":
 *
 *    If queries are used while executing a render pass instance that has
 *    multiview enabled, the query uses N consecutive query indices in the
 *    query pool (starting at query) where N is the number of bits set in the
 *    view mask in the subpass the query is used in. How the numerical
 *    results of the query are distributed among the queries is
 *    implementation-dependent. For example, some implementations may write
 *    each view’s results to a distinct query, while other implementations
 *    may write the total result to the first query and write zero to the
 *    other queries. However, the sum of the results in all the queries must
 *    accurately reflect the total result of the query summed over all views.
 *    Applications can sum the results from all the queries to compute the
 *    total result.
 *
 * Since we execute all views at once, we write zero to the other queries.
 * Furthermore, because queries must be reset before use, and we set the
 * result to 0 in vkCmdResetQueryPool(), we just need to mark it as available.
 */

static void
handle_multiview_queries(struct tu_cmd_buffer *cmd,
                         struct tu_query_pool *pool,
                         uint32_t query)
{
   if (!cmd->state.pass || !cmd->state.subpass->multiview_mask)
      return;

   unsigned views = util_bitcount(cmd->state.subpass->multiview_mask);
   struct tu_cs *cs = &cmd->draw_epilogue_cs;

   for (uint32_t i = 1; i < views; i++) {
      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
      tu_cs_emit_qw(cs, query_available_iova(pool, query + i));
      tu_cs_emit_qw(cs, 0x1);
   }
}

void
tu_CmdEndQuery(VkCommandBuffer commandBuffer,
               VkQueryPool queryPool,
               uint32_t query)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_end_occlusion_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      emit_end_xfb_query(cmdbuf, pool, query, 0);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TIMESTAMP:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   handle_multiview_queries(cmdbuf, pool, query);

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}

void
tu_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer,
                         VkQueryPool queryPool,
                         uint32_t query,
                         uint32_t index)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      assert(index < 4);
      emit_end_xfb_query(cmdbuf, pool, query, index);
      break;
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}

void
tu_CmdWriteTimestamp(VkCommandBuffer commandBuffer,
                     VkPipelineStageFlagBits pipelineStage,
                     VkQueryPool queryPool,
                     uint32_t query)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);

   tu_bo_list_add(&cmd->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);

   /* Inside a render pass, just write the timestamp multiple times so that
    * the user gets the last one if we use GMEM. There isn't really much
    * better we can do, and this seems to be what the blob does too.
    */
   struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs;

   /* Stages that will already have been executed by the time the CP executes
    * the REG_TO_MEM. DrawIndirect parameters are read by the CP, so the draw
    * indirect stage counts as top-of-pipe too.
    */
   VkPipelineStageFlags top_of_pipe_flags =
      VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT |
      VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT;

   if (pipelineStage & ~top_of_pipe_flags) {
      /* Execute a WFI so that all commands complete. Note that CP_REG_TO_MEM
       * does CP_WAIT_FOR_ME internally, which will wait for the WFI to
       * complete.
       *
       * Stalling the CP like this is really unfortunate, but I don't think
       * there's a better solution that allows all 48 bits of precision
       * because CP_EVENT_WRITE doesn't support 64-bit timestamps.
       */
      tu_cs_emit_wfi(cs);
   }

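   /* Copy both halves (LO/HI) of the always-on counter into result slot 0 to
    * form the 64-bit timestamp value.
    */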
   tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
   tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_CP_ALWAYS_ON_COUNTER_LO) |
                  CP_REG_TO_MEM_0_CNT(2) |
                  CP_REG_TO_MEM_0_64B);
   tu_cs_emit_qw(cs, query_result_iova(pool, query, 0));

   /* Only flag availability once the entire renderpass is done, similar to
    * the begin/end path.
    */
   cs = cmd->state.pass ? &cmd->draw_epilogue_cs : &cmd->cs;

   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, query_available_iova(pool, query));
   tu_cs_emit_qw(cs, 0x1);

   /* From the spec for vkCmdWriteTimestamp:
    *
    *    If vkCmdWriteTimestamp is called while executing a render pass
    *    instance that has multiview enabled, the timestamp uses N consecutive
    *    query indices in the query pool (starting at query) where N is the
    *    number of bits set in the view mask of the subpass the command is
    *    executed in. The resulting query values are determined by an
    *    implementation-dependent choice of one of the following behaviors:
    *
    *    -   The first query is a timestamp value and (if more than one bit is
    *        set in the view mask) zero is written to the remaining queries.
    *        If two timestamps are written in the same subpass, the sum of the
    *        execution time of all views between those commands is the
    *        difference between the first query written by each command.
    *
    *    -   All N queries are timestamp values. If two timestamps are written
    *        in the same subpass, the sum of the execution time of all views
    *        between those commands is the sum of the difference between
    *        corresponding queries written by each command. The difference
    *        between corresponding queries may be the execution time of a
    *        single view.
    *
    * We execute all views in the same draw call, so we implement the first
    * option, the same as regular queries.
    */
   handle_multiview_queries(cmd, pool, query);
}