turnip: Properly return VK_DEVICE_LOST on queue submit failures.
[mesa.git] / src / freedreno / vulkan / tu_query.c
/*
 * Copyright 2016 Red Hat Inc.
 * Based on anv:
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include "tu_private.h"

#include <assert.h>
#include <fcntl.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>

#include "registers/adreno_pm4.xml.h"
#include "registers/adreno_common.xml.h"
#include "registers/a6xx.xml.h"

#include "nir/nir_builder.h"
#include "util/os_time.h"

#include "tu_cs.h"

#define NSEC_PER_SEC 1000000000ull
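/* Timeout, in seconds, used when busy-waiting on the CPU for a query slot to
 * become available (see wait_for_available() below).
 */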
#define WAIT_TIMEOUT 5

/* Depending on the query type, there might be 2 integer values.
 * e.g. for VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
 * values[0] is the count of primitives written, values[1] the count generated.
 */
struct PACKED slot_value {
   uint64_t values[2];
};

struct PACKED query_slot {
   struct slot_value available; /* 0 when unavailable, 1 when available */
   struct slot_value result;
};

struct PACKED occlusion_query_slot {
   struct query_slot common;
   struct slot_value begin;
   struct slot_value end;
};

/* The result of transform feedback queries is two integer values:
 * common.result.values[0] is the count of primitives written,
 * common.result.values[1] is the count of primitives generated.
 * In addition, begin/end counters for each of the four streams are stored in
 * their own slots.
 */
struct PACKED primitive_query_slot {
   struct query_slot common;
   struct slot_value begin[4];
   struct slot_value end[4];
};

/* Returns the IOVA of a given uint64_t field in a given slot of a query
 * pool. */
#define query_iova(type, pool, query, field, value_index) \
   pool->bo.iova + pool->stride * query + offsetof(type, field) + \
   offsetof(struct slot_value, values[value_index])

#define occlusion_query_iova(pool, query, field) \
   query_iova(struct occlusion_query_slot, pool, query, field, 0)

#define primitive_query_iova(pool, query, field, i) \
   query_iova(struct primitive_query_slot, pool, query, field, i)

#define query_available_iova(pool, query) \
   query_iova(struct query_slot, pool, query, available, 0)

#define query_result_iova(pool, query, i) \
   query_iova(struct query_slot, pool, query, result, i)

#define query_is_available(slot) slot->available.values[0]

/*
 * Returns a pointer to a given slot in a query pool.
 */
static void* slot_address(struct tu_query_pool *pool, uint32_t query)
{
   return (char*)pool->bo.map + query * pool->stride;
}

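/* Creates a query pool backed by a single BO that holds one slot per query.
 * The whole BO is mapped and zero-initialized so that every query starts out
 * in the unavailable state.
 */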
VkResult
tu_CreateQueryPool(VkDevice _device,
                   const VkQueryPoolCreateInfo *pCreateInfo,
                   const VkAllocationCallbacks *pAllocator,
                   VkQueryPool *pQueryPool)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
   assert(pCreateInfo->queryCount > 0);

   uint32_t slot_size;
   switch (pCreateInfo->queryType) {
   case VK_QUERY_TYPE_OCCLUSION:
      slot_size = sizeof(struct occlusion_query_slot);
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
      slot_size = sizeof(struct query_slot);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      slot_size = sizeof(struct primitive_query_slot);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   struct tu_query_pool *pool =
      vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
                VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);

   if (!pool)
      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

   VkResult result = tu_bo_init_new(device, &pool->bo,
                                    pCreateInfo->queryCount * slot_size);
   if (result != VK_SUCCESS) {
      vk_free2(&device->alloc, pAllocator, pool);
      return result;
   }

   result = tu_bo_map(device, &pool->bo);
   if (result != VK_SUCCESS) {
      tu_bo_finish(device, &pool->bo);
      vk_free2(&device->alloc, pAllocator, pool);
      return result;
   }

   /* Initialize all query statuses to unavailable */
   memset(pool->bo.map, 0, pool->bo.size);

   pool->type = pCreateInfo->queryType;
   pool->stride = slot_size;
   pool->size = pCreateInfo->queryCount;
   pool->pipeline_statistics = pCreateInfo->pipelineStatistics;
   *pQueryPool = tu_query_pool_to_handle(pool);

   return VK_SUCCESS;
}

void
tu_DestroyQueryPool(VkDevice _device,
                    VkQueryPool _pool,
                    const VkAllocationCallbacks *pAllocator)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   TU_FROM_HANDLE(tu_query_pool, pool, _pool);

   if (!pool)
      return;

   tu_bo_finish(device, &pool->bo);
   vk_free2(&device->alloc, pAllocator, pool);
}

static uint32_t
get_result_count(struct tu_query_pool *pool)
{
   switch (pool->type) {
   /* Occlusion and timestamp queries write one integer value */
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
      return 1;
   /* Transform feedback queries write two integer values */
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      return 2;
   default:
      assert(!"Invalid query type");
      return 0;
   }
}

/* Waits on the availability status of a query, up to a fixed timeout. */
static VkResult
wait_for_available(struct tu_device *device, struct tu_query_pool *pool,
                   uint32_t query)
{
   /* TODO: Use the MSM_IOVA_WAIT ioctl to wait on the available bit in a
    * scheduler friendly way instead of busy polling once the patch has landed
    * upstream. */
   struct query_slot *slot = slot_address(pool, query);
   uint64_t abs_timeout = os_time_get_absolute_timeout(
         WAIT_TIMEOUT * NSEC_PER_SEC);
   while (os_time_get_nano() < abs_timeout) {
      if (query_is_available(slot))
         return VK_SUCCESS;
   }
   return vk_error(device->instance, VK_TIMEOUT);
}

/* Writes a query value to a buffer from the CPU. */
static void
write_query_value_cpu(char* base,
                      uint32_t offset,
                      uint64_t value,
                      VkQueryResultFlags flags)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      *(uint64_t*)(base + (offset * sizeof(uint64_t))) = value;
   } else {
      *(uint32_t*)(base + (offset * sizeof(uint32_t))) = value;
   }
}

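/* CPU-side readback: walks each requested slot in the mapped query BO and
 * writes results to pData, honoring the WAIT, PARTIAL, WITH_AVAILABILITY and
 * 64_BIT flags as described by the inline spec quotes below.
 */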
static VkResult
get_query_pool_results(struct tu_device *device,
                       struct tu_query_pool *pool,
                       uint32_t firstQuery,
                       uint32_t queryCount,
                       size_t dataSize,
                       void *pData,
                       VkDeviceSize stride,
                       VkQueryResultFlags flags)
{
   assert(dataSize >= stride * queryCount);

   char *result_base = pData;
   VkResult result = VK_SUCCESS;
   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;
      struct query_slot *slot = slot_address(pool, query);
      bool available = query_is_available(slot);
      uint32_t result_count = get_result_count(pool);

      if ((flags & VK_QUERY_RESULT_WAIT_BIT) && !available) {
         VkResult wait_result = wait_for_available(device, pool, query);
         if (wait_result != VK_SUCCESS)
            return wait_result;
         available = true;
      } else if (!(flags & VK_QUERY_RESULT_PARTIAL_BIT) && !available) {
         /* From the Vulkan 1.1.130 spec:
          *
          * If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
          * both not set then no result values are written to pData for
          * queries that are in the unavailable state at the time of the
          * call, and vkGetQueryPoolResults returns VK_NOT_READY. However,
          * availability state is still written to pData for those queries
          * if VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set.
          */
         result = VK_NOT_READY;
         if (!(flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)) {
            result_base += stride;
            continue;
         }
      }

      for (uint32_t k = 0; k < result_count; k++) {
         if (available)
            write_query_value_cpu(result_base, k, slot->result.values[k], flags);
         else if (flags & VK_QUERY_RESULT_PARTIAL_BIT)
            /* From the Vulkan 1.1.130 spec:
             *
             * If VK_QUERY_RESULT_PARTIAL_BIT is set, VK_QUERY_RESULT_WAIT_BIT
             * is not set, and the query’s status is unavailable, an
             * intermediate result value between zero and the final result
             * value is written to pData for that query.
             *
             * Just return 0 here for simplicity since it's a valid result.
             */
            write_query_value_cpu(result_base, k, 0, flags);
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
         /* From the Vulkan 1.1.130 spec:
          *
          * If VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set, the final
          * integer value written for each query is non-zero if the query’s
          * status was available or zero if the status was unavailable.
          */
         write_query_value_cpu(result_base, result_count, available, flags);

      result_base += stride;
   }
   return result;
}

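/* Entry point for vkGetQueryPoolResults. If the device has been marked lost
 * (e.g. after a failed queue submission), VK_ERROR_DEVICE_LOST is returned
 * before attempting any readback.
 */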
VkResult
tu_GetQueryPoolResults(VkDevice _device,
                       VkQueryPool queryPool,
                       uint32_t firstQuery,
                       uint32_t queryCount,
                       size_t dataSize,
                       void *pData,
                       VkDeviceSize stride,
                       VkQueryResultFlags flags)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(firstQuery + queryCount <= pool->size);

   if (tu_device_is_lost(device))
      return VK_ERROR_DEVICE_LOST;

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      return get_query_pool_results(device, pool, firstQuery, queryCount,
                                    dataSize, pData, stride, flags);
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }
   return VK_SUCCESS;
}

/* Copies a query value from one buffer to another from the GPU. */
static void
copy_query_value_gpu(struct tu_cmd_buffer *cmdbuf,
                     struct tu_cs *cs,
                     uint64_t src_iova,
                     uint64_t base_write_iova,
                     uint32_t offset,
                     VkQueryResultFlags flags)
{
   uint32_t element_size = flags & VK_QUERY_RESULT_64_BIT ?
         sizeof(uint64_t) : sizeof(uint32_t);
   uint64_t write_iova = base_write_iova + (offset * element_size);

   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
   uint32_t mem_to_mem_flags = flags & VK_QUERY_RESULT_64_BIT ?
         CP_MEM_TO_MEM_0_DOUBLE : 0;
   tu_cs_emit(cs, mem_to_mem_flags);
   tu_cs_emit_qw(cs, write_iova);
   tu_cs_emit_qw(cs, src_iova);
}

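/* Emits GPU commands that copy query results from the pool BO into the
 * destination buffer, handling the WAIT, PARTIAL and WITH_AVAILABILITY flags
 * entirely on the GPU.
 */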
static void
emit_copy_query_pool_results(struct tu_cmd_buffer *cmdbuf,
                             struct tu_cs *cs,
                             struct tu_query_pool *pool,
                             uint32_t firstQuery,
                             uint32_t queryCount,
                             struct tu_buffer *buffer,
                             VkDeviceSize dstOffset,
                             VkDeviceSize stride,
                             VkQueryResultFlags flags)
{
   /* From the Vulkan 1.1.130 spec:
    *
    * vkCmdCopyQueryPoolResults is guaranteed to see the effect of previous
    * uses of vkCmdResetQueryPool in the same queue, without any additional
    * synchronization.
    *
    * To ensure that previous writes to the available bit are coherent, first
    * wait for all writes to complete.
    */
   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;
      uint64_t available_iova = query_available_iova(pool, query);
      uint64_t buffer_iova = tu_buffer_iova(buffer) + dstOffset + i * stride;
      uint32_t result_count = get_result_count(pool);

      /* Wait for the available bit to be set if executed with the
       * VK_QUERY_RESULT_WAIT_BIT flag. */
      if (flags & VK_QUERY_RESULT_WAIT_BIT) {
         tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
         tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
                        CP_WAIT_REG_MEM_0_POLL_MEMORY);
         tu_cs_emit_qw(cs, available_iova);
         tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0x1));
         tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
         tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));
      }

      for (uint32_t k = 0; k < result_count; k++) {
         uint64_t result_iova = query_result_iova(pool, query, k);

         if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
            /* Unconditionally copying the bo->result into the buffer here is
             * valid because we only set bo->result on vkCmdEndQuery. Thus, even
             * if the query is unavailable, this will copy the correct partial
             * value of 0.
             */
            copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
                                 k /* offset */, flags);
         } else {
            /* Conditionally copy bo->result into the buffer based on whether the
             * query is available.
             *
             * NOTE: For the conditional packets to be executed, CP_COND_EXEC
             * tests that ADDR0 != 0 and ADDR1 < REF. The packet here simply tests
             * that 0 < available < 2, aka available == 1.
             */
            tu_cs_reserve(cs, 7 + 6);
            tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6);
            tu_cs_emit_qw(cs, available_iova);
            tu_cs_emit_qw(cs, available_iova);
            tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2));
            tu_cs_emit(cs, 6); /* Cond execute the next 6 DWORDS */

            /* Start of conditional execution */
            copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
                                 k /* offset */, flags);
            /* End of conditional execution */
         }
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         copy_query_value_gpu(cmdbuf, cs, available_iova, buffer_iova,
                              result_count /* offset */, flags);
      }
   }

   tu_bo_list_add(&cmdbuf->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
}

void
tu_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,
                           VkQueryPool queryPool,
                           uint32_t firstQuery,
                           uint32_t queryCount,
                           VkBuffer dstBuffer,
                           VkDeviceSize dstOffset,
                           VkDeviceSize stride,
                           VkQueryResultFlags flags)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
   struct tu_cs *cs = &cmdbuf->cs;
   assert(firstQuery + queryCount <= pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      return emit_copy_query_pool_results(cmdbuf, cs, pool, firstQuery,
                                          queryCount, buffer, dstOffset,
                                          stride, flags);
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }
}

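/* Emits CP_MEM_WRITE packets that clear the availability bit and both result
 * values of each slot in the range, returning those queries to the
 * unavailable state.
 */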
static void
emit_reset_query_pool(struct tu_cmd_buffer *cmdbuf,
                      struct tu_query_pool *pool,
                      uint32_t firstQuery,
                      uint32_t queryCount)
{
   struct tu_cs *cs = &cmdbuf->cs;

   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;

      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
      tu_cs_emit_qw(cs, query_available_iova(pool, query));
      tu_cs_emit_qw(cs, 0x0);

      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
      tu_cs_emit_qw(cs, query_result_iova(pool, query, 0));
      tu_cs_emit_qw(cs, 0x0);
      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
      tu_cs_emit_qw(cs, query_result_iova(pool, query, 1));
      tu_cs_emit_qw(cs, 0x0);
   }
}

void
tu_CmdResetQueryPool(VkCommandBuffer commandBuffer,
                     VkQueryPool queryPool,
                     uint32_t firstQuery,
                     uint32_t queryCount)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);

   switch (pool->type) {
   case VK_QUERY_TYPE_TIMESTAMP:
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      emit_reset_query_pool(cmdbuf, pool, firstQuery, queryCount);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}

static void
emit_begin_occlusion_query(struct tu_cmd_buffer *cmdbuf,
                           struct tu_query_pool *pool,
                           uint32_t query)
{
   /* From the Vulkan 1.1.130 spec:
    *
    * A query must begin and end inside the same subpass of a render pass
    * instance, or must both begin and end outside of a render pass
    * instance.
    *
    * Unlike on an immediate-mode renderer, Turnip renders all tiles on
    * vkCmdEndRenderPass, not individually on each vkCmdDraw*. As such, if a
    * query begins/ends inside the same subpass of a render pass, we need to
    * record the packets on the secondary draw command stream. cmdbuf->draw_cs
    * is then run on every tile during render, so we just need to accumulate
    * sample counts in slot->result to compute the query result.
    */
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;

   uint64_t begin_iova = occlusion_query_iova(pool, query, begin);

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_ADDR_LO(begin_iova));

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, ZPASS_DONE);
}

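/* Note: the WRITE_PRIMITIVE_COUNTS event appears to dump the counters for all
 * streams starting at the programmed VPC_SO_STREAM_COUNTS address, which is
 * presumably why begin[0] is used as the base below regardless of stream_id.
 */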
static void
emit_begin_xfb_query(struct tu_cmd_buffer *cmdbuf,
                     struct tu_query_pool *pool,
                     uint32_t query,
                     uint32_t stream_id)
{
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
   uint64_t begin_iova = primitive_query_iova(pool, query, begin[0], 0);

   tu_cs_emit_regs(cs, A6XX_VPC_SO_STREAM_COUNTS_LO(begin_iova));
   tu6_emit_event_write(cmdbuf, cs, WRITE_PRIMITIVE_COUNTS);
}

void
tu_CmdBeginQuery(VkCommandBuffer commandBuffer,
                 VkQueryPool queryPool,
                 uint32_t query,
                 VkQueryControlFlags flags)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      /* In freedreno, there is no implementation difference between
       * GL_SAMPLES_PASSED and GL_ANY_SAMPLES_PASSED, so we can similarly
       * ignore the VK_QUERY_CONTROL_PRECISE_BIT flag here.
       */
      emit_begin_occlusion_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      emit_begin_xfb_query(cmdbuf, pool, query, 0);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TIMESTAMP:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}

void
tu_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer,
                           VkQueryPool queryPool,
                           uint32_t query,
                           VkQueryControlFlags flags,
                           uint32_t index)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      emit_begin_xfb_query(cmdbuf, pool, query, index);
      break;
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}

static void
emit_end_occlusion_query(struct tu_cmd_buffer *cmdbuf,
                         struct tu_query_pool *pool,
                         uint32_t query)
{
   /* Ending an occlusion query happens in a few steps:
    * 1) Set the slot->end to UINT64_MAX.
    * 2) Set up the SAMPLE_COUNT registers and trigger a CP_EVENT_WRITE to
    *    write the current sample count value into slot->end.
    * 3) Since (2) is asynchronous, wait until slot->end is not equal to
    *    UINT64_MAX before continuing via CP_WAIT_REG_MEM.
    * 4) Accumulate the results of the query (slot->end - slot->begin) into
    *    slot->result.
    * 5) If vkCmdEndQuery is *not* called from within the scope of a render
    *    pass, set the slot's available bit since the query is now done.
    * 6) If vkCmdEndQuery *is* called from within the scope of a render
    *    pass, we cannot mark as available yet since the commands in
    *    draw_cs are not run until vkCmdEndRenderPass.
    */
   const struct tu_render_pass *pass = cmdbuf->state.pass;
   struct tu_cs *cs = pass ? &cmdbuf->draw_cs : &cmdbuf->cs;

   uint64_t available_iova = query_available_iova(pool, query);
   uint64_t begin_iova = occlusion_query_iova(pool, query, begin);
   uint64_t end_iova = occlusion_query_iova(pool, query, end);
   uint64_t result_iova = query_result_iova(pool, query, 0);
   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit_qw(cs, 0xffffffffffffffffull);

   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_ADDR_LO(end_iova));

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, ZPASS_DONE);

   tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
   tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_NE) |
                  CP_WAIT_REG_MEM_0_POLL_MEMORY);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0xffffffff));
   tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
   tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));

   /* result (dst) = result (srcA) + end (srcB) - begin (srcC) */
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
   tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C);
   tu_cs_emit_qw(cs, result_iova);
   tu_cs_emit_qw(cs, result_iova);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit_qw(cs, begin_iova);

   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   if (pass)
      /* Technically, queries should be tracked per-subpass, but here we track
       * at the render pass level to simplify the code a bit. This is safe
       * because the only commands that use the available bit are
       * vkCmdCopyQueryPoolResults and vkCmdResetQueryPool, both of which
       * cannot be invoked from inside a render pass scope.
       */
      cs = &cmdbuf->draw_epilogue_cs;

   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, available_iova);
   tu_cs_emit_qw(cs, 0x1);
}

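/* Ends a transform feedback query: snapshots the stream counters into the
 * end slots, accumulates the written/generated primitive counts for the given
 * stream as (end - begin) into the result values, and finally marks the query
 * as available.
 */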
static void
emit_end_xfb_query(struct tu_cmd_buffer *cmdbuf,
                   struct tu_query_pool *pool,
                   uint32_t query,
                   uint32_t stream_id)
{
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;

   uint64_t end_iova = primitive_query_iova(pool, query, end[0], 0);
   uint64_t result_written_iova = query_result_iova(pool, query, 0);
   uint64_t result_generated_iova = query_result_iova(pool, query, 1);
   uint64_t begin_written_iova = primitive_query_iova(pool, query, begin[stream_id], 0);
   uint64_t begin_generated_iova = primitive_query_iova(pool, query, begin[stream_id], 1);
   uint64_t end_written_iova = primitive_query_iova(pool, query, end[stream_id], 0);
   uint64_t end_generated_iova = primitive_query_iova(pool, query, end[stream_id], 1);
   uint64_t available_iova = query_available_iova(pool, query);

   tu_cs_emit_regs(cs, A6XX_VPC_SO_STREAM_COUNTS_LO(end_iova));
   tu6_emit_event_write(cmdbuf, cs, WRITE_PRIMITIVE_COUNTS);

   tu_cs_emit_wfi(cs);
   tu6_emit_event_write(cmdbuf, cs, CACHE_FLUSH_TS);

   /* Set the count of written primitives */
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
   tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C |
                  CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES | 0x80000000);
   tu_cs_emit_qw(cs, result_written_iova);
   tu_cs_emit_qw(cs, result_written_iova);
   tu_cs_emit_qw(cs, end_written_iova);
   tu_cs_emit_qw(cs, begin_written_iova);

   tu6_emit_event_write(cmdbuf, cs, CACHE_FLUSH_TS);

   /* Set the count of generated primitives */
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
   tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C |
                  CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES | 0x80000000);
   tu_cs_emit_qw(cs, result_generated_iova);
   tu_cs_emit_qw(cs, result_generated_iova);
   tu_cs_emit_qw(cs, end_generated_iova);
   tu_cs_emit_qw(cs, begin_generated_iova);

   /* Set the availability to 1 */
   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, available_iova);
   tu_cs_emit_qw(cs, 0x1);
}

void
tu_CmdEndQuery(VkCommandBuffer commandBuffer,
               VkQueryPool queryPool,
               uint32_t query)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_end_occlusion_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      emit_end_xfb_query(cmdbuf, pool, query, 0);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TIMESTAMP:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}

void
tu_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer,
                         VkQueryPool queryPool,
                         uint32_t query,
                         uint32_t index)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      assert(index < 4);
      emit_end_xfb_query(cmdbuf, pool, query, index);
      break;
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}

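/* Writes the GPU's always-on counter value as the timestamp. The
 * pipelineStage argument is currently ignored; the WFI emitted below makes
 * this behave roughly like a bottom-of-pipe timestamp.
 */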
void
tu_CmdWriteTimestamp(VkCommandBuffer commandBuffer,
                     VkPipelineStageFlagBits pipelineStage,
                     VkQueryPool queryPool,
                     uint32_t query)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   struct tu_cs *cs = cmd->state.pass ? &cmd->draw_epilogue_cs : &cmd->cs;

   /* WFI to get more accurate timestamp */
   tu_cs_emit_wfi(cs);

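   /* Copy the 64-bit CP always-on counter value into the query's result slot,
    * then mark the query as available.
    */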
   tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
   tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_CP_ALWAYS_ON_COUNTER_LO) |
                  CP_REG_TO_MEM_0_CNT(2) |
                  CP_REG_TO_MEM_0_64B);
   tu_cs_emit_qw(cs, query_result_iova(pool, query, 0));

   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, query_available_iova(pool, query));
   tu_cs_emit_qw(cs, 0x1);

   if (cmd->state.pass) {
      /* TODO: to have useful in-renderpass timestamps:
       * for the sysmem path, we could just emit the timestamp in draw_cs,
       * for a gmem render pass, we would have to do something with
       * accumulation, but it is not clear that would follow the spec.
       */
      tu_finishme("CmdWriteTimestamp in renderpass not accurate");
   }
}