turnip: make the struct slot_value of queries hold 2 values
mesa.git: src/freedreno/vulkan/tu_query.c
/*
 * Copyright 2016 Red Hat Inc.
 * Based on anv:
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include "tu_private.h"

#include <assert.h>
#include <fcntl.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>

#include "registers/adreno_pm4.xml.h"
#include "registers/adreno_common.xml.h"
#include "registers/a6xx.xml.h"

#include "nir/nir_builder.h"
#include "util/os_time.h"

#include "tu_cs.h"

#define NSEC_PER_SEC 1000000000ull
#define WAIT_TIMEOUT 5

/* Depending on the query type, a slot may hold up to two integer values,
 * e.g. for VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
 * values[0]: primitives written, values[1]: primitives generated.
 */
struct PACKED slot_value {
   uint64_t values[2];
};

struct PACKED query_slot {
   struct slot_value available; /* 0 when unavailable, 1 when available */
   struct slot_value result;
};

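/* For occlusion queries, begin/end hold the raw sample counter values
 * captured at query begin/end; (end - begin) is accumulated into
 * common.result so that per-tile passes sum correctly.
 */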
struct PACKED occlusion_query_slot {
   struct query_slot common;
   struct slot_value begin;
   struct slot_value end;
};

/* Returns the IOVA of a given uint64_t field in a given slot of a query
 * pool. */
#define query_iova(type, pool, query, field, value_index)             \
   pool->bo.iova + pool->stride * query + offsetof(type, field) +     \
   offsetof(struct slot_value, values[value_index])

#define occlusion_query_iova(pool, query, field)                      \
   query_iova(struct occlusion_query_slot, pool, query, field, 0)

#define query_available_iova(pool, query)                             \
   query_iova(struct query_slot, pool, query, available, 0)

#define query_result_iova(pool, query, i)                             \
   query_iova(struct query_slot, pool, query, result, i)

#define query_is_available(slot) slot->available.values[0]

/*
 * Returns a pointer to a given slot in a query pool.
 */
static void* slot_address(struct tu_query_pool *pool, uint32_t query)
{
   return (char*)pool->bo.map + query * pool->stride;
}

VkResult
tu_CreateQueryPool(VkDevice _device,
                   const VkQueryPoolCreateInfo *pCreateInfo,
                   const VkAllocationCallbacks *pAllocator,
                   VkQueryPool *pQueryPool)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
   assert(pCreateInfo->queryCount > 0);

   uint32_t slot_size;
   switch (pCreateInfo->queryType) {
   case VK_QUERY_TYPE_OCCLUSION:
      slot_size = sizeof(struct occlusion_query_slot);
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
      slot_size = sizeof(struct query_slot);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   struct tu_query_pool *pool =
         vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
                   VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);

   if (!pool)
      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

   VkResult result = tu_bo_init_new(device, &pool->bo,
                                    pCreateInfo->queryCount * slot_size);
   if (result != VK_SUCCESS) {
      vk_free2(&device->alloc, pAllocator, pool);
      return result;
   }

   result = tu_bo_map(device, &pool->bo);
   if (result != VK_SUCCESS) {
      tu_bo_finish(device, &pool->bo);
      vk_free2(&device->alloc, pAllocator, pool);
      return result;
   }

   /* Initialize all query statuses to unavailable */
   memset(pool->bo.map, 0, pool->bo.size);

   pool->type = pCreateInfo->queryType;
   pool->stride = slot_size;
   pool->size = pCreateInfo->queryCount;
   pool->pipeline_statistics = pCreateInfo->pipelineStatistics;
   *pQueryPool = tu_query_pool_to_handle(pool);

   return VK_SUCCESS;
}

void
tu_DestroyQueryPool(VkDevice _device,
                    VkQueryPool _pool,
                    const VkAllocationCallbacks *pAllocator)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   TU_FROM_HANDLE(tu_query_pool, pool, _pool);

   if (!pool)
      return;

   tu_bo_finish(device, &pool->bo);
   vk_free2(&device->alloc, pAllocator, pool);
}

static uint32_t
get_result_count(struct tu_query_pool *pool)
{
   switch (pool->type) {
   /* Occlusion and timestamp queries write one integer value */
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
      return 1;
   /* Transform feedback queries write two integer values */
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      return 2;
   default:
      assert(!"Invalid query type");
      return 0;
   }
}

/* Wait on the availability status of a query up until a timeout. */
static VkResult
wait_for_available(struct tu_device *device, struct tu_query_pool *pool,
                   uint32_t query)
{
   /* TODO: Use the MSM_IOVA_WAIT ioctl to wait on the available bit in a
    * scheduler friendly way instead of busy polling once the patch has landed
    * upstream. */
   struct query_slot *slot = slot_address(pool, query);
   uint64_t abs_timeout = os_time_get_absolute_timeout(
         WAIT_TIMEOUT * NSEC_PER_SEC);
   while(os_time_get_nano() < abs_timeout) {
      if (query_is_available(slot))
         return VK_SUCCESS;
   }
   return vk_error(device->instance, VK_TIMEOUT);
}

/* Writes a query value to a buffer from the CPU. */
static void
write_query_value_cpu(char* base,
                      uint32_t offset,
                      uint64_t value,
                      VkQueryResultFlags flags)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      *(uint64_t*)(base + (offset * sizeof(uint64_t))) = value;
   } else {
      *(uint32_t*)(base + (offset * sizeof(uint32_t))) = value;
   }
}

static VkResult
get_query_pool_results(struct tu_device *device,
                       struct tu_query_pool *pool,
                       uint32_t firstQuery,
                       uint32_t queryCount,
                       size_t dataSize,
                       void *pData,
                       VkDeviceSize stride,
                       VkQueryResultFlags flags)
{
   assert(dataSize >= stride * queryCount);

   char *result_base = pData;
   VkResult result = VK_SUCCESS;
   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;
      struct query_slot *slot = slot_address(pool, query);
      bool available = query_is_available(slot);
      uint32_t result_count = get_result_count(pool);

      if ((flags & VK_QUERY_RESULT_WAIT_BIT) && !available) {
         VkResult wait_result = wait_for_available(device, pool, query);
         if (wait_result != VK_SUCCESS)
            return wait_result;
         available = true;
      } else if (!(flags & VK_QUERY_RESULT_PARTIAL_BIT) && !available) {
         /* From the Vulkan 1.1.130 spec:
          *
          *    If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
          *    both not set then no result values are written to pData for
          *    queries that are in the unavailable state at the time of the
          *    call, and vkGetQueryPoolResults returns VK_NOT_READY. However,
          *    availability state is still written to pData for those queries
          *    if VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set.
          */
         result = VK_NOT_READY;
         if (!(flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)) {
            result_base += stride;
            continue;
         }
      }

      for (uint32_t k = 0; k < result_count; k++) {
         if (available)
            write_query_value_cpu(result_base, k, slot->result.values[k], flags);
         else if (flags & VK_QUERY_RESULT_PARTIAL_BIT)
            /* From the Vulkan 1.1.130 spec:
             *
             *    If VK_QUERY_RESULT_PARTIAL_BIT is set, VK_QUERY_RESULT_WAIT_BIT
             *    is not set, and the query’s status is unavailable, an
             *    intermediate result value between zero and the final result
             *    value is written to pData for that query.
             *
             * Just return 0 here for simplicity since it's a valid result.
             */
            write_query_value_cpu(result_base, k, 0, flags);
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
         /* From the Vulkan 1.1.130 spec:
          *
          *    If VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set, the final
          *    integer value written for each query is non-zero if the query’s
          *    status was available or zero if the status was unavailable.
          */
         write_query_value_cpu(result_base, result_count, available, flags);

      result_base += stride;
   }
   return result;
}

VkResult
tu_GetQueryPoolResults(VkDevice _device,
                       VkQueryPool queryPool,
                       uint32_t firstQuery,
                       uint32_t queryCount,
                       size_t dataSize,
                       void *pData,
                       VkDeviceSize stride,
                       VkQueryResultFlags flags)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(firstQuery + queryCount <= pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
      return get_query_pool_results(device, pool, firstQuery, queryCount,
                                    dataSize, pData, stride, flags);
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }
   return VK_SUCCESS;
}

/* Copies a query value from one buffer to another from the GPU. */
static void
copy_query_value_gpu(struct tu_cmd_buffer *cmdbuf,
                     struct tu_cs *cs,
                     uint64_t src_iova,
                     uint64_t base_write_iova,
                     uint32_t offset,
                     VkQueryResultFlags flags) {
   uint32_t element_size = flags & VK_QUERY_RESULT_64_BIT ?
         sizeof(uint64_t) : sizeof(uint32_t);
   uint64_t write_iova = base_write_iova + (offset * element_size);

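   /* CP_MEM_TO_MEM payload: flags dword, then destination IOVA, then source
    * IOVA. DOUBLE requests a 64-bit copy to match VK_QUERY_RESULT_64_BIT.
    */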
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
   uint32_t mem_to_mem_flags = flags & VK_QUERY_RESULT_64_BIT ?
         CP_MEM_TO_MEM_0_DOUBLE : 0;
   tu_cs_emit(cs, mem_to_mem_flags);
   tu_cs_emit_qw(cs, write_iova);
   tu_cs_emit_qw(cs, src_iova);
}

static void
emit_copy_query_pool_results(struct tu_cmd_buffer *cmdbuf,
                             struct tu_cs *cs,
                             struct tu_query_pool *pool,
                             uint32_t firstQuery,
                             uint32_t queryCount,
                             struct tu_buffer *buffer,
                             VkDeviceSize dstOffset,
                             VkDeviceSize stride,
                             VkQueryResultFlags flags)
{
   /* From the Vulkan 1.1.130 spec:
    *
    *    vkCmdCopyQueryPoolResults is guaranteed to see the effect of previous
    *    uses of vkCmdResetQueryPool in the same queue, without any additional
    *    synchronization.
    *
    * To ensure that previous writes to the available bit are coherent, first
    * wait for all writes to complete.
    */
   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;
      uint64_t available_iova = query_available_iova(pool, query);
      uint64_t buffer_iova = tu_buffer_iova(buffer) + dstOffset + i * stride;
      uint32_t result_count = get_result_count(pool);

      /* Wait for the available bit to be set if executed with the
       * VK_QUERY_RESULT_WAIT_BIT flag. */
      if (flags & VK_QUERY_RESULT_WAIT_BIT) {
         tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
         tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
                        CP_WAIT_REG_MEM_0_POLL_MEMORY);
         tu_cs_emit_qw(cs, available_iova);
         tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0x1));
         tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
         tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));
      }

      for (uint32_t k = 0; k < result_count; k++) {
         uint64_t result_iova = query_result_iova(pool, query, k);

         if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
            /* Unconditionally copying the bo->result into the buffer here is
             * valid because we only set bo->result on vkCmdEndQuery. Thus, even
             * if the query is unavailable, this will copy the correct partial
             * value of 0.
             */
            copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
                                 k /* offset */, flags);
         } else {
            /* Conditionally copy bo->result into the buffer based on whether the
             * query is available.
             *
             * NOTE: For the conditional packets to be executed, CP_COND_EXEC
             * tests that ADDR0 != 0 and ADDR1 < REF. The packet here simply tests
             * that 0 < available < 2, aka available == 1.
             */
            tu_cs_reserve(cs, 7 + 6);
            tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6);
            tu_cs_emit_qw(cs, available_iova);
            tu_cs_emit_qw(cs, available_iova);
            tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2));
            tu_cs_emit(cs, 6); /* Cond execute the next 6 DWORDS */

            /* Start of conditional execution */
            copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
                                 k /* offset */, flags);
            /* End of conditional execution */
         }
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         copy_query_value_gpu(cmdbuf, cs, available_iova, buffer_iova,
                              result_count /* offset */, flags);
      }
   }

   tu_bo_list_add(&cmdbuf->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
}

void
tu_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,
                           VkQueryPool queryPool,
                           uint32_t firstQuery,
                           uint32_t queryCount,
                           VkBuffer dstBuffer,
                           VkDeviceSize dstOffset,
                           VkDeviceSize stride,
                           VkQueryResultFlags flags)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
   struct tu_cs *cs = &cmdbuf->cs;
   assert(firstQuery + queryCount <= pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
      return emit_copy_query_pool_results(cmdbuf, cs, pool, firstQuery,
                                          queryCount, buffer, dstOffset,
                                          stride, flags);
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }
}

static void
emit_reset_query_pool(struct tu_cmd_buffer *cmdbuf,
                      struct tu_query_pool *pool,
                      uint32_t firstQuery,
                      uint32_t queryCount)
{
   struct tu_cs *cs = &cmdbuf->cs;

   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;

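      /* Clear the availability flag and both 64-bit result values so the
       * slot reads back as unavailable with a zeroed result.
       */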
      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
      tu_cs_emit_qw(cs, query_available_iova(pool, query));
      tu_cs_emit_qw(cs, 0x0);

      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
      tu_cs_emit_qw(cs, query_result_iova(pool, query, 0));
      tu_cs_emit_qw(cs, 0x0);
      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
      tu_cs_emit_qw(cs, query_result_iova(pool, query, 1));
      tu_cs_emit_qw(cs, 0x0);
   }
}

void
tu_CmdResetQueryPool(VkCommandBuffer commandBuffer,
                     VkQueryPool queryPool,
                     uint32_t firstQuery,
                     uint32_t queryCount)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);

   switch (pool->type) {
   case VK_QUERY_TYPE_TIMESTAMP:
   case VK_QUERY_TYPE_OCCLUSION:
      emit_reset_query_pool(cmdbuf, pool, firstQuery, queryCount);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}

static void
emit_begin_occlusion_query(struct tu_cmd_buffer *cmdbuf,
                           struct tu_query_pool *pool,
                           uint32_t query)
{
   /* From the Vulkan 1.1.130 spec:
    *
    *    A query must begin and end inside the same subpass of a render pass
    *    instance, or must both begin and end outside of a render pass
    *    instance.
    *
    * Unlike on an immediate-mode renderer, Turnip renders all tiles on
    * vkCmdEndRenderPass, not individually on each vkCmdDraw*. As such, if a
    * query begins/ends inside the same subpass of a render pass, we need to
    * record the packets on the secondary draw command stream. cmdbuf->draw_cs
    * is then run on every tile during render, so we just need to accumulate
    * sample counts in slot->result to compute the query result.
    */
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;

   uint64_t begin_iova = occlusion_query_iova(pool, query, begin);

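   /* Point the sample-count copy destination at slot->begin and fire a
    * ZPASS_DONE event so the HW writes the current sample count there.
    */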
   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_ADDR_LO(begin_iova));

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, ZPASS_DONE);
}

void
tu_CmdBeginQuery(VkCommandBuffer commandBuffer,
                 VkQueryPool queryPool,
                 uint32_t query,
                 VkQueryControlFlags flags)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      /* In freedreno, there is no implementation difference between
       * GL_SAMPLES_PASSED and GL_ANY_SAMPLES_PASSED, so we can similarly
       * ignore the VK_QUERY_CONTROL_PRECISE_BIT flag here.
       */
      emit_begin_occlusion_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TIMESTAMP:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}

static void
emit_end_occlusion_query(struct tu_cmd_buffer *cmdbuf,
                         struct tu_query_pool *pool,
                         uint32_t query)
{
   /* Ending an occlusion query happens in a few steps:
    *    1) Set the slot->end to UINT64_MAX.
    *    2) Set up the SAMPLE_COUNT registers and trigger a CP_EVENT_WRITE to
    *       write the current sample count value into slot->end.
    *    3) Since (2) is asynchronous, wait until slot->end is not equal to
    *       UINT64_MAX before continuing via CP_WAIT_REG_MEM.
    *    4) Accumulate the results of the query (slot->end - slot->begin) into
    *       slot->result.
    *    5) If vkCmdEndQuery is *not* called from within the scope of a render
    *       pass, set the slot's available bit since the query is now done.
    *    6) If vkCmdEndQuery *is* called from within the scope of a render
    *       pass, we cannot mark as available yet since the commands in
    *       draw_cs are not run until vkCmdEndRenderPass.
    */
   const struct tu_render_pass *pass = cmdbuf->state.pass;
   struct tu_cs *cs = pass ? &cmdbuf->draw_cs : &cmdbuf->cs;

   uint64_t available_iova = query_available_iova(pool, query);
   uint64_t begin_iova = occlusion_query_iova(pool, query, begin);
   uint64_t end_iova = occlusion_query_iova(pool, query, end);
   uint64_t result_iova = query_result_iova(pool, query, 0);
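   /* Steps (1) and (2): mark slot->end with UINT64_MAX, then ask the HW to
    * overwrite it with the current sample count via ZPASS_DONE.
    */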
   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit_qw(cs, 0xffffffffffffffffull);

   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_ADDR_LO(end_iova));

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, ZPASS_DONE);

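   /* Step (3): wait until the UINT64_MAX marker in slot->end has been
    * replaced with the real sample count before accumulating.
    */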
   tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
   tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_NE) |
                  CP_WAIT_REG_MEM_0_POLL_MEMORY);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0xffffffff));
   tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
   tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));

   /* result (dst) = result (srcA) + end (srcB) - begin (srcC) */
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
   tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C);
   tu_cs_emit_qw(cs, result_iova);
   tu_cs_emit_qw(cs, result_iova);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit_qw(cs, begin_iova);

   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   if (pass)
      /* Technically, queries should be tracked per-subpass, but here we track
       * at the render pass level to simplify the code a bit. This is safe
       * because the only commands that use the available bit are
       * vkCmdCopyQueryPoolResults and vkCmdResetQueryPool, both of which
       * cannot be invoked from inside a render pass scope.
       */
      cs = &cmdbuf->draw_epilogue_cs;

   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, available_iova);
   tu_cs_emit_qw(cs, 0x1);
}

void
tu_CmdEndQuery(VkCommandBuffer commandBuffer,
               VkQueryPool queryPool,
               uint32_t query)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_end_occlusion_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TIMESTAMP:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}

void
tu_CmdWriteTimestamp(VkCommandBuffer commandBuffer,
                     VkPipelineStageFlagBits pipelineStage,
                     VkQueryPool queryPool,
                     uint32_t query)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   struct tu_cs *cs = cmd->state.pass ? &cmd->draw_epilogue_cs : &cmd->cs;

   /* WFI to get more accurate timestamp */
   tu_cs_emit_wfi(cs);

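   /* Copy the 64-bit always-on counter into result.values[0]; CNT(2) reads
    * both the LO and HI halves and 64B stores them as a single 64-bit value.
    */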
   tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
   tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_CP_ALWAYS_ON_COUNTER_LO) |
                  CP_REG_TO_MEM_0_CNT(2) |
                  CP_REG_TO_MEM_0_64B);
   tu_cs_emit_qw(cs, query_result_iova(pool, query, 0));

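   /* The CP performs the counter write above before processing later
    * packets, so the slot can be marked available immediately.
    */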
   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, query_available_iova(pool, query));
   tu_cs_emit_qw(cs, 0x1);

   if (cmd->state.pass) {
      /* TODO: to have useful in-renderpass timestamps:
       * for the sysmem path, we can just emit the timestamp in draw_cs;
       * for the gmem render path, we would need to do something with
       * accumulation, but I'm not sure that would follow the spec.
       */
      tu_finishme("CmdWriteTimestamp in renderpass not accurate");
   }
}