turnip: implement timestamp query
[mesa.git] / src / freedreno / vulkan / tu_query.c
/*
 * Copyright 2016 Red Hat Inc.
 * Based on anv:
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include "tu_private.h"

#include <assert.h>
#include <fcntl.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>

#include "registers/adreno_pm4.xml.h"
#include "registers/adreno_common.xml.h"
#include "registers/a6xx.xml.h"

#include "nir/nir_builder.h"
#include "util/os_time.h"

#include "tu_cs.h"

#define NSEC_PER_SEC 1000000000ull
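
/* How long vkGetQueryPoolResults() with VK_QUERY_RESULT_WAIT_BIT busy-waits
 * for a query to become available, in seconds (see wait_for_available()).
 */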
#define WAIT_TIMEOUT 5

/* It seems like sample counts need to be copied over to 16-byte aligned
 * memory. */
struct PACKED slot_value {
   uint64_t value;
   uint64_t __padding;
};

struct PACKED query_slot {
   struct slot_value available; /* 0 when unavailable, 1 when available */
   struct slot_value result;
};

struct PACKED occlusion_query_slot {
   struct query_slot common;
   struct slot_value begin;
   struct slot_value end;
};
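
/* Each query occupies one slot in the pool's backing BO: pool->stride is the
 * size of the slot struct for the pool's query type, so slot N starts at
 * bo.iova + N * pool->stride (see query_iova() and slot_address() below).
 */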

/* Returns the IOVA of a given uint64_t field in a given slot of a query
 * pool. */
#define query_iova(type, pool, query, field) \
   pool->bo.iova + pool->stride * query + offsetof(type, field) + \
   offsetof(struct slot_value, value)

#define occlusion_query_iova(pool, query, field) \
   query_iova(struct occlusion_query_slot, pool, query, field)

#define query_available_iova(pool, query) \
   query_iova(struct query_slot, pool, query, available)

#define query_result_iova(pool, query) \
   query_iova(struct query_slot, pool, query, result)
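/* For example, query_result_iova(pool, 2) expands to
 *    pool->bo.iova + pool->stride * 2 +
 *    offsetof(struct query_slot, result) + offsetof(struct slot_value, value)
 */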

#define query_is_available(slot) slot->available.value

/*
 * Returns a pointer to a given slot in a query pool.
 */
static void* slot_address(struct tu_query_pool *pool, uint32_t query)
{
   return (char*)pool->bo.map + query * pool->stride;
}

VkResult
tu_CreateQueryPool(VkDevice _device,
                   const VkQueryPoolCreateInfo *pCreateInfo,
                   const VkAllocationCallbacks *pAllocator,
                   VkQueryPool *pQueryPool)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
   assert(pCreateInfo->queryCount > 0);

   uint32_t slot_size;
   switch (pCreateInfo->queryType) {
   case VK_QUERY_TYPE_OCCLUSION:
      slot_size = sizeof(struct occlusion_query_slot);
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
      slot_size = sizeof(struct query_slot);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   struct tu_query_pool *pool =
         vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
                   VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);

   if (!pool)
      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

   VkResult result = tu_bo_init_new(device, &pool->bo,
         pCreateInfo->queryCount * slot_size);
   if (result != VK_SUCCESS) {
      vk_free2(&device->alloc, pAllocator, pool);
      return result;
   }

   result = tu_bo_map(device, &pool->bo);
   if (result != VK_SUCCESS) {
      tu_bo_finish(device, &pool->bo);
      vk_free2(&device->alloc, pAllocator, pool);
      return result;
   }

   /* Initialize all query statuses to unavailable */
   memset(pool->bo.map, 0, pool->bo.size);

   pool->type = pCreateInfo->queryType;
   pool->stride = slot_size;
   pool->size = pCreateInfo->queryCount;
   pool->pipeline_statistics = pCreateInfo->pipelineStatistics;
   *pQueryPool = tu_query_pool_to_handle(pool);

   return VK_SUCCESS;
}

void
tu_DestroyQueryPool(VkDevice _device,
                    VkQueryPool _pool,
                    const VkAllocationCallbacks *pAllocator)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   TU_FROM_HANDLE(tu_query_pool, pool, _pool);

   if (!pool)
      return;

   tu_bo_finish(device, &pool->bo);
   vk_free2(&device->alloc, pAllocator, pool);
}

/* Wait on the availability status of a query up until a timeout. */
static VkResult
wait_for_available(struct tu_device *device, struct tu_query_pool *pool,
                   uint32_t query)
{
   /* TODO: Use the MSM_IOVA_WAIT ioctl to wait on the available bit in a
    * scheduler friendly way instead of busy polling once the patch has landed
    * upstream. */
   struct query_slot *slot = slot_address(pool, query);
   uint64_t abs_timeout = os_time_get_absolute_timeout(
         WAIT_TIMEOUT * NSEC_PER_SEC);
   while (os_time_get_nano() < abs_timeout) {
      if (query_is_available(slot))
         return VK_SUCCESS;
   }
   return vk_error(device->instance, VK_TIMEOUT);
}

/* Writes a query value to a buffer from the CPU. */
static void
write_query_value_cpu(char* base,
                      uint32_t offset,
                      uint64_t value,
                      VkQueryResultFlags flags)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      *(uint64_t*)(base + (offset * sizeof(uint64_t))) = value;
   } else {
      *(uint32_t*)(base + (offset * sizeof(uint32_t))) = value;
   }
}

static VkResult
get_query_pool_results(struct tu_device *device,
                       struct tu_query_pool *pool,
                       uint32_t firstQuery,
                       uint32_t queryCount,
                       size_t dataSize,
                       void *pData,
                       VkDeviceSize stride,
                       VkQueryResultFlags flags)
{
   assert(dataSize >= stride * queryCount);

   char *result_base = pData;
   VkResult result = VK_SUCCESS;
   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;
      struct query_slot *slot = slot_address(pool, query);
      bool available = query_is_available(slot);
      if ((flags & VK_QUERY_RESULT_WAIT_BIT) && !available) {
         VkResult wait_result = wait_for_available(device, pool, query);
         if (wait_result != VK_SUCCESS)
            return wait_result;
         available = true;
      } else if (!(flags & VK_QUERY_RESULT_PARTIAL_BIT) && !available) {
         /* From the Vulkan 1.1.130 spec:
          *
          *    If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
          *    both not set then no result values are written to pData for
          *    queries that are in the unavailable state at the time of the
          *    call, and vkGetQueryPoolResults returns VK_NOT_READY. However,
          *    availability state is still written to pData for those queries
          *    if VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set.
          */
         result = VK_NOT_READY;
         if (!(flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)) {
            result_base += stride;
            continue;
         }
      }

      if (available)
         write_query_value_cpu(result_base, 0, slot->result.value, flags);
      else if (flags & VK_QUERY_RESULT_PARTIAL_BIT)
         /* From the Vulkan 1.1.130 spec:
          *
          *    If VK_QUERY_RESULT_PARTIAL_BIT is set, VK_QUERY_RESULT_WAIT_BIT
          *    is not set, and the query’s status is unavailable, an
          *    intermediate result value between zero and the final result
          *    value is written to pData for that query.
          *
          * Just return 0 here for simplicity since it's a valid result.
          */
         write_query_value_cpu(result_base, 0, 0, flags);

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
         /* From the Vulkan 1.1.130 spec:
          *
          *    If VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set, the final
          *    integer value written for each query is non-zero if the query’s
          *    status was available or zero if the status was unavailable.
          */
         write_query_value_cpu(result_base, 1, available, flags);

      result_base += stride;
   }
   return result;
}

VkResult
tu_GetQueryPoolResults(VkDevice _device,
                       VkQueryPool queryPool,
                       uint32_t firstQuery,
                       uint32_t queryCount,
                       size_t dataSize,
                       void *pData,
                       VkDeviceSize stride,
                       VkQueryResultFlags flags)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(firstQuery + queryCount <= pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
      return get_query_pool_results(device, pool, firstQuery, queryCount,
                                    dataSize, pData, stride, flags);
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }
   return VK_SUCCESS;
}

/* Copies a query value from one buffer to another from the GPU. */
static void
copy_query_value_gpu(struct tu_cmd_buffer *cmdbuf,
                     struct tu_cs *cs,
                     uint64_t src_iova,
                     uint64_t base_write_iova,
                     uint32_t offset,
                     VkQueryResultFlags flags)
{
   uint32_t element_size = flags & VK_QUERY_RESULT_64_BIT ?
         sizeof(uint64_t) : sizeof(uint32_t);
   uint64_t write_iova = base_write_iova + (offset * element_size);

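   /* CP_MEM_TO_MEM copies a single value from src_iova to write_iova on the
    * GPU; the DOUBLE bit selects a 64-bit copy instead of a 32-bit one.
    */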
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
   uint32_t mem_to_mem_flags = flags & VK_QUERY_RESULT_64_BIT ?
         CP_MEM_TO_MEM_0_DOUBLE : 0;
   tu_cs_emit(cs, mem_to_mem_flags);
   tu_cs_emit_qw(cs, write_iova);
   tu_cs_emit_qw(cs, src_iova);
}

static void
emit_copy_query_pool_results(struct tu_cmd_buffer *cmdbuf,
                             struct tu_cs *cs,
                             struct tu_query_pool *pool,
                             uint32_t firstQuery,
                             uint32_t queryCount,
                             struct tu_buffer *buffer,
                             VkDeviceSize dstOffset,
                             VkDeviceSize stride,
                             VkQueryResultFlags flags)
{
   /* From the Vulkan 1.1.130 spec:
    *
    *    vkCmdCopyQueryPoolResults is guaranteed to see the effect of previous
    *    uses of vkCmdResetQueryPool in the same queue, without any additional
    *    synchronization.
    *
    * To ensure that previous writes to the available bit are coherent, first
    * wait for all writes to complete.
    */
   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;
      uint64_t available_iova = query_available_iova(pool, query);
      uint64_t result_iova = query_result_iova(pool, query);
      uint64_t buffer_iova = tu_buffer_iova(buffer) + dstOffset + i * stride;

      /* Wait for the available bit to be set if executed with the
       * VK_QUERY_RESULT_WAIT_BIT flag. */
      if (flags & VK_QUERY_RESULT_WAIT_BIT) {
         tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
         tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
                        CP_WAIT_REG_MEM_0_POLL_MEMORY);
         tu_cs_emit_qw(cs, available_iova);
         tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0x1));
         tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
         tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));
      }

      if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
         /* Unconditionally copying the bo->result into the buffer here is
          * valid because we only set bo->result on vkCmdEndQuery. Thus, even
          * if the query is unavailable, this will copy the correct partial
          * value of 0.
          */
         copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
                              0 /* offset */, flags);
      } else {
         /* Conditionally copy bo->result into the buffer based on whether the
          * query is available.
          *
          * NOTE: For the conditional packets to be executed, CP_COND_EXEC
          * tests that ADDR0 != 0 and ADDR1 < REF. The packet here simply tests
          * that 0 < available < 2, aka available == 1.
          */
         tu_cs_reserve(cs, 7 + 6);
         tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6);
         tu_cs_emit_qw(cs, available_iova);
         tu_cs_emit_qw(cs, available_iova);
         tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2));
         tu_cs_emit(cs, 6); /* Cond execute the next 6 DWORDS */

         /* Start of conditional execution */
         copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
                              0 /* offset */, flags);
         /* End of conditional execution */
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         copy_query_value_gpu(cmdbuf, cs, available_iova, buffer_iova,
                              1 /* offset */, flags);
      }
   }

   tu_bo_list_add(&cmdbuf->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
}

void
tu_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,
                           VkQueryPool queryPool,
                           uint32_t firstQuery,
                           uint32_t queryCount,
                           VkBuffer dstBuffer,
                           VkDeviceSize dstOffset,
                           VkDeviceSize stride,
                           VkQueryResultFlags flags)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
   struct tu_cs *cs = &cmdbuf->cs;
   assert(firstQuery + queryCount <= pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
      return emit_copy_query_pool_results(cmdbuf, cs, pool, firstQuery,
                  queryCount, buffer, dstOffset, stride, flags);
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }
}

static void
emit_reset_query_pool(struct tu_cmd_buffer *cmdbuf,
                      struct tu_query_pool *pool,
                      uint32_t firstQuery,
                      uint32_t queryCount)
{
   struct tu_cs *cs = &cmdbuf->cs;

   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;

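      /* Zero both the availability bit and the accumulated result for this
       * slot from the command stream, so the reset executes in queue order
       * on the GPU.
       */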
      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
      tu_cs_emit_qw(cs, query_available_iova(pool, query));
      tu_cs_emit_qw(cs, 0x0);

      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
      tu_cs_emit_qw(cs, query_result_iova(pool, query));
      tu_cs_emit_qw(cs, 0x0);
   }
}

void
tu_CmdResetQueryPool(VkCommandBuffer commandBuffer,
                     VkQueryPool queryPool,
                     uint32_t firstQuery,
                     uint32_t queryCount)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);

   switch (pool->type) {
   case VK_QUERY_TYPE_TIMESTAMP:
   case VK_QUERY_TYPE_OCCLUSION:
      emit_reset_query_pool(cmdbuf, pool, firstQuery, queryCount);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}

static void
emit_begin_occlusion_query(struct tu_cmd_buffer *cmdbuf,
                           struct tu_query_pool *pool,
                           uint32_t query)
{
   /* From the Vulkan 1.1.130 spec:
    *
    *    A query must begin and end inside the same subpass of a render pass
    *    instance, or must both begin and end outside of a render pass
    *    instance.
    *
    * Unlike on an immediate-mode renderer, Turnip renders all tiles on
    * vkCmdEndRenderPass, not individually on each vkCmdDraw*. As such, if a
    * query begins/ends inside the same subpass of a render pass, we need to
    * record the packets on the secondary draw command stream. cmdbuf->draw_cs
    * is then run on every tile during render, so we just need to accumulate
    * sample counts in slot->result to compute the query result.
    */
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;

   uint64_t begin_iova = occlusion_query_iova(pool, query, begin);

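   /* Point RB_SAMPLE_COUNT_ADDR at slot->begin and trigger a ZPASS_DONE event
    * so the hardware writes the current sample count there; the matching
    * snapshot into slot->end happens in emit_end_occlusion_query().
    */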
   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_ADDR_LO(begin_iova));

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, ZPASS_DONE);
}

void
tu_CmdBeginQuery(VkCommandBuffer commandBuffer,
                 VkQueryPool queryPool,
                 uint32_t query,
                 VkQueryControlFlags flags)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      /* In freedreno, there is no implementation difference between
       * GL_SAMPLES_PASSED and GL_ANY_SAMPLES_PASSED, so we can similarly
       * ignore the VK_QUERY_CONTROL_PRECISE_BIT flag here.
       */
      emit_begin_occlusion_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TIMESTAMP:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}

static void
emit_end_occlusion_query(struct tu_cmd_buffer *cmdbuf,
                         struct tu_query_pool *pool,
                         uint32_t query)
{
   /* Ending an occlusion query happens in a few steps:
    * 1) Set the slot->end to UINT64_MAX.
    * 2) Set up the SAMPLE_COUNT registers and trigger a CP_EVENT_WRITE to
    *    write the current sample count value into slot->end.
    * 3) Since (2) is asynchronous, wait until slot->end is not equal to
    *    UINT64_MAX before continuing via CP_WAIT_REG_MEM.
    * 4) Accumulate the results of the query (slot->end - slot->begin) into
    *    slot->result.
    * 5) If vkCmdEndQuery is *not* called from within the scope of a render
    *    pass, set the slot's available bit since the query is now done.
    * 6) If vkCmdEndQuery *is* called from within the scope of a render
    *    pass, we cannot mark as available yet since the commands in
    *    draw_cs are not run until vkCmdEndRenderPass.
    */
   const struct tu_render_pass *pass = cmdbuf->state.pass;
   struct tu_cs *cs = pass ? &cmdbuf->draw_cs : &cmdbuf->cs;

   uint64_t available_iova = query_available_iova(pool, query);
   uint64_t begin_iova = occlusion_query_iova(pool, query, begin);
   uint64_t end_iova = occlusion_query_iova(pool, query, end);
   uint64_t result_iova = query_result_iova(pool, query);
   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit_qw(cs, 0xffffffffffffffffull);

   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_ADDR_LO(end_iova));

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, ZPASS_DONE);

   tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
   tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_NE) |
                  CP_WAIT_REG_MEM_0_POLL_MEMORY);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0xffffffff));
   tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
   tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));

   /* result (dst) = result (srcA) + end (srcB) - begin (srcC) */
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
   tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C);
   tu_cs_emit_qw(cs, result_iova);
   tu_cs_emit_qw(cs, result_iova);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit_qw(cs, begin_iova);

   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   if (pass)
      /* Technically, queries should be tracked per-subpass, but here we track
       * at the render pass level to simplify the code a bit. This is safe
       * because the only commands that use the available bit are
       * vkCmdCopyQueryPoolResults and vkCmdResetQueryPool, both of which
       * cannot be invoked from inside a render pass scope.
       */
      cs = &cmdbuf->draw_epilogue_cs;

   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, available_iova);
   tu_cs_emit_qw(cs, 0x1);
}

void
tu_CmdEndQuery(VkCommandBuffer commandBuffer,
               VkQueryPool queryPool,
               uint32_t query)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_end_occlusion_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TIMESTAMP:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}

void
tu_CmdWriteTimestamp(VkCommandBuffer commandBuffer,
                     VkPipelineStageFlagBits pipelineStage,
                     VkQueryPool queryPool,
                     uint32_t query)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   struct tu_cs *cs = cmd->state.pass ? &cmd->draw_epilogue_cs : &cmd->cs;

   /* WFI to get more accurate timestamp */
   tu_cs_emit_wfi(cs);

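   /* Snapshot the free-running always-on counter into slot->result: CNT(2)
    * copies the LO/HI register pair starting at CP_ALWAYS_ON_COUNTER_LO,
    * i.e. a single 64-bit timestamp value.
    */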
   tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
   tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_CP_ALWAYS_ON_COUNTER_LO) |
                  CP_REG_TO_MEM_0_CNT(2) |
                  CP_REG_TO_MEM_0_64B);
   tu_cs_emit_qw(cs, query_result_iova(pool, query));

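   /* Mark the query available now that the timestamp has been written. */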
   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, query_available_iova(pool, query));
   tu_cs_emit_qw(cs, 0x1);

   if (cmd->state.pass) {
      /* TODO: to have useful in-renderpass timestamps:
       * for the sysmem path, we can just emit the timestamp in draw_cs,
       * for the gmem path, we would need to do something with accumulation,
       * but I'm not sure that would follow the spec
       */
      tu_finishme("CmdWriteTimestamp in renderpass not accurate");
   }
}