turnip: make the struct slot_value of queries hold 2 values
mesa.git: src/freedreno/vulkan/tu_query.c
/*
 * Copyright 2016 Red Hat Inc.
 * Based on anv:
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include "tu_private.h"

#include <assert.h>
#include <fcntl.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>

#include "registers/adreno_pm4.xml.h"
#include "registers/adreno_common.xml.h"
#include "registers/a6xx.xml.h"

#include "nir/nir_builder.h"
#include "util/os_time.h"

#include "tu_cs.h"

#define NSEC_PER_SEC 1000000000ull
#define WAIT_TIMEOUT 5

/* Depending on the query type, a slot may hold up to two integer values,
 * e.g. for VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
 * values[0]: primitives written, values[1]: primitives generated.
 */
struct PACKED slot_value {
   uint64_t values[2];
};

struct PACKED query_slot {
   struct slot_value available; /* 0 when unavailable, 1 when available */
   struct slot_value result;
};

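/* For occlusion queries, begin/end hold the raw sample counter values
 * captured at query begin/end; (end - begin) is accumulated into
 * common.result so that per-tile passes sum correctly.
 */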
struct PACKED occlusion_query_slot {
   struct query_slot common;
   struct slot_value begin;
   struct slot_value end;
};

/* Returns the IOVA of a given uint64_t field in a given slot of a query
 * pool. */
#define query_iova(type, pool, query, field, value_index)             \
   pool->bo.iova + pool->stride * query + offsetof(type, field) +     \
   offsetof(struct slot_value, values[value_index])

#define occlusion_query_iova(pool, query, field)                      \
   query_iova(struct occlusion_query_slot, pool, query, field, 0)

#define query_available_iova(pool, query)                             \
   query_iova(struct query_slot, pool, query, available, 0)

#define query_result_iova(pool, query, i)                             \
   query_iova(struct query_slot, pool, query, result, i)

#define query_is_available(slot) slot->available.values[0]

/*
 * Returns a pointer to a given slot in a query pool.
 */
static void* slot_address(struct tu_query_pool *pool, uint32_t query)
{
   return (char*)pool->bo.map + query * pool->stride;
}

VkResult
tu_CreateQueryPool(VkDevice _device,
                   const VkQueryPoolCreateInfo *pCreateInfo,
                   const VkAllocationCallbacks *pAllocator,
                   VkQueryPool *pQueryPool)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
   assert(pCreateInfo->queryCount > 0);

   uint32_t slot_size;
   switch (pCreateInfo->queryType) {
   case VK_QUERY_TYPE_OCCLUSION:
      slot_size = sizeof(struct occlusion_query_slot);
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
      slot_size = sizeof(struct query_slot);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   struct tu_query_pool *pool =
         vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
                   VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);

   if (!pool)
      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

   VkResult result = tu_bo_init_new(device, &pool->bo,
                                    pCreateInfo->queryCount * slot_size);
   if (result != VK_SUCCESS) {
      vk_free2(&device->alloc, pAllocator, pool);
      return result;
   }

   result = tu_bo_map(device, &pool->bo);
   if (result != VK_SUCCESS) {
      tu_bo_finish(device, &pool->bo);
      vk_free2(&device->alloc, pAllocator, pool);
      return result;
   }

   /* Initialize all query statuses to unavailable */
   memset(pool->bo.map, 0, pool->bo.size);

   pool->type = pCreateInfo->queryType;
   pool->stride = slot_size;
   pool->size = pCreateInfo->queryCount;
   pool->pipeline_statistics = pCreateInfo->pipelineStatistics;
   *pQueryPool = tu_query_pool_to_handle(pool);

   return VK_SUCCESS;
}

void
tu_DestroyQueryPool(VkDevice _device,
                    VkQueryPool _pool,
                    const VkAllocationCallbacks *pAllocator)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   TU_FROM_HANDLE(tu_query_pool, pool, _pool);

   if (!pool)
      return;

   tu_bo_finish(device, &pool->bo);
   vk_free2(&device->alloc, pAllocator, pool);
}

static uint32_t
get_result_count(struct tu_query_pool *pool)
{
   switch (pool->type) {
   /* Occlusion and timestamp queries write one integer value */
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
      return 1;
   /* Transform feedback queries write two integer values */
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      return 2;
   default:
      assert(!"Invalid query type");
      return 0;
   }
}

/* Wait on the availability status of a query up until a timeout. */
static VkResult
wait_for_available(struct tu_device *device, struct tu_query_pool *pool,
                   uint32_t query)
{
   /* TODO: Use the MSM_IOVA_WAIT ioctl to wait on the available bit in a
    * scheduler friendly way instead of busy polling once the patch has landed
    * upstream. */
   struct query_slot *slot = slot_address(pool, query);
   uint64_t abs_timeout = os_time_get_absolute_timeout(
         WAIT_TIMEOUT * NSEC_PER_SEC);
   while(os_time_get_nano() < abs_timeout) {
      if (query_is_available(slot))
         return VK_SUCCESS;
   }
   return vk_error(device->instance, VK_TIMEOUT);
}

/* Writes a query value to a buffer from the CPU. */
static void
write_query_value_cpu(char* base,
                      uint32_t offset,
                      uint64_t value,
                      VkQueryResultFlags flags)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      *(uint64_t*)(base + (offset * sizeof(uint64_t))) = value;
   } else {
      *(uint32_t*)(base + (offset * sizeof(uint32_t))) = value;
   }
}

static VkResult
get_query_pool_results(struct tu_device *device,
                       struct tu_query_pool *pool,
                       uint32_t firstQuery,
                       uint32_t queryCount,
                       size_t dataSize,
                       void *pData,
                       VkDeviceSize stride,
                       VkQueryResultFlags flags)
{
   assert(dataSize >= stride * queryCount);

   char *result_base = pData;
   VkResult result = VK_SUCCESS;
   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;
      struct query_slot *slot = slot_address(pool, query);
      bool available = query_is_available(slot);
      uint32_t result_count = get_result_count(pool);

      if ((flags & VK_QUERY_RESULT_WAIT_BIT) && !available) {
         VkResult wait_result = wait_for_available(device, pool, query);
         if (wait_result != VK_SUCCESS)
            return wait_result;
         available = true;
      } else if (!(flags & VK_QUERY_RESULT_PARTIAL_BIT) && !available) {
         /* From the Vulkan 1.1.130 spec:
          *
          *    If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
          *    both not set then no result values are written to pData for
          *    queries that are in the unavailable state at the time of the
          *    call, and vkGetQueryPoolResults returns VK_NOT_READY. However,
          *    availability state is still written to pData for those queries
          *    if VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set.
          */
         result = VK_NOT_READY;
         if (!(flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)) {
            result_base += stride;
            continue;
         }
      }

      for (uint32_t k = 0; k < result_count; k++) {
         if (available)
            write_query_value_cpu(result_base, k, slot->result.values[k], flags);
         else if (flags & VK_QUERY_RESULT_PARTIAL_BIT)
            /* From the Vulkan 1.1.130 spec:
             *
             *    If VK_QUERY_RESULT_PARTIAL_BIT is set, VK_QUERY_RESULT_WAIT_BIT
             *    is not set, and the query’s status is unavailable, an
             *    intermediate result value between zero and the final result
             *    value is written to pData for that query.
             *
             * Just return 0 here for simplicity since it's a valid result.
             */
            write_query_value_cpu(result_base, k, 0, flags);
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
         /* From the Vulkan 1.1.130 spec:
          *
          *    If VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set, the final
          *    integer value written for each query is non-zero if the query’s
          *    status was available or zero if the status was unavailable.
          */
         write_query_value_cpu(result_base, result_count, available, flags);

      result_base += stride;
   }
   return result;
}

VkResult
tu_GetQueryPoolResults(VkDevice _device,
                       VkQueryPool queryPool,
                       uint32_t firstQuery,
                       uint32_t queryCount,
                       size_t dataSize,
                       void *pData,
                       VkDeviceSize stride,
                       VkQueryResultFlags flags)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(firstQuery + queryCount <= pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
      return get_query_pool_results(device, pool, firstQuery, queryCount,
                                    dataSize, pData, stride, flags);
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }
   return VK_SUCCESS;
}

/* Copies a query value from one buffer to another from the GPU. */
static void
copy_query_value_gpu(struct tu_cmd_buffer *cmdbuf,
                     struct tu_cs *cs,
                     uint64_t src_iova,
                     uint64_t base_write_iova,
                     uint32_t offset,
                     VkQueryResultFlags flags) {
   uint32_t element_size = flags & VK_QUERY_RESULT_64_BIT ?
         sizeof(uint64_t) : sizeof(uint32_t);
   uint64_t write_iova = base_write_iova + (offset * element_size);

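   /* CP_MEM_TO_MEM payload: flags dword, then destination IOVA, then source
    * IOVA. DOUBLE requests a 64-bit copy to match VK_QUERY_RESULT_64_BIT.
    */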
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
   uint32_t mem_to_mem_flags = flags & VK_QUERY_RESULT_64_BIT ?
         CP_MEM_TO_MEM_0_DOUBLE : 0;
   tu_cs_emit(cs, mem_to_mem_flags);
   tu_cs_emit_qw(cs, write_iova);
   tu_cs_emit_qw(cs, src_iova);
}

static void
emit_copy_query_pool_results(struct tu_cmd_buffer *cmdbuf,
                             struct tu_cs *cs,
                             struct tu_query_pool *pool,
                             uint32_t firstQuery,
                             uint32_t queryCount,
                             struct tu_buffer *buffer,
                             VkDeviceSize dstOffset,
                             VkDeviceSize stride,
                             VkQueryResultFlags flags)
{
   /* From the Vulkan 1.1.130 spec:
    *
    *    vkCmdCopyQueryPoolResults is guaranteed to see the effect of previous
    *    uses of vkCmdResetQueryPool in the same queue, without any additional
    *    synchronization.
    *
    * To ensure that previous writes to the available bit are coherent, first
    * wait for all writes to complete.
    */
   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;
      uint64_t available_iova = query_available_iova(pool, query);
      uint64_t buffer_iova = tu_buffer_iova(buffer) + dstOffset + i * stride;
      uint32_t result_count = get_result_count(pool);

      /* Wait for the available bit to be set if executed with the
       * VK_QUERY_RESULT_WAIT_BIT flag. */
      if (flags & VK_QUERY_RESULT_WAIT_BIT) {
         tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
         tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
                        CP_WAIT_REG_MEM_0_POLL_MEMORY);
         tu_cs_emit_qw(cs, available_iova);
         tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0x1));
         tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
         tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));
      }

      for (uint32_t k = 0; k < result_count; k++) {
         uint64_t result_iova = query_result_iova(pool, query, k);

         if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
            /* Unconditionally copying the bo->result into the buffer here is
             * valid because we only set bo->result on vkCmdEndQuery. Thus, even
             * if the query is unavailable, this will copy the correct partial
             * value of 0.
             */
            copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
                                 k /* offset */, flags);
         } else {
            /* Conditionally copy bo->result into the buffer based on whether the
             * query is available.
             *
             * NOTE: For the conditional packets to be executed, CP_COND_EXEC
             * tests that ADDR0 != 0 and ADDR1 < REF. The packet here simply tests
             * that 0 < available < 2, aka available == 1.
             */
            tu_cs_reserve(cs, 7 + 6);
            tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6);
            tu_cs_emit_qw(cs, available_iova);
            tu_cs_emit_qw(cs, available_iova);
            tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2));
            tu_cs_emit(cs, 6); /* Cond execute the next 6 DWORDS */

            /* Start of conditional execution */
            copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
                                 k /* offset */, flags);
            /* End of conditional execution */
         }
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         copy_query_value_gpu(cmdbuf, cs, available_iova, buffer_iova,
                              result_count /* offset */, flags);
      }
   }

   tu_bo_list_add(&cmdbuf->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
}

void
tu_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,
                           VkQueryPool queryPool,
                           uint32_t firstQuery,
                           uint32_t queryCount,
                           VkBuffer dstBuffer,
                           VkDeviceSize dstOffset,
                           VkDeviceSize stride,
                           VkQueryResultFlags flags)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
   struct tu_cs *cs = &cmdbuf->cs;
   assert(firstQuery + queryCount <= pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
      return emit_copy_query_pool_results(cmdbuf, cs, pool, firstQuery,
                                          queryCount, buffer, dstOffset,
                                          stride, flags);
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }
}

static void
emit_reset_query_pool(struct tu_cmd_buffer *cmdbuf,
                      struct tu_query_pool *pool,
                      uint32_t firstQuery,
                      uint32_t queryCount)
{
   struct tu_cs *cs = &cmdbuf->cs;

   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;

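      /* Clear the availability flag and both 64-bit result values so the
       * slot reads back as unavailable with a zeroed result.
       */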
      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
      tu_cs_emit_qw(cs, query_available_iova(pool, query));
      tu_cs_emit_qw(cs, 0x0);

      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
      tu_cs_emit_qw(cs, query_result_iova(pool, query, 0));
      tu_cs_emit_qw(cs, 0x0);
      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
      tu_cs_emit_qw(cs, query_result_iova(pool, query, 1));
      tu_cs_emit_qw(cs, 0x0);
   }
}

void
tu_CmdResetQueryPool(VkCommandBuffer commandBuffer,
                     VkQueryPool queryPool,
                     uint32_t firstQuery,
                     uint32_t queryCount)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);

   switch (pool->type) {
   case VK_QUERY_TYPE_TIMESTAMP:
   case VK_QUERY_TYPE_OCCLUSION:
      emit_reset_query_pool(cmdbuf, pool, firstQuery, queryCount);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}

static void
emit_begin_occlusion_query(struct tu_cmd_buffer *cmdbuf,
                           struct tu_query_pool *pool,
                           uint32_t query)
{
   /* From the Vulkan 1.1.130 spec:
    *
    *    A query must begin and end inside the same subpass of a render pass
    *    instance, or must both begin and end outside of a render pass
    *    instance.
    *
    * Unlike on an immediate-mode renderer, Turnip renders all tiles on
    * vkCmdEndRenderPass, not individually on each vkCmdDraw*. As such, if a
    * query begins/ends inside the same subpass of a render pass, we need to
    * record the packets on the secondary draw command stream. cmdbuf->draw_cs
    * is then run on every tile during render, so we just need to accumulate
    * sample counts in slot->result to compute the query result.
    */
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;

   uint64_t begin_iova = occlusion_query_iova(pool, query, begin);

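   /* Point the sample-count copy destination at slot->begin and fire a
    * ZPASS_DONE event so the HW writes the current sample count there.
    */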
   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_ADDR_LO(begin_iova));

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, ZPASS_DONE);
}

void
tu_CmdBeginQuery(VkCommandBuffer commandBuffer,
                 VkQueryPool queryPool,
                 uint32_t query,
                 VkQueryControlFlags flags)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      /* In freedreno, there is no implementation difference between
       * GL_SAMPLES_PASSED and GL_ANY_SAMPLES_PASSED, so we can similarly
       * ignore the VK_QUERY_CONTROL_PRECISE_BIT flag here.
       */
      emit_begin_occlusion_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TIMESTAMP:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}

static void
emit_end_occlusion_query(struct tu_cmd_buffer *cmdbuf,
                         struct tu_query_pool *pool,
                         uint32_t query)
{
   /* Ending an occlusion query happens in a few steps:
    *    1) Set the slot->end to UINT64_MAX.
    *    2) Set up the SAMPLE_COUNT registers and trigger a CP_EVENT_WRITE to
    *       write the current sample count value into slot->end.
    *    3) Since (2) is asynchronous, wait until slot->end is not equal to
    *       UINT64_MAX before continuing via CP_WAIT_REG_MEM.
    *    4) Accumulate the results of the query (slot->end - slot->begin) into
    *       slot->result.
    *    5) If vkCmdEndQuery is *not* called from within the scope of a render
    *       pass, set the slot's available bit since the query is now done.
    *    6) If vkCmdEndQuery *is* called from within the scope of a render
    *       pass, we cannot mark as available yet since the commands in
    *       draw_cs are not run until vkCmdEndRenderPass.
    */
   const struct tu_render_pass *pass = cmdbuf->state.pass;
   struct tu_cs *cs = pass ? &cmdbuf->draw_cs : &cmdbuf->cs;

   uint64_t available_iova = query_available_iova(pool, query);
   uint64_t begin_iova = occlusion_query_iova(pool, query, begin);
   uint64_t end_iova = occlusion_query_iova(pool, query, end);
   uint64_t result_iova = query_result_iova(pool, query, 0);
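   /* Steps (1) and (2): mark slot->end with UINT64_MAX, then ask the HW to
    * overwrite it with the current sample count via ZPASS_DONE.
    */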
   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit_qw(cs, 0xffffffffffffffffull);

   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_ADDR_LO(end_iova));

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, ZPASS_DONE);

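   /* Step (3): wait until the UINT64_MAX marker in slot->end has been
    * replaced with the real sample count before accumulating.
    */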
   tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
   tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_NE) |
                  CP_WAIT_REG_MEM_0_POLL_MEMORY);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0xffffffff));
   tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
   tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));

   /* result (dst) = result (srcA) + end (srcB) - begin (srcC) */
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
   tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C);
   tu_cs_emit_qw(cs, result_iova);
   tu_cs_emit_qw(cs, result_iova);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit_qw(cs, begin_iova);

   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   if (pass)
      /* Technically, queries should be tracked per-subpass, but here we track
       * at the render pass level to simplify the code a bit. This is safe
       * because the only commands that use the available bit are
       * vkCmdCopyQueryPoolResults and vkCmdResetQueryPool, both of which
       * cannot be invoked from inside a render pass scope.
       */
      cs = &cmdbuf->draw_epilogue_cs;

   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, available_iova);
   tu_cs_emit_qw(cs, 0x1);
}

void
tu_CmdEndQuery(VkCommandBuffer commandBuffer,
               VkQueryPool queryPool,
               uint32_t query)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_end_occlusion_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TIMESTAMP:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}

void
tu_CmdWriteTimestamp(VkCommandBuffer commandBuffer,
                     VkPipelineStageFlagBits pipelineStage,
                     VkQueryPool queryPool,
                     uint32_t query)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   struct tu_cs *cs = cmd->state.pass ? &cmd->draw_epilogue_cs : &cmd->cs;

   /* WFI to get more accurate timestamp */
   tu_cs_emit_wfi(cs);

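   /* Copy the 64-bit always-on counter into result.values[0]; CNT(2) reads
    * both the LO and HI halves and 64B stores them as a single 64-bit value.
    */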
   tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
   tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_CP_ALWAYS_ON_COUNTER_LO) |
                  CP_REG_TO_MEM_0_CNT(2) |
                  CP_REG_TO_MEM_0_64B);
   tu_cs_emit_qw(cs, query_result_iova(pool, query, 0));

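   /* The CP performs the counter write above before processing later
    * packets, so the slot can be marked available immediately.
    */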
   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, query_available_iova(pool, query));
   tu_cs_emit_qw(cs, 0x1);

   if (cmd->state.pass) {
      /* TODO: to have useful in-renderpass timestamps:
       * for the sysmem path, we can just emit the timestamp in draw_cs;
       * for the gmem render path, we would need to do something with
       * accumulation, but I'm not sure that would follow the spec.
       */
      tu_finishme("CmdWriteTimestamp in renderpass not accurate");
   }
}