[mesa.git] src/freedreno/vulkan/tu_query.c (commit 6a106a40614e55b5cc1ed8e1cc044def14a4be8d)
/*
 * Copyright 2016 Red Hat Inc.
 * Based on anv:
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include "tu_private.h"

#include <assert.h>
#include <fcntl.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>

#include "registers/adreno_pm4.xml.h"
#include "registers/adreno_common.xml.h"
#include "registers/a6xx.xml.h"

#include "nir/nir_builder.h"
#include "util/os_time.h"

#include "tu_cs.h"

#define NSEC_PER_SEC 1000000000ull
#define WAIT_TIMEOUT 5

/* Depending on the query type, a slot may hold one or two integer values,
 * e.g. for VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
 * values[0]: primitives written, values[1]: primitives generated
 */
struct PACKED slot_value {
   uint64_t values[2];
};

struct PACKED query_slot {
   struct slot_value available; /* 0 when unavailable, 1 when available */
   struct slot_value result;
};

struct PACKED occlusion_query_slot {
   struct query_slot common;
   struct slot_value begin;
   struct slot_value end;
};

/* The result of transform feedback queries is two integer values:
 *   common.result.values[0] is the count of primitives written,
 *   common.result.values[1] is the count of primitives generated.
 * The begin/end counter snapshots are recorded separately for each of the
 * four streams, hence the arrays of four slot_values below.
 */
struct PACKED primitive_query_slot {
   struct query_slot common;
   struct slot_value begin[4];
   struct slot_value end[4];
};

/* Returns the IOVA of a given uint64_t field in a given slot of a query
 * pool. */
#define query_iova(type, pool, query, field, value_index)                \
   pool->bo.iova + pool->stride * query + offsetof(type, field) +        \
      offsetof(struct slot_value, values[value_index])
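/* For example, query_iova(struct query_slot, pool, q, result, 1) evaluates
 * to pool->bo.iova + q * pool->stride + offsetof(struct query_slot, result)
 * + sizeof(uint64_t), i.e. the GPU address of the second result value of
 * slot q.
 */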

#define occlusion_query_iova(pool, query, field)                         \
   query_iova(struct occlusion_query_slot, pool, query, field, 0)

#define primitive_query_iova(pool, query, field, i)                      \
   query_iova(struct primitive_query_slot, pool, query, field, i)

#define query_available_iova(pool, query)                                \
   query_iova(struct query_slot, pool, query, available, 0)

#define query_result_iova(pool, query, i)                                \
   query_iova(struct query_slot, pool, query, result, i)

#define query_is_available(slot) slot->available.values[0]

/*
 * Returns a pointer to a given slot in a query pool.
 */
static void* slot_address(struct tu_query_pool *pool, uint32_t query)
{
   return (char*)pool->bo.map + query * pool->stride;
}

VkResult
tu_CreateQueryPool(VkDevice _device,
                   const VkQueryPoolCreateInfo *pCreateInfo,
                   const VkAllocationCallbacks *pAllocator,
                   VkQueryPool *pQueryPool)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
   assert(pCreateInfo->queryCount > 0);

   uint32_t slot_size;
   switch (pCreateInfo->queryType) {
   case VK_QUERY_TYPE_OCCLUSION:
      slot_size = sizeof(struct occlusion_query_slot);
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
      slot_size = sizeof(struct query_slot);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      slot_size = sizeof(struct primitive_query_slot);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   struct tu_query_pool *pool =
         vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
                   VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);

   if (!pool)
      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

   VkResult result = tu_bo_init_new(device, &pool->bo,
                                    pCreateInfo->queryCount * slot_size);
   if (result != VK_SUCCESS) {
      vk_free2(&device->alloc, pAllocator, pool);
      return result;
   }

   result = tu_bo_map(device, &pool->bo);
   if (result != VK_SUCCESS) {
      tu_bo_finish(device, &pool->bo);
      vk_free2(&device->alloc, pAllocator, pool);
      return result;
   }

   /* Initialize all query statuses to unavailable */
   memset(pool->bo.map, 0, pool->bo.size);

   pool->type = pCreateInfo->queryType;
   pool->stride = slot_size;
   pool->size = pCreateInfo->queryCount;
   pool->pipeline_statistics = pCreateInfo->pipelineStatistics;
   *pQueryPool = tu_query_pool_to_handle(pool);

   return VK_SUCCESS;
}

void
tu_DestroyQueryPool(VkDevice _device,
                    VkQueryPool _pool,
                    const VkAllocationCallbacks *pAllocator)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   TU_FROM_HANDLE(tu_query_pool, pool, _pool);

   if (!pool)
      return;

   tu_bo_finish(device, &pool->bo);
   vk_free2(&device->alloc, pAllocator, pool);
}

static uint32_t
get_result_count(struct tu_query_pool *pool)
{
   switch (pool->type) {
   /* Occlusion and timestamp queries write one integer value */
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
      return 1;
   /* Transform feedback queries write two integer values */
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      return 2;
   default:
      assert(!"Invalid query type");
      return 0;
   }
}

/* Wait on the availability status of a query up until a timeout. */
static VkResult
wait_for_available(struct tu_device *device, struct tu_query_pool *pool,
                   uint32_t query)
{
   /* TODO: Use the MSM_IOVA_WAIT ioctl to wait on the available bit in a
    * scheduler friendly way instead of busy polling once the patch has landed
    * upstream. */
   struct query_slot *slot = slot_address(pool, query);
   uint64_t abs_timeout = os_time_get_absolute_timeout(
         WAIT_TIMEOUT * NSEC_PER_SEC);
   while (os_time_get_nano() < abs_timeout) {
      if (query_is_available(slot))
         return VK_SUCCESS;
   }
   return vk_error(device->instance, VK_TIMEOUT);
}

/* Writes a query value to a buffer from the CPU. */
static void
write_query_value_cpu(char* base,
                      uint32_t offset,
                      uint64_t value,
                      VkQueryResultFlags flags)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      *(uint64_t*)(base + (offset * sizeof(uint64_t))) = value;
   } else {
      *(uint32_t*)(base + (offset * sizeof(uint32_t))) = value;
   }
}

static VkResult
get_query_pool_results(struct tu_device *device,
                       struct tu_query_pool *pool,
                       uint32_t firstQuery,
                       uint32_t queryCount,
                       size_t dataSize,
                       void *pData,
                       VkDeviceSize stride,
                       VkQueryResultFlags flags)
{
   assert(dataSize >= stride * queryCount);

   char *result_base = pData;
   VkResult result = VK_SUCCESS;
   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;
      struct query_slot *slot = slot_address(pool, query);
      bool available = query_is_available(slot);
      uint32_t result_count = get_result_count(pool);

      if ((flags & VK_QUERY_RESULT_WAIT_BIT) && !available) {
         VkResult wait_result = wait_for_available(device, pool, query);
         if (wait_result != VK_SUCCESS)
            return wait_result;
         available = true;
      } else if (!(flags & VK_QUERY_RESULT_PARTIAL_BIT) && !available) {
         /* From the Vulkan 1.1.130 spec:
          *
          *    If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
          *    both not set then no result values are written to pData for
          *    queries that are in the unavailable state at the time of the
          *    call, and vkGetQueryPoolResults returns VK_NOT_READY. However,
          *    availability state is still written to pData for those queries
          *    if VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set.
          */
         result = VK_NOT_READY;
         if (!(flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)) {
            result_base += stride;
            continue;
         }
      }

      for (uint32_t k = 0; k < result_count; k++) {
         if (available)
            write_query_value_cpu(result_base, k, slot->result.values[k], flags);
         else if (flags & VK_QUERY_RESULT_PARTIAL_BIT)
            /* From the Vulkan 1.1.130 spec:
             *
             *    If VK_QUERY_RESULT_PARTIAL_BIT is set, VK_QUERY_RESULT_WAIT_BIT
             *    is not set, and the query’s status is unavailable, an
             *    intermediate result value between zero and the final result
             *    value is written to pData for that query.
             *
             * Just return 0 here for simplicity since it's a valid result.
             */
            write_query_value_cpu(result_base, k, 0, flags);
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
         /* From the Vulkan 1.1.130 spec:
          *
          *    If VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set, the final
          *    integer value written for each query is non-zero if the query’s
          *    status was available or zero if the status was unavailable.
          */
         write_query_value_cpu(result_base, result_count, available, flags);

      result_base += stride;
   }
   return result;
}

VkResult
tu_GetQueryPoolResults(VkDevice _device,
                       VkQueryPool queryPool,
                       uint32_t firstQuery,
                       uint32_t queryCount,
                       size_t dataSize,
                       void *pData,
                       VkDeviceSize stride,
                       VkQueryResultFlags flags)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(firstQuery + queryCount <= pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      return get_query_pool_results(device, pool, firstQuery, queryCount,
                                    dataSize, pData, stride, flags);
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }
   return VK_SUCCESS;
}

/* Copies a query value from one buffer to another from the GPU. */
static void
copy_query_value_gpu(struct tu_cmd_buffer *cmdbuf,
                     struct tu_cs *cs,
                     uint64_t src_iova,
                     uint64_t base_write_iova,
                     uint32_t offset,
                     VkQueryResultFlags flags)
{
   uint32_t element_size = flags & VK_QUERY_RESULT_64_BIT ?
         sizeof(uint64_t) : sizeof(uint32_t);
   uint64_t write_iova = base_write_iova + (offset * element_size);

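   /* CP_MEM_TO_MEM copies one 32-bit value from src_iova to write_iova; with
    * the DOUBLE bit set it operates on 64-bit values instead, which is what
    * VK_QUERY_RESULT_64_BIT requires.
    */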
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
   uint32_t mem_to_mem_flags = flags & VK_QUERY_RESULT_64_BIT ?
         CP_MEM_TO_MEM_0_DOUBLE : 0;
   tu_cs_emit(cs, mem_to_mem_flags);
   tu_cs_emit_qw(cs, write_iova);
   tu_cs_emit_qw(cs, src_iova);
}

static void
emit_copy_query_pool_results(struct tu_cmd_buffer *cmdbuf,
                             struct tu_cs *cs,
                             struct tu_query_pool *pool,
                             uint32_t firstQuery,
                             uint32_t queryCount,
                             struct tu_buffer *buffer,
                             VkDeviceSize dstOffset,
                             VkDeviceSize stride,
                             VkQueryResultFlags flags)
{
   /* From the Vulkan 1.1.130 spec:
    *
    *    vkCmdCopyQueryPoolResults is guaranteed to see the effect of previous
    *    uses of vkCmdResetQueryPool in the same queue, without any additional
    *    synchronization.
    *
    * To ensure that previous writes to the available bit are coherent, first
    * wait for all writes to complete.
    */
   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;
      uint64_t available_iova = query_available_iova(pool, query);
      uint64_t buffer_iova = tu_buffer_iova(buffer) + dstOffset + i * stride;
      uint32_t result_count = get_result_count(pool);

      /* Wait for the available bit to be set if executed with the
       * VK_QUERY_RESULT_WAIT_BIT flag. */
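      /* CP_WAIT_REG_MEM below stalls the CP until the value at
       * available_iova, ANDed with the mask, equals the reference value 1.
       */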
      if (flags & VK_QUERY_RESULT_WAIT_BIT) {
         tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
         tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
                        CP_WAIT_REG_MEM_0_POLL_MEMORY);
         tu_cs_emit_qw(cs, available_iova);
         tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0x1));
         tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
         tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));
      }

      for (uint32_t k = 0; k < result_count; k++) {
         uint64_t result_iova = query_result_iova(pool, query, k);

         if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
            /* Unconditionally copying the bo->result into the buffer here is
             * valid because we only set bo->result on vkCmdEndQuery. Thus, even
             * if the query is unavailable, this will copy the correct partial
             * value of 0.
             */
            copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
                                 k /* offset */, flags);
         } else {
            /* Conditionally copy bo->result into the buffer based on whether the
             * query is available.
             *
             * NOTE: For the conditional packets to be executed, CP_COND_EXEC
             * tests that ADDR0 != 0 and ADDR1 < REF. The packet here simply tests
             * that 0 < available < 2, aka available == 1.
             */
            tu_cs_reserve(cs, 7 + 6);
            tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6);
            tu_cs_emit_qw(cs, available_iova);
            tu_cs_emit_qw(cs, available_iova);
            tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2));
            tu_cs_emit(cs, 6); /* Cond execute the next 6 DWORDS */

            /* Start of conditional execution */
            copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
                                 k /* offset */, flags);
            /* End of conditional execution */
         }
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         copy_query_value_gpu(cmdbuf, cs, available_iova, buffer_iova,
                              result_count /* offset */, flags);
      }
   }

   tu_bo_list_add(&cmdbuf->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
}

void
tu_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,
                           VkQueryPool queryPool,
                           uint32_t firstQuery,
                           uint32_t queryCount,
                           VkBuffer dstBuffer,
                           VkDeviceSize dstOffset,
                           VkDeviceSize stride,
                           VkQueryResultFlags flags)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
   struct tu_cs *cs = &cmdbuf->cs;
   assert(firstQuery + queryCount <= pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      return emit_copy_query_pool_results(cmdbuf, cs, pool, firstQuery,
                                          queryCount, buffer, dstOffset, stride, flags);
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }
}

static void
emit_reset_query_pool(struct tu_cmd_buffer *cmdbuf,
                      struct tu_query_pool *pool,
                      uint32_t firstQuery,
                      uint32_t queryCount)
{
   struct tu_cs *cs = &cmdbuf->cs;

   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;

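      /* Clear the available bit, then zero both result values so that an
       * unavailable query reads back as 0.
       */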
      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
      tu_cs_emit_qw(cs, query_available_iova(pool, query));
      tu_cs_emit_qw(cs, 0x0);

      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
      tu_cs_emit_qw(cs, query_result_iova(pool, query, 0));
      tu_cs_emit_qw(cs, 0x0);
      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
      tu_cs_emit_qw(cs, query_result_iova(pool, query, 1));
      tu_cs_emit_qw(cs, 0x0);
   }
}

void
tu_CmdResetQueryPool(VkCommandBuffer commandBuffer,
                     VkQueryPool queryPool,
                     uint32_t firstQuery,
                     uint32_t queryCount)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);

   switch (pool->type) {
   case VK_QUERY_TYPE_TIMESTAMP:
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      emit_reset_query_pool(cmdbuf, pool, firstQuery, queryCount);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}

static void
emit_begin_occlusion_query(struct tu_cmd_buffer *cmdbuf,
                           struct tu_query_pool *pool,
                           uint32_t query)
{
   /* From the Vulkan 1.1.130 spec:
    *
    *    A query must begin and end inside the same subpass of a render pass
    *    instance, or must both begin and end outside of a render pass
    *    instance.
    *
    * Unlike on an immediate-mode renderer, Turnip renders all tiles on
    * vkCmdEndRenderPass, not individually on each vkCmdDraw*. As such, if a
    * query begins/ends inside the same subpass of a render pass, we need to
    * record the packets on the secondary draw command stream. cmdbuf->draw_cs
    * is then run on every tile during render, so we just need to accumulate
    * sample counts in slot->result to compute the query result.
    */
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;

   uint64_t begin_iova = occlusion_query_iova(pool, query, begin);

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_ADDR_LO(begin_iova));

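   /* ZPASS_DONE makes the RB dump the current sample counter to the address
    * programmed in RB_SAMPLE_COUNT_ADDR above, i.e. slot->begin.
    */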
   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, ZPASS_DONE);
}

static void
emit_begin_xfb_query(struct tu_cmd_buffer *cmdbuf,
                     struct tu_query_pool *pool,
                     uint32_t query,
                     uint32_t stream_id)
{
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
   uint64_t begin_iova = primitive_query_iova(pool, query, begin[0], 0);

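   /* WRITE_PRIMITIVE_COUNTS dumps the written/generated primitive counters
    * for all four streams starting at the address in VPC_SO_STREAM_COUNTS,
    * which is why begin[0] is used as the base regardless of stream_id.
    */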
   tu_cs_emit_regs(cs, A6XX_VPC_SO_STREAM_COUNTS_LO(begin_iova));
   tu6_emit_event_write(cmdbuf, cs, WRITE_PRIMITIVE_COUNTS);
}

void
tu_CmdBeginQuery(VkCommandBuffer commandBuffer,
                 VkQueryPool queryPool,
                 uint32_t query,
                 VkQueryControlFlags flags)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      /* In freedreno, there is no implementation difference between
       * GL_SAMPLES_PASSED and GL_ANY_SAMPLES_PASSED, so we can similarly
       * ignore the VK_QUERY_CONTROL_PRECISE_BIT flag here.
       */
      emit_begin_occlusion_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      emit_begin_xfb_query(cmdbuf, pool, query, 0);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TIMESTAMP:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}

void
tu_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer,
                           VkQueryPool queryPool,
                           uint32_t query,
                           VkQueryControlFlags flags,
                           uint32_t index)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      emit_begin_xfb_query(cmdbuf, pool, query, index);
      break;
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}

static void
emit_end_occlusion_query(struct tu_cmd_buffer *cmdbuf,
                         struct tu_query_pool *pool,
                         uint32_t query)
{
   /* Ending an occlusion query happens in a few steps:
    * 1) Set the slot->end to UINT64_MAX.
    * 2) Set up the SAMPLE_COUNT registers and trigger a CP_EVENT_WRITE to
    *    write the current sample count value into slot->end.
    * 3) Since (2) is asynchronous, wait until slot->end is not equal to
    *    UINT64_MAX before continuing via CP_WAIT_REG_MEM.
    * 4) Accumulate the results of the query (slot->end - slot->begin) into
    *    slot->result.
    * 5) If vkCmdEndQuery is *not* called from within the scope of a render
    *    pass, set the slot's available bit since the query is now done.
    * 6) If vkCmdEndQuery *is* called from within the scope of a render
    *    pass, we cannot mark as available yet since the commands in
    *    draw_cs are not run until vkCmdEndRenderPass.
    */
   const struct tu_render_pass *pass = cmdbuf->state.pass;
   struct tu_cs *cs = pass ? &cmdbuf->draw_cs : &cmdbuf->cs;

   uint64_t available_iova = query_available_iova(pool, query);
   uint64_t begin_iova = occlusion_query_iova(pool, query, begin);
   uint64_t end_iova = occlusion_query_iova(pool, query, end);
   uint64_t result_iova = query_result_iova(pool, query, 0);
   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit_qw(cs, 0xffffffffffffffffull);

   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_ADDR_LO(end_iova));

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, ZPASS_DONE);

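   /* Step (3) above: stall until the RB has replaced the UINT64_MAX sentinel
    * in slot->end with the real sample count.
    */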
   tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
   tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_NE) |
                  CP_WAIT_REG_MEM_0_POLL_MEMORY);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0xffffffff));
   tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
   tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));

   /* result (dst) = result (srcA) + end (srcB) - begin (srcC) */
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
   tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C);
   tu_cs_emit_qw(cs, result_iova);
   tu_cs_emit_qw(cs, result_iova);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit_qw(cs, begin_iova);

   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   if (pass)
      /* Technically, queries should be tracked per-subpass, but here we track
       * at the render pass level to simplify the code a bit. This is safe
       * because the only commands that use the available bit are
       * vkCmdCopyQueryPoolResults and vkCmdResetQueryPool, both of which
       * cannot be invoked from inside a render pass scope.
       */
      cs = &cmdbuf->draw_epilogue_cs;

   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, available_iova);
   tu_cs_emit_qw(cs, 0x1);
}

static void
emit_end_xfb_query(struct tu_cmd_buffer *cmdbuf,
                   struct tu_query_pool *pool,
                   uint32_t query,
                   uint32_t stream_id)
{
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;

   uint64_t end_iova = primitive_query_iova(pool, query, end[0], 0);
   uint64_t result_written_iova = query_result_iova(pool, query, 0);
   uint64_t result_generated_iova = query_result_iova(pool, query, 1);
   uint64_t begin_written_iova = primitive_query_iova(pool, query, begin[stream_id], 0);
   uint64_t begin_generated_iova = primitive_query_iova(pool, query, begin[stream_id], 1);
   uint64_t end_written_iova = primitive_query_iova(pool, query, end[stream_id], 0);
   uint64_t end_generated_iova = primitive_query_iova(pool, query, end[stream_id], 1);
   uint64_t available_iova = query_available_iova(pool, query);

   tu_cs_emit_regs(cs, A6XX_VPC_SO_STREAM_COUNTS_LO(end_iova));
   tu6_emit_event_write(cmdbuf, cs, WRITE_PRIMITIVE_COUNTS);

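   /* Wait for the counter writes above to land and flush them out to memory
    * before the CP reads them back for the subtractions below.
    */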
   tu_cs_emit_wfi(cs);
   tu6_emit_event_write(cmdbuf, cs, CACHE_FLUSH_TS);

   /* Set the count of written primitives */
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
   tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C |
                  CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES | 0x80000000);
   tu_cs_emit_qw(cs, result_written_iova);
   tu_cs_emit_qw(cs, result_written_iova);
   tu_cs_emit_qw(cs, end_written_iova);
   tu_cs_emit_qw(cs, begin_written_iova);

   tu6_emit_event_write(cmdbuf, cs, CACHE_FLUSH_TS);

   /* Set the count of generated primitives */
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
   tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C |
                  CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES | 0x80000000);
   tu_cs_emit_qw(cs, result_generated_iova);
   tu_cs_emit_qw(cs, result_generated_iova);
   tu_cs_emit_qw(cs, end_generated_iova);
   tu_cs_emit_qw(cs, begin_generated_iova);

   /* Set the availability to 1 */
   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, available_iova);
   tu_cs_emit_qw(cs, 0x1);
}

void
tu_CmdEndQuery(VkCommandBuffer commandBuffer,
               VkQueryPool queryPool,
               uint32_t query)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_end_occlusion_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      emit_end_xfb_query(cmdbuf, pool, query, 0);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TIMESTAMP:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}

void
tu_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer,
                         VkQueryPool queryPool,
                         uint32_t query,
                         uint32_t index)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      assert(index < 4);
      emit_end_xfb_query(cmdbuf, pool, query, index);
      break;
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}

void
tu_CmdWriteTimestamp(VkCommandBuffer commandBuffer,
                     VkPipelineStageFlagBits pipelineStage,
                     VkQueryPool queryPool,
                     uint32_t query)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   struct tu_cs *cs = cmd->state.pass ? &cmd->draw_epilogue_cs : &cmd->cs;

   /* WFI to get more accurate timestamp */
   tu_cs_emit_wfi(cs);

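   /* Copy the two 32-bit halves of the always-on counter (CNT(2) registers
    * starting at CP_ALWAYS_ON_COUNTER_LO) into the result slot as a single
    * 64-bit value.
    */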
   tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
   tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_CP_ALWAYS_ON_COUNTER_LO) |
                  CP_REG_TO_MEM_0_CNT(2) |
                  CP_REG_TO_MEM_0_64B);
   tu_cs_emit_qw(cs, query_result_iova(pool, query, 0));

   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, query_available_iova(pool, query));
   tu_cs_emit_qw(cs, 0x1);

   if (cmd->state.pass) {
      /* TODO: to have useful in-renderpass timestamps:
       * for the sysmem path, we can just emit the timestamp in draw_cs,
       * for the gmem path, we would have to accumulate somehow, but I'm not
       * sure that would follow the spec.
       */
      tu_finishme("CmdWriteTimestamp in renderpass not accurate");
   }
}