/* mesa: src/freedreno/vulkan/tu_query.c @ 8d91fe59c44badb5a63ae3531f50507332331e86 */
/*
 * Copyright 2016 Red Hat Inc.
 * Based on anv:
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include "tu_private.h"

#include <assert.h>
#include <fcntl.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>

#include "adreno_pm4.xml.h"
#include "adreno_common.xml.h"
#include "a6xx.xml.h"

#include "nir/nir_builder.h"
#include "util/os_time.h"

#include "tu_cs.h"

#define NSEC_PER_SEC 1000000000ull
#define WAIT_TIMEOUT 5

struct PACKED query_slot {
   uint64_t available;
};

struct PACKED occlusion_slot_value {
   /* The hardware seems to write sample counters at 16-byte-aligned
    * addresses even though this query only needs an 8-byte slot. */
   uint64_t value;
   uint64_t _padding;
};

struct PACKED occlusion_query_slot {
   struct query_slot common;
   uint64_t result;

   struct occlusion_slot_value begin;
   struct occlusion_slot_value end;
};

struct PACKED timestamp_query_slot {
   struct query_slot common;
   uint64_t result;
};

struct PACKED primitive_slot_value {
   uint64_t values[2];
};

struct PACKED primitive_query_slot {
   struct query_slot common;
   /* The result of a transform feedback query is two integer values:
    *    results[0] is the count of primitives written,
    *    results[1] is the count of primitives generated.
    * In addition, the begin/end counter snapshots below are stored per
    * stream, in four slots each.
    */
   uint64_t results[2];

   /* Primitive counters also need to be 16-byte aligned. */
   uint64_t _padding;

   struct primitive_slot_value begin[4];
   struct primitive_slot_value end[4];
};

/* Returns the IOVA of a given uint64_t field in a given slot of a query
 * pool. */
#define query_iova(type, pool, query, field) \
   pool->bo.iova + pool->stride * (query) + offsetof(type, field)

#define occlusion_query_iova(pool, query, field) \
   query_iova(struct occlusion_query_slot, pool, query, field)

#define primitive_query_iova(pool, query, field, i) \
   query_iova(struct primitive_query_slot, pool, query, field) + \
   offsetof(struct primitive_slot_value, values[i])

#define query_available_iova(pool, query) \
   query_iova(struct query_slot, pool, query, available)

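/* query_result_iova()/query_result_addr() below rely on the layout of the
 * slot structs above: every slot type stores its result value(s) as
 * uint64_t's immediately after the common query_slot header, so result i of
 * any slot sits at offset sizeof(struct query_slot) + sizeof(uint64_t) * i.
 *
 * For example, an occlusion slot is laid out as:
 *
 *    offset  0: common.available
 *    offset  8: result
 *    offset 16: begin (value + padding)
 *    offset 32: end   (value + padding)
 */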
#define query_result_iova(pool, query, i) \
   pool->bo.iova + pool->stride * (query) + \
   sizeof(struct query_slot) + sizeof(uint64_t) * i

#define query_result_addr(pool, query, i) \
   pool->bo.map + pool->stride * query + \
   sizeof(struct query_slot) + sizeof(uint64_t) * i

#define query_is_available(slot) slot->available

/*
 * Returns a pointer to a given slot in a query pool.
 */
static void* slot_address(struct tu_query_pool *pool, uint32_t query)
{
   return (char*)pool->bo.map + query * pool->stride;
}

VkResult
tu_CreateQueryPool(VkDevice _device,
                   const VkQueryPoolCreateInfo *pCreateInfo,
                   const VkAllocationCallbacks *pAllocator,
                   VkQueryPool *pQueryPool)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
   assert(pCreateInfo->queryCount > 0);

   uint32_t slot_size;
   switch (pCreateInfo->queryType) {
   case VK_QUERY_TYPE_OCCLUSION:
      slot_size = sizeof(struct occlusion_query_slot);
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
      slot_size = sizeof(struct timestamp_query_slot);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      slot_size = sizeof(struct primitive_query_slot);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   struct tu_query_pool *pool =
         vk_object_alloc(&device->vk, pAllocator, sizeof(*pool),
                         VK_OBJECT_TYPE_QUERY_POOL);
   if (!pool)
      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

   VkResult result = tu_bo_init_new(device, &pool->bo,
                                    pCreateInfo->queryCount * slot_size);
   if (result != VK_SUCCESS) {
      vk_object_free(&device->vk, pAllocator, pool);
      return result;
   }

   result = tu_bo_map(device, &pool->bo);
   if (result != VK_SUCCESS) {
      tu_bo_finish(device, &pool->bo);
      vk_object_free(&device->vk, pAllocator, pool);
      return result;
   }

   /* Initialize all query statuses to unavailable */
   memset(pool->bo.map, 0, pool->bo.size);

   pool->type = pCreateInfo->queryType;
   pool->stride = slot_size;
   pool->size = pCreateInfo->queryCount;
   pool->pipeline_statistics = pCreateInfo->pipelineStatistics;
   *pQueryPool = tu_query_pool_to_handle(pool);

   return VK_SUCCESS;
}

void
tu_DestroyQueryPool(VkDevice _device,
                    VkQueryPool _pool,
                    const VkAllocationCallbacks *pAllocator)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   TU_FROM_HANDLE(tu_query_pool, pool, _pool);

   if (!pool)
      return;

   tu_bo_finish(device, &pool->bo);
   vk_object_free(&device->vk, pAllocator, pool);
}

static uint32_t
get_result_count(struct tu_query_pool *pool)
{
   switch (pool->type) {
   /* Occlusion and timestamp queries write one integer value */
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
      return 1;
   /* Transform feedback queries write two integer values */
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      return 2;
   default:
      assert(!"Invalid query type");
      return 0;
   }
}

/* Wait on the availability status of a query until a timeout. */
static VkResult
wait_for_available(struct tu_device *device, struct tu_query_pool *pool,
                   uint32_t query)
{
   /* TODO: Use the MSM_IOVA_WAIT ioctl to wait on the available bit in a
    * scheduler friendly way instead of busy polling once the patch has landed
    * upstream. */
   struct query_slot *slot = slot_address(pool, query);
   uint64_t abs_timeout = os_time_get_absolute_timeout(
         WAIT_TIMEOUT * NSEC_PER_SEC);
   while (os_time_get_nano() < abs_timeout) {
      if (query_is_available(slot))
         return VK_SUCCESS;
   }
   return vk_error(device->instance, VK_TIMEOUT);
}

/* Writes a query value to a buffer from the CPU. */
static void
write_query_value_cpu(char* base,
                      uint32_t offset,
                      uint64_t value,
                      VkQueryResultFlags flags)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      *(uint64_t*)(base + (offset * sizeof(uint64_t))) = value;
   } else {
      *(uint32_t*)(base + (offset * sizeof(uint32_t))) = value;
   }
}

static VkResult
get_query_pool_results(struct tu_device *device,
                       struct tu_query_pool *pool,
                       uint32_t firstQuery,
                       uint32_t queryCount,
                       size_t dataSize,
                       void *pData,
                       VkDeviceSize stride,
                       VkQueryResultFlags flags)
{
   assert(dataSize >= stride * queryCount);

   char *result_base = pData;
   VkResult result = VK_SUCCESS;
   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;
      struct query_slot *slot = slot_address(pool, query);
      bool available = query_is_available(slot);
      uint32_t result_count = get_result_count(pool);

      if ((flags & VK_QUERY_RESULT_WAIT_BIT) && !available) {
         VkResult wait_result = wait_for_available(device, pool, query);
         if (wait_result != VK_SUCCESS)
            return wait_result;
         available = true;
      } else if (!(flags & VK_QUERY_RESULT_PARTIAL_BIT) && !available) {
         /* From the Vulkan 1.1.130 spec:
          *
          *    If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
          *    both not set then no result values are written to pData for
          *    queries that are in the unavailable state at the time of the
          *    call, and vkGetQueryPoolResults returns VK_NOT_READY. However,
          *    availability state is still written to pData for those queries
          *    if VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set.
          */
         result = VK_NOT_READY;
         if (!(flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)) {
            result_base += stride;
            continue;
         }
      }

      for (uint32_t k = 0; k < result_count; k++) {
         if (available) {
            uint64_t *result = query_result_addr(pool, query, k);
            write_query_value_cpu(result_base, k, *result, flags);
         } else if (flags & VK_QUERY_RESULT_PARTIAL_BIT)
            /* From the Vulkan 1.1.130 spec:
             *
             *    If VK_QUERY_RESULT_PARTIAL_BIT is set, VK_QUERY_RESULT_WAIT_BIT
             *    is not set, and the query’s status is unavailable, an
             *    intermediate result value between zero and the final result
             *    value is written to pData for that query.
             *
             * Just return 0 here for simplicity since it's a valid result.
             */
            write_query_value_cpu(result_base, k, 0, flags);
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
         /* From the Vulkan 1.1.130 spec:
          *
          *    If VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set, the final
          *    integer value written for each query is non-zero if the query’s
          *    status was available or zero if the status was unavailable.
          */
         write_query_value_cpu(result_base, result_count, available, flags);

      result_base += stride;
   }
   return result;
}

VkResult
tu_GetQueryPoolResults(VkDevice _device,
                       VkQueryPool queryPool,
                       uint32_t firstQuery,
                       uint32_t queryCount,
                       size_t dataSize,
                       void *pData,
                       VkDeviceSize stride,
                       VkQueryResultFlags flags)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(firstQuery + queryCount <= pool->size);

   if (tu_device_is_lost(device))
      return VK_ERROR_DEVICE_LOST;

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      return get_query_pool_results(device, pool, firstQuery, queryCount,
                                    dataSize, pData, stride, flags);
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }
   return VK_SUCCESS;
}

/* Copies a query value from one buffer to another from the GPU. */
static void
copy_query_value_gpu(struct tu_cmd_buffer *cmdbuf,
                     struct tu_cs *cs,
                     uint64_t src_iova,
                     uint64_t base_write_iova,
                     uint32_t offset,
                     VkQueryResultFlags flags)
{
   uint32_t element_size = flags & VK_QUERY_RESULT_64_BIT ?
         sizeof(uint64_t) : sizeof(uint32_t);
   uint64_t write_iova = base_write_iova + (offset * element_size);

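   /* CP_MEM_TO_MEM copies the value at src_iova into write_iova; the DOUBLE
    * bit selects a 64-bit copy instead of the default 32-bit one, matching
    * the element size requested by the flags.
    */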
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
   uint32_t mem_to_mem_flags = flags & VK_QUERY_RESULT_64_BIT ?
         CP_MEM_TO_MEM_0_DOUBLE : 0;
   tu_cs_emit(cs, mem_to_mem_flags);
   tu_cs_emit_qw(cs, write_iova);
   tu_cs_emit_qw(cs, src_iova);
}

static void
emit_copy_query_pool_results(struct tu_cmd_buffer *cmdbuf,
                             struct tu_cs *cs,
                             struct tu_query_pool *pool,
                             uint32_t firstQuery,
                             uint32_t queryCount,
                             struct tu_buffer *buffer,
                             VkDeviceSize dstOffset,
                             VkDeviceSize stride,
                             VkQueryResultFlags flags)
{
   /* From the Vulkan 1.1.130 spec:
    *
    *    vkCmdCopyQueryPoolResults is guaranteed to see the effect of previous
    *    uses of vkCmdResetQueryPool in the same queue, without any additional
    *    synchronization.
    *
    * To ensure that previous writes to the available bit are coherent, first
    * wait for all writes to complete.
    */
   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;
      uint64_t available_iova = query_available_iova(pool, query);
      uint64_t buffer_iova = tu_buffer_iova(buffer) + dstOffset + i * stride;
      uint32_t result_count = get_result_count(pool);

      /* Wait for the available bit to be set if executed with the
       * VK_QUERY_RESULT_WAIT_BIT flag. */
      if (flags & VK_QUERY_RESULT_WAIT_BIT) {
         tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
         tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
                        CP_WAIT_REG_MEM_0_POLL_MEMORY);
         tu_cs_emit_qw(cs, available_iova);
         tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0x1));
         tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
         tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));
      }

      for (uint32_t k = 0; k < result_count; k++) {
         uint64_t result_iova = query_result_iova(pool, query, k);

         if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
            /* Unconditionally copying the bo->result into the buffer here is
             * valid because we only set bo->result on vkCmdEndQuery. Thus, even
             * if the query is unavailable, this will copy the correct partial
             * value of 0.
             */
            copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
                                 k /* offset */, flags);
         } else {
            /* Conditionally copy bo->result into the buffer based on whether the
             * query is available.
             *
             * NOTE: For the conditional packets to be executed, CP_COND_EXEC
             * tests that ADDR0 != 0 and ADDR1 < REF. The packet here simply tests
             * that 0 < available < 2, aka available == 1.
             */
            tu_cs_reserve(cs, 7 + 6);
            tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6);
            tu_cs_emit_qw(cs, available_iova);
            tu_cs_emit_qw(cs, available_iova);
            tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2));
            tu_cs_emit(cs, 6); /* Cond execute the next 6 DWORDS */

            /* Start of conditional execution */
            copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
                                 k /* offset */, flags);
            /* End of conditional execution */
         }
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         copy_query_value_gpu(cmdbuf, cs, available_iova, buffer_iova,
                              result_count /* offset */, flags);
      }
   }

   tu_bo_list_add(&cmdbuf->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
}

void
tu_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,
                           VkQueryPool queryPool,
                           uint32_t firstQuery,
                           uint32_t queryCount,
                           VkBuffer dstBuffer,
                           VkDeviceSize dstOffset,
                           VkDeviceSize stride,
                           VkQueryResultFlags flags)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
   struct tu_cs *cs = &cmdbuf->cs;
   assert(firstQuery + queryCount <= pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      return emit_copy_query_pool_results(cmdbuf, cs, pool, firstQuery,
                                          queryCount, buffer, dstOffset,
                                          stride, flags);
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }
}

static void
emit_reset_query_pool(struct tu_cmd_buffer *cmdbuf,
                      struct tu_query_pool *pool,
                      uint32_t firstQuery,
                      uint32_t queryCount)
{
   struct tu_cs *cs = &cmdbuf->cs;

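   /* For each slot in the range, zero the availability bit and all result
    * values from the command stream with CP_MEM_WRITE.
    */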
   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;

      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
      tu_cs_emit_qw(cs, query_available_iova(pool, query));
      tu_cs_emit_qw(cs, 0x0);

      for (uint32_t k = 0; k < get_result_count(pool); k++) {
         tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
         tu_cs_emit_qw(cs, query_result_iova(pool, query, k));
         tu_cs_emit_qw(cs, 0x0);
      }
   }
}

void
tu_CmdResetQueryPool(VkCommandBuffer commandBuffer,
                     VkQueryPool queryPool,
                     uint32_t firstQuery,
                     uint32_t queryCount)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);

   switch (pool->type) {
   case VK_QUERY_TYPE_TIMESTAMP:
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      emit_reset_query_pool(cmdbuf, pool, firstQuery, queryCount);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}

static void
emit_begin_occlusion_query(struct tu_cmd_buffer *cmdbuf,
                           struct tu_query_pool *pool,
                           uint32_t query)
{
   /* From the Vulkan 1.1.130 spec:
    *
    *    A query must begin and end inside the same subpass of a render pass
    *    instance, or must both begin and end outside of a render pass
    *    instance.
    *
    * Unlike on an immediate-mode renderer, Turnip renders all tiles on
    * vkCmdEndRenderPass, not individually on each vkCmdDraw*. As such, if a
    * query begins/ends inside the same subpass of a render pass, we need to
    * record the packets on the secondary draw command stream. cmdbuf->draw_cs
    * is then run on every tile during render, so we just need to accumulate
    * sample counts in slot->result to compute the query result.
    */
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;

   uint64_t begin_iova = occlusion_query_iova(pool, query, begin);

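   /* With RB_SAMPLE_COUNT_CONTROL.copy set, the ZPASS_DONE event below makes
    * the hardware write the current sample counter to slot->begin; see the
    * matching sequence in emit_end_occlusion_query().
    */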
   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_ADDR_LO(begin_iova));

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, ZPASS_DONE);
}

static void
emit_begin_xfb_query(struct tu_cmd_buffer *cmdbuf,
                     struct tu_query_pool *pool,
                     uint32_t query,
                     uint32_t stream_id)
{
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
   uint64_t begin_iova = primitive_query_iova(pool, query, begin[0], 0);

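   /* Point VPC_SO_STREAM_COUNTS at slot->begin and ask the hardware to dump
    * the current primitive counters there. WRITE_PRIMITIVE_COUNTS appears to
    * write the written/generated pair for all four streams at once, which is
    * why begin[0] is used here regardless of stream_id.
    */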
   tu_cs_emit_regs(cs, A6XX_VPC_SO_STREAM_COUNTS_LO(begin_iova));
   tu6_emit_event_write(cmdbuf, cs, WRITE_PRIMITIVE_COUNTS);
}

void
tu_CmdBeginQuery(VkCommandBuffer commandBuffer,
                 VkQueryPool queryPool,
                 uint32_t query,
                 VkQueryControlFlags flags)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      /* In freedreno, there is no implementation difference between
       * GL_SAMPLES_PASSED and GL_ANY_SAMPLES_PASSED, so we can similarly
       * ignore the VK_QUERY_CONTROL_PRECISE_BIT flag here.
       */
      emit_begin_occlusion_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      emit_begin_xfb_query(cmdbuf, pool, query, 0);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TIMESTAMP:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}

void
tu_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer,
                           VkQueryPool queryPool,
                           uint32_t query,
                           VkQueryControlFlags flags,
                           uint32_t index)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      emit_begin_xfb_query(cmdbuf, pool, query, index);
      break;
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}

static void
emit_end_occlusion_query(struct tu_cmd_buffer *cmdbuf,
                         struct tu_query_pool *pool,
                         uint32_t query)
{
   /* Ending an occlusion query happens in a few steps:
    *    1) Set the slot->end to UINT64_MAX.
    *    2) Set up the SAMPLE_COUNT registers and trigger a CP_EVENT_WRITE to
    *       write the current sample count value into slot->end.
    *    3) Since (2) is asynchronous, wait until slot->end is not equal to
    *       UINT64_MAX before continuing via CP_WAIT_REG_MEM.
    *    4) Accumulate the results of the query (slot->end - slot->begin) into
    *       slot->result.
    *    5) If vkCmdEndQuery is *not* called from within the scope of a render
    *       pass, set the slot's available bit since the query is now done.
    *    6) If vkCmdEndQuery *is* called from within the scope of a render
    *       pass, we cannot mark as available yet since the commands in
    *       draw_cs are not run until vkCmdEndRenderPass.
    */
   const struct tu_render_pass *pass = cmdbuf->state.pass;
   struct tu_cs *cs = pass ? &cmdbuf->draw_cs : &cmdbuf->cs;

   uint64_t available_iova = query_available_iova(pool, query);
   uint64_t begin_iova = occlusion_query_iova(pool, query, begin);
   uint64_t end_iova = occlusion_query_iova(pool, query, end);
   uint64_t result_iova = query_result_iova(pool, query, 0);
   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit_qw(cs, 0xffffffffffffffffull);

   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_ADDR_LO(end_iova));

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, ZPASS_DONE);

   tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
   tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_NE) |
                  CP_WAIT_REG_MEM_0_POLL_MEMORY);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0xffffffff));
   tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
   tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));

   /* result (dst) = result (srcA) + end (srcB) - begin (srcC) */
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
   tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C);
   tu_cs_emit_qw(cs, result_iova);
   tu_cs_emit_qw(cs, result_iova);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit_qw(cs, begin_iova);

   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   if (pass)
      /* Technically, queries should be tracked per-subpass, but here we track
       * at the render pass level to simplify the code a bit. This is safe
       * because the only commands that use the available bit are
       * vkCmdCopyQueryPoolResults and vkCmdResetQueryPool, both of which
       * cannot be invoked from inside a render pass scope.
       */
      cs = &cmdbuf->draw_epilogue_cs;

   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, available_iova);
   tu_cs_emit_qw(cs, 0x1);
}

static void
emit_end_xfb_query(struct tu_cmd_buffer *cmdbuf,
                   struct tu_query_pool *pool,
                   uint32_t query,
                   uint32_t stream_id)
{
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;

   uint64_t end_iova = primitive_query_iova(pool, query, end[0], 0);
   uint64_t result_written_iova = query_result_iova(pool, query, 0);
   uint64_t result_generated_iova = query_result_iova(pool, query, 1);
   uint64_t begin_written_iova = primitive_query_iova(pool, query, begin[stream_id], 0);
   uint64_t begin_generated_iova = primitive_query_iova(pool, query, begin[stream_id], 1);
   uint64_t end_written_iova = primitive_query_iova(pool, query, end[stream_id], 0);
   uint64_t end_generated_iova = primitive_query_iova(pool, query, end[stream_id], 1);
   uint64_t available_iova = query_available_iova(pool, query);

   tu_cs_emit_regs(cs, A6XX_VPC_SO_STREAM_COUNTS_LO(end_iova));
   tu6_emit_event_write(cmdbuf, cs, WRITE_PRIMITIVE_COUNTS);

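   /* Presumably the WFI plus CACHE_FLUSH_TS ensure the counter writes
    * triggered above have landed before the CP_MEM_TO_MEM packets below read
    * them back.
    */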
   tu_cs_emit_wfi(cs);
   tu6_emit_event_write(cmdbuf, cs, CACHE_FLUSH_TS);

   /* Set the count of written primitives */
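   /* result_written (dst) = result_written (srcA) + end_written (srcB) -
    * begin_written (srcC), mirroring the accumulation done for occlusion
    * queries above.
    */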
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
   tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C |
                  CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES | 0x80000000);
   tu_cs_emit_qw(cs, result_written_iova);
   tu_cs_emit_qw(cs, result_written_iova);
   tu_cs_emit_qw(cs, end_written_iova);
   tu_cs_emit_qw(cs, begin_written_iova);

   tu6_emit_event_write(cmdbuf, cs, CACHE_FLUSH_TS);

   /* Set the count of generated primitives */
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
   tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C |
                  CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES | 0x80000000);
   tu_cs_emit_qw(cs, result_generated_iova);
   tu_cs_emit_qw(cs, result_generated_iova);
   tu_cs_emit_qw(cs, end_generated_iova);
   tu_cs_emit_qw(cs, begin_generated_iova);

   /* Set the availability to 1 */
   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, available_iova);
   tu_cs_emit_qw(cs, 0x1);
}

/* Implement this bit of spec text from section 17.2 "Query Operation":
 *
 *    If queries are used while executing a render pass instance that has
 *    multiview enabled, the query uses N consecutive query indices in the
 *    query pool (starting at query) where N is the number of bits set in the
 *    view mask in the subpass the query is used in. How the numerical
 *    results of the query are distributed among the queries is
 *    implementation-dependent. For example, some implementations may write
 *    each view’s results to a distinct query, while other implementations
 *    may write the total result to the first query and write zero to the
 *    other queries. However, the sum of the results in all the queries must
 *    accurately reflect the total result of the query summed over all views.
 *    Applications can sum the results from all the queries to compute the
 *    total result.
 *
 * Since we execute all views at once, we write zero to the other queries.
 * Furthermore, because queries must be reset before use, and we set the
 * result to 0 in vkCmdResetQueryPool(), we just need to mark it as available.
 */

static void
handle_multiview_queries(struct tu_cmd_buffer *cmd,
                         struct tu_query_pool *pool,
                         uint32_t query)
{
   if (!cmd->state.pass || !cmd->state.subpass->multiview_mask)
      return;

   unsigned views = util_bitcount(cmd->state.subpass->multiview_mask);
   struct tu_cs *cs = &cmd->draw_epilogue_cs;

   for (uint32_t i = 1; i < views; i++) {
      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
      tu_cs_emit_qw(cs, query_available_iova(pool, query + i));
      tu_cs_emit_qw(cs, 0x1);
   }
}

void
tu_CmdEndQuery(VkCommandBuffer commandBuffer,
               VkQueryPool queryPool,
               uint32_t query)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_end_occlusion_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      emit_end_xfb_query(cmdbuf, pool, query, 0);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TIMESTAMP:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   handle_multiview_queries(cmdbuf, pool, query);

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}

void
tu_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer,
                         VkQueryPool queryPool,
                         uint32_t query,
                         uint32_t index)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      assert(index < 4);
      emit_end_xfb_query(cmdbuf, pool, query, index);
      break;
   default:
      assert(!"Invalid query type");
   }

   tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
}

void
tu_CmdWriteTimestamp(VkCommandBuffer commandBuffer,
                     VkPipelineStageFlagBits pipelineStage,
                     VkQueryPool queryPool,
                     uint32_t query)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);

   tu_bo_list_add(&cmd->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);

   /* Inside a render pass, just write the timestamp multiple times so that
    * the user gets the last one if we use GMEM. There isn't really much
    * better we can do, and this seems to be what the blob does too.
    */
   struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs;

   /* Stages that will already have been executed by the time the CP executes
    * the REG_TO_MEM. DrawIndirect parameters are read by the CP, so the draw
    * indirect stage counts as top-of-pipe too.
    */
   VkPipelineStageFlags top_of_pipe_flags =
      VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT |
      VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT;

   if (pipelineStage & ~top_of_pipe_flags) {
      /* Execute a WFI so that all commands complete. Note that CP_REG_TO_MEM
       * does CP_WAIT_FOR_ME internally, which will wait for the WFI to
       * complete.
       *
       * Stalling the CP like this is really unfortunate, but I don't think
       * there's a better solution that allows all 48 bits of precision
       * because CP_EVENT_WRITE doesn't support 64-bit timestamps.
       */
      tu_cs_emit_wfi(cs);
   }

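   /* Copy both halves (LO/HI) of the always-on counter into result slot 0 to
    * form the 64-bit timestamp value.
    */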
   tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
   tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_CP_ALWAYS_ON_COUNTER_LO) |
                  CP_REG_TO_MEM_0_CNT(2) |
                  CP_REG_TO_MEM_0_64B);
   tu_cs_emit_qw(cs, query_result_iova(pool, query, 0));

   /* Only flag availability once the entire renderpass is done, similar to
    * the begin/end path.
    */
   cs = cmd->state.pass ? &cmd->draw_epilogue_cs : &cmd->cs;

   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, query_available_iova(pool, query));
   tu_cs_emit_qw(cs, 0x1);

   /* From the spec for vkCmdWriteTimestamp:
    *
    *    If vkCmdWriteTimestamp is called while executing a render pass
    *    instance that has multiview enabled, the timestamp uses N consecutive
    *    query indices in the query pool (starting at query) where N is the
    *    number of bits set in the view mask of the subpass the command is
    *    executed in. The resulting query values are determined by an
    *    implementation-dependent choice of one of the following behaviors:
    *
    *    -   The first query is a timestamp value and (if more than one bit is
    *        set in the view mask) zero is written to the remaining queries.
    *        If two timestamps are written in the same subpass, the sum of the
    *        execution time of all views between those commands is the
    *        difference between the first query written by each command.
    *
    *    -   All N queries are timestamp values. If two timestamps are written
    *        in the same subpass, the sum of the execution time of all views
    *        between those commands is the sum of the difference between
    *        corresponding queries written by each command. The difference
    *        between corresponding queries may be the execution time of a
    *        single view.
    *
    * We execute all views in the same draw call, so we implement the first
    * option, the same as regular queries.
    */
   handle_multiview_queries(cmd, pool, query);
}