tu: Implement multiview query interactions
[mesa.git] / src / freedreno / vulkan / tu_query.c
1 /*
2 * Copyright 2016 Red Hat Inc.
3 * Based on anv:
4 * Copyright © 2015 Intel Corporation
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice (including the next
14 * paragraph) shall be included in all copies or substantial portions of the
15 * Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23 * DEALINGS IN THE SOFTWARE.
24 */
25
26 #include "tu_private.h"
27
28 #include <assert.h>
29 #include <fcntl.h>
30 #include <stdbool.h>
31 #include <string.h>
32 #include <unistd.h>
33
34 #include "adreno_pm4.xml.h"
35 #include "adreno_common.xml.h"
36 #include "a6xx.xml.h"
37
38 #include "nir/nir_builder.h"
39 #include "util/os_time.h"
40
41 #include "tu_cs.h"
42
43 #define NSEC_PER_SEC 1000000000ull
44 #define WAIT_TIMEOUT 5
45
46 /* Depending on the query type, there might be 2 integer values.
47 * e.g. for VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
48 * values[0]: primitives written, values[1]: primitives generated.
49 */
50 struct PACKED slot_value {
51 uint64_t values[2];
52 };
53
54 struct PACKED query_slot {
55 struct slot_value available; /* 0 when unavailable, 1 when available */
56 struct slot_value result;
57 };
58
59 struct PACKED occlusion_query_slot {
60 struct query_slot common;
61 struct slot_value begin;
62 struct slot_value end;
63 };
64
65 /* The result of transform feedback queries is two integer values:
66 * common.result.values[0] is the count of primitives written,
67 * common.result.values[1] is the count of primitives generated.
68 * In addition, begin and end counter snapshots are stored for each of the 4 streams.
69 */
70 struct PACKED primitive_query_slot {
71 struct query_slot common;
72 struct slot_value begin[4];
73 struct slot_value end[4];
74 };
75
76 /* Returns the IOVA of a given uint64_t field in a given slot of a query
77 * pool. */
78 #define query_iova(type, pool, query, field, value_index) \
79 pool->bo.iova + pool->stride * (query) + offsetof(type, field) + \
80 offsetof(struct slot_value, values[value_index])
81
82 #define occlusion_query_iova(pool, query, field) \
83 query_iova(struct occlusion_query_slot, pool, query, field, 0)
84
85 #define primitive_query_iova(pool, query, field, i) \
86 query_iova(struct primitive_query_slot, pool, query, field, i)
87
88 #define query_available_iova(pool, query) \
89 query_iova(struct query_slot, pool, query, available, 0)
90
91 #define query_result_iova(pool, query, i) \
92 query_iova(struct query_slot, pool, query, result, i)
93
94 #define query_is_available(slot) slot->available.values[0]
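/* A worked example of the addressing scheme above (illustrative numbers only,
 * not taken from real state): for an occlusion pool, stride is
 * sizeof(struct occlusion_query_slot), so
 *
 *    occlusion_query_iova(pool, 2, end)
 *       == pool->bo.iova
 *        + 2 * pool->stride
 *        + offsetof(struct occlusion_query_slot, end)
 *        + offsetof(struct slot_value, values[0])
 *
 * i.e. every query owns one fixed-size slot in the pool BO, and each field is
 * reached with plain struct offsets inside that slot.
 */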
95
96 /*
97 * Returns a pointer to a given slot in a query pool.
98 */
99 static void* slot_address(struct tu_query_pool *pool, uint32_t query)
100 {
101 return (char*)pool->bo.map + query * pool->stride;
102 }
103
104 VkResult
105 tu_CreateQueryPool(VkDevice _device,
106 const VkQueryPoolCreateInfo *pCreateInfo,
107 const VkAllocationCallbacks *pAllocator,
108 VkQueryPool *pQueryPool)
109 {
110 TU_FROM_HANDLE(tu_device, device, _device);
111 assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
112 assert(pCreateInfo->queryCount > 0);
113
114 uint32_t slot_size;
115 switch (pCreateInfo->queryType) {
116 case VK_QUERY_TYPE_OCCLUSION:
117 slot_size = sizeof(struct occlusion_query_slot);
118 break;
119 case VK_QUERY_TYPE_TIMESTAMP:
120 slot_size = sizeof(struct query_slot);
121 break;
122 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
123 slot_size = sizeof(struct primitive_query_slot);
124 break;
125 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
126 unreachable("Unimplemented query type");
127 default:
128 assert(!"Invalid query type");
129 }
130
131 struct tu_query_pool *pool =
132 vk_object_alloc(&device->vk, pAllocator, sizeof(*pool),
133 VK_OBJECT_TYPE_QUERY_POOL);
134 if (!pool)
135 return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
136
137 VkResult result = tu_bo_init_new(device, &pool->bo,
138 pCreateInfo->queryCount * slot_size);
139 if (result != VK_SUCCESS) {
140 vk_object_free(&device->vk, pAllocator, pool);
141 return result;
142 }
143
144 result = tu_bo_map(device, &pool->bo);
145 if (result != VK_SUCCESS) {
146 tu_bo_finish(device, &pool->bo);
147 vk_object_free(&device->vk, pAllocator, pool);
148 return result;
149 }
150
151 /* Initialize all query statuses to unavailable */
152 memset(pool->bo.map, 0, pool->bo.size);
153
154 pool->type = pCreateInfo->queryType;
155 pool->stride = slot_size;
156 pool->size = pCreateInfo->queryCount;
157 pool->pipeline_statistics = pCreateInfo->pipelineStatistics;
158 *pQueryPool = tu_query_pool_to_handle(pool);
159
160 return VK_SUCCESS;
161 }
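/* As a concrete illustration of the allocation above (hypothetical numbers):
 * a pool created with queryType = VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT
 * and queryCount = 8 gets a single BO of
 *
 *    8 * sizeof(struct primitive_query_slot)
 *
 * bytes, zero-initialized so every query starts out unavailable, with slot i
 * beginning at byte offset i * pool->stride.
 */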
162
163 void
164 tu_DestroyQueryPool(VkDevice _device,
165 VkQueryPool _pool,
166 const VkAllocationCallbacks *pAllocator)
167 {
168 TU_FROM_HANDLE(tu_device, device, _device);
169 TU_FROM_HANDLE(tu_query_pool, pool, _pool);
170
171 if (!pool)
172 return;
173
174 tu_bo_finish(device, &pool->bo);
175 vk_object_free(&device->vk, pAllocator, pool);
176 }
177
178 static uint32_t
179 get_result_count(struct tu_query_pool *pool)
180 {
181 switch (pool->type) {
182 /* Occlusion and timestamp queries write one integer value */
183 case VK_QUERY_TYPE_OCCLUSION:
184 case VK_QUERY_TYPE_TIMESTAMP:
185 return 1;
186 /* Transform feedback queries write two integer values */
187 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
188 return 2;
189 default:
190 assert(!"Invalid query type");
191 return 0;
192 }
193 }
194
195 /* Wait on the availability status of a query up until a timeout. */
196 static VkResult
197 wait_for_available(struct tu_device *device, struct tu_query_pool *pool,
198 uint32_t query)
199 {
200 /* TODO: Use the MSM_IOVA_WAIT ioctl to wait on the available bit in a
201 * scheduler friendly way instead of busy polling once the patch has landed
202 * upstream. */
203 struct query_slot *slot = slot_address(pool, query);
204 uint64_t abs_timeout = os_time_get_absolute_timeout(
205 WAIT_TIMEOUT * NSEC_PER_SEC);
206 while(os_time_get_nano() < abs_timeout) {
207 if (query_is_available(slot))
208 return VK_SUCCESS;
209 }
210 return vk_error(device->instance, VK_TIMEOUT);
211 }
212
213 /* Writes a query value to a buffer from the CPU. */
214 static void
215 write_query_value_cpu(char* base,
216 uint32_t offset,
217 uint64_t value,
218 VkQueryResultFlags flags)
219 {
220 if (flags & VK_QUERY_RESULT_64_BIT) {
221 *(uint64_t*)(base + (offset * sizeof(uint64_t))) = value;
222 } else {
223 *(uint32_t*)(base + (offset * sizeof(uint32_t))) = value;
224 }
225 }
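/* For example (illustrative only): with VK_QUERY_RESULT_64_BIT set, the value
 * at logical offset k lands at base + k * 8; without it, the same value is
 * truncated to 32 bits and lands at base + k * 4, matching the packed layout
 * vkGetQueryPoolResults produces within each query's stride.
 */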
226
227 static VkResult
228 get_query_pool_results(struct tu_device *device,
229 struct tu_query_pool *pool,
230 uint32_t firstQuery,
231 uint32_t queryCount,
232 size_t dataSize,
233 void *pData,
234 VkDeviceSize stride,
235 VkQueryResultFlags flags)
236 {
237 assert(dataSize >= stride * queryCount);
238
239 char *result_base = pData;
240 VkResult result = VK_SUCCESS;
241 for (uint32_t i = 0; i < queryCount; i++) {
242 uint32_t query = firstQuery + i;
243 struct query_slot *slot = slot_address(pool, query);
244 bool available = query_is_available(slot);
245 uint32_t result_count = get_result_count(pool);
246
247 if ((flags & VK_QUERY_RESULT_WAIT_BIT) && !available) {
248 VkResult wait_result = wait_for_available(device, pool, query);
249 if (wait_result != VK_SUCCESS)
250 return wait_result;
251 available = true;
252 } else if (!(flags & VK_QUERY_RESULT_PARTIAL_BIT) && !available) {
253 /* From the Vulkan 1.1.130 spec:
254 *
255 * If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
256 * both not set then no result values are written to pData for
257 * queries that are in the unavailable state at the time of the
258 * call, and vkGetQueryPoolResults returns VK_NOT_READY. However,
259 * availability state is still written to pData for those queries
260 * if VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set.
261 */
262 result = VK_NOT_READY;
263 if (!(flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)) {
264 result_base += stride;
265 continue;
266 }
267 }
268
269 for (uint32_t k = 0; k < result_count; k++) {
270 if (available)
271 write_query_value_cpu(result_base, k, slot->result.values[k], flags);
272 else if (flags & VK_QUERY_RESULT_PARTIAL_BIT)
273 /* From the Vulkan 1.1.130 spec:
274 *
275 * If VK_QUERY_RESULT_PARTIAL_BIT is set, VK_QUERY_RESULT_WAIT_BIT
276 * is not set, and the query’s status is unavailable, an
277 * intermediate result value between zero and the final result
278 * value is written to pData for that query.
279 *
280 * Just return 0 here for simplicity since it's a valid result.
281 */
282 write_query_value_cpu(result_base, k, 0, flags);
283 }
284
285 if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
286 /* From the Vulkan 1.1.130 spec:
287 *
288 * If VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set, the final
289 * integer value written for each query is non-zero if the query’s
290 * status was available or zero if the status was unavailable.
291 */
292 write_query_value_cpu(result_base, result_count, available, flags);
293
294 result_base += stride;
295 }
296 return result;
297 }
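/* Putting the loop above together, a hypothetical call for one transform
 * feedback query with VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WITH_AVAILABILITY_BIT
 * and stride = 3 * sizeof(uint64_t) would fill that query's slice of pData as:
 *
 *    [0] primitives written
 *    [1] primitives generated
 *    [2] availability (non-zero once the query has completed)
 */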
298
299 VkResult
300 tu_GetQueryPoolResults(VkDevice _device,
301 VkQueryPool queryPool,
302 uint32_t firstQuery,
303 uint32_t queryCount,
304 size_t dataSize,
305 void *pData,
306 VkDeviceSize stride,
307 VkQueryResultFlags flags)
308 {
309 TU_FROM_HANDLE(tu_device, device, _device);
310 TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
311 assert(firstQuery + queryCount <= pool->size);
312
313 if (tu_device_is_lost(device))
314 return VK_ERROR_DEVICE_LOST;
315
316 switch (pool->type) {
317 case VK_QUERY_TYPE_OCCLUSION:
318 case VK_QUERY_TYPE_TIMESTAMP:
319 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
320 return get_query_pool_results(device, pool, firstQuery, queryCount,
321 dataSize, pData, stride, flags);
322 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
323 unreachable("Unimplemented query type");
324 default:
325 assert(!"Invalid query type");
326 }
327 return VK_SUCCESS;
328 }
329
330 /* Copies a query value from one buffer to another from the GPU. */
331 static void
332 copy_query_value_gpu(struct tu_cmd_buffer *cmdbuf,
333 struct tu_cs *cs,
334 uint64_t src_iova,
335 uint64_t base_write_iova,
336 uint32_t offset,
337 VkQueryResultFlags flags) {
338 uint32_t element_size = flags & VK_QUERY_RESULT_64_BIT ?
339 sizeof(uint64_t) : sizeof(uint32_t);
340 uint64_t write_iova = base_write_iova + (offset * element_size);
341
342 tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
343 uint32_t mem_to_mem_flags = flags & VK_QUERY_RESULT_64_BIT ?
344 CP_MEM_TO_MEM_0_DOUBLE : 0;
345 tu_cs_emit(cs, mem_to_mem_flags);
346 tu_cs_emit_qw(cs, write_iova);
347 tu_cs_emit_qw(cs, src_iova);
348 }
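/* The CP_MEM_TO_MEM emitted above carries a 5-dword payload: one flags dword
 * (CP_MEM_TO_MEM_0_DOUBLE selects a 64-bit copy when VK_QUERY_RESULT_64_BIT is
 * requested), followed by the 64-bit destination IOVA and then the 64-bit
 * source IOVA. With a single source operand and no NEG_* flags it effectively
 * acts as a GPU-side copy of one value.
 */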
349
350 static void
351 emit_copy_query_pool_results(struct tu_cmd_buffer *cmdbuf,
352 struct tu_cs *cs,
353 struct tu_query_pool *pool,
354 uint32_t firstQuery,
355 uint32_t queryCount,
356 struct tu_buffer *buffer,
357 VkDeviceSize dstOffset,
358 VkDeviceSize stride,
359 VkQueryResultFlags flags)
360 {
361 /* From the Vulkan 1.1.130 spec:
362 *
363 * vkCmdCopyQueryPoolResults is guaranteed to see the effect of previous
364 * uses of vkCmdResetQueryPool in the same queue, without any additional
365 * synchronization.
366 *
367 * To ensure that previous writes to the available bit are coherent, first
368 * wait for all writes to complete.
369 */
370 tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
371
372 for (uint32_t i = 0; i < queryCount; i++) {
373 uint32_t query = firstQuery + i;
374 uint64_t available_iova = query_available_iova(pool, query);
375 uint64_t buffer_iova = tu_buffer_iova(buffer) + dstOffset + i * stride;
376 uint32_t result_count = get_result_count(pool);
377
378 /* Wait for the available bit to be set if executed with the
379 * VK_QUERY_RESULT_WAIT_BIT flag. */
380 if (flags & VK_QUERY_RESULT_WAIT_BIT) {
381 tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
382 tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
383 CP_WAIT_REG_MEM_0_POLL_MEMORY);
384 tu_cs_emit_qw(cs, available_iova);
385 tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0x1));
386 tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
387 tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));
388 }
389
390 for (uint32_t k = 0; k < result_count; k++) {
391 uint64_t result_iova = query_result_iova(pool, query, k);
392
393 if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
394 /* Unconditionally copying the bo->result into the buffer here is
395 * valid because we only set bo->result on vkCmdEndQuery. Thus, even
396 * if the query is unavailable, this will copy the correct partial
397 * value of 0.
398 */
399 copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
400 k /* offset */, flags);
401 } else {
402 /* Conditionally copy bo->result into the buffer based on whether the
403 * query is available.
404 *
405 * NOTE: For the conditional packets to be executed, CP_COND_EXEC
406 * tests that ADDR0 != 0 and ADDR1 < REF. The packet here simply tests
407 * that 0 < available < 2, aka available == 1.
408 */
409 tu_cs_reserve(cs, 7 + 6);
410 tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6);
411 tu_cs_emit_qw(cs, available_iova);
412 tu_cs_emit_qw(cs, available_iova);
413 tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2));
414 tu_cs_emit(cs, 6); /* Cond execute the next 6 DWORDS */
415
416 /* Start of conditional execution */
417 copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
418 k /* offset */, flags);
419 /* End of conditional execution */
420 }
421 }
422
423 if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
424 copy_query_value_gpu(cmdbuf, cs, available_iova, buffer_iova,
425 result_count /* offset */, flags);
426 }
427 }
428
429 tu_bo_list_add(&cmdbuf->bo_list, buffer->bo, MSM_SUBMIT_BO_WRITE);
430 }
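/* Note on the 6-dword budget given to CP_COND_EXEC above: the conditionally
 * executed region is exactly one copy_query_value_gpu() emission, i.e. a
 * CP_MEM_TO_MEM packet made of one pkt7 header dword plus its 5 payload
 * dwords.
 */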
431
432 void
433 tu_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,
434 VkQueryPool queryPool,
435 uint32_t firstQuery,
436 uint32_t queryCount,
437 VkBuffer dstBuffer,
438 VkDeviceSize dstOffset,
439 VkDeviceSize stride,
440 VkQueryResultFlags flags)
441 {
442 TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
443 TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
444 TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
445 struct tu_cs *cs = &cmdbuf->cs;
446 assert(firstQuery + queryCount <= pool->size);
447
448 switch (pool->type) {
449 case VK_QUERY_TYPE_OCCLUSION:
450 case VK_QUERY_TYPE_TIMESTAMP:
451 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
452 return emit_copy_query_pool_results(cmdbuf, cs, pool, firstQuery,
453 queryCount, buffer, dstOffset, stride, flags);
454 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
455 unreachable("Unimplemented query type");
456 default:
457 assert(!"Invalid query type");
458 }
459 }
460
461 static void
462 emit_reset_query_pool(struct tu_cmd_buffer *cmdbuf,
463 struct tu_query_pool *pool,
464 uint32_t firstQuery,
465 uint32_t queryCount)
466 {
467 struct tu_cs *cs = &cmdbuf->cs;
468
469 for (uint32_t i = 0; i < queryCount; i++) {
470 uint32_t query = firstQuery + i;
471
472 tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
473 tu_cs_emit_qw(cs, query_available_iova(pool, query));
474 tu_cs_emit_qw(cs, 0x0);
475
476 tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
477 tu_cs_emit_qw(cs, query_result_iova(pool, query, 0));
478 tu_cs_emit_qw(cs, 0x0);
479 tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
480 tu_cs_emit_qw(cs, query_result_iova(pool, query, 1));
481 tu_cs_emit_qw(cs, 0x0);
482 }
483 }
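/* Only the availability bit and the two result values are cleared here; the
 * begin/end snapshots are left untouched on the assumption that they are
 * always rewritten by the next vkCmdBeginQuery/vkCmdEndQuery pair before the
 * accumulation into slot->result reads them.
 */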
484
485 void
486 tu_CmdResetQueryPool(VkCommandBuffer commandBuffer,
487 VkQueryPool queryPool,
488 uint32_t firstQuery,
489 uint32_t queryCount)
490 {
491 TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
492 TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
493
494 switch (pool->type) {
495 case VK_QUERY_TYPE_TIMESTAMP:
496 case VK_QUERY_TYPE_OCCLUSION:
497 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
498 emit_reset_query_pool(cmdbuf, pool, firstQuery, queryCount);
499 break;
500 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
501 unreachable("Unimplemented query type");
502 default:
503 assert(!"Invalid query type");
504 }
505
506 tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
507 }
508
509 static void
510 emit_begin_occlusion_query(struct tu_cmd_buffer *cmdbuf,
511 struct tu_query_pool *pool,
512 uint32_t query)
513 {
514 /* From the Vulkan 1.1.130 spec:
515 *
516 * A query must begin and end inside the same subpass of a render pass
517 * instance, or must both begin and end outside of a render pass
518 * instance.
519 *
520 * Unlike on an immediate-mode renderer, Turnip renders all tiles on
521 * vkCmdEndRenderPass, not individually on each vkCmdDraw*. As such, if a
522 * query begins/ends inside the same subpass of a render pass, we need to
523 * record the packets on the secondary draw command stream. cmdbuf->draw_cs
524 * is then run on every tile during render, so we just need to accumulate
525 * sample counts in slot->result to compute the query result.
526 */
527 struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
528
529 uint64_t begin_iova = occlusion_query_iova(pool, query, begin);
530
531 tu_cs_emit_regs(cs,
532 A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));
533
534 tu_cs_emit_regs(cs,
535 A6XX_RB_SAMPLE_COUNT_ADDR_LO(begin_iova));
536
537 tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
538 tu_cs_emit(cs, ZPASS_DONE);
539 }
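/* The sequence above arms the sample counter copy (RB_SAMPLE_COUNT_CONTROL
 * .copy plus the destination address) and then fires ZPASS_DONE, which, as
 * used here, snapshots the current passed-sample count into slot->begin for
 * this query.
 */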
540
541 static void
542 emit_begin_xfb_query(struct tu_cmd_buffer *cmdbuf,
543 struct tu_query_pool *pool,
544 uint32_t query,
545 uint32_t stream_id)
546 {
547 struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
548 uint64_t begin_iova = primitive_query_iova(pool, query, begin[0], 0);
549
550 tu_cs_emit_regs(cs, A6XX_VPC_SO_STREAM_COUNTS_LO(begin_iova));
551 tu6_emit_event_write(cmdbuf, cs, WRITE_PRIMITIVE_COUNTS);
552 }
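/* The begin address always points at begin[0] regardless of stream_id: the
 * WRITE_PRIMITIVE_COUNTS event appears to dump the written/generated counters
 * for all four streams in one go, starting at the address programmed in
 * VPC_SO_STREAM_COUNTS_LO, which matches the begin[4]/end[4] layout of
 * struct primitive_query_slot.
 */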
553
554 void
555 tu_CmdBeginQuery(VkCommandBuffer commandBuffer,
556 VkQueryPool queryPool,
557 uint32_t query,
558 VkQueryControlFlags flags)
559 {
560 TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
561 TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
562 assert(query < pool->size);
563
564 switch (pool->type) {
565 case VK_QUERY_TYPE_OCCLUSION:
566 /* In freedreno, there is no implementation difference between
567 * GL_SAMPLES_PASSED and GL_ANY_SAMPLES_PASSED, so we can similarly
568 * ignore the VK_QUERY_CONTROL_PRECISE_BIT flag here.
569 */
570 emit_begin_occlusion_query(cmdbuf, pool, query);
571 break;
572 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
573 emit_begin_xfb_query(cmdbuf, pool, query, 0);
574 break;
575 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
576 case VK_QUERY_TYPE_TIMESTAMP:
577 unreachable("Unimplemented query type");
578 default:
579 assert(!"Invalid query type");
580 }
581
582 tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
583 }
584
585 void
586 tu_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer,
587 VkQueryPool queryPool,
588 uint32_t query,
589 VkQueryControlFlags flags,
590 uint32_t index)
591 {
592 TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
593 TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
594 assert(query < pool->size);
595
596 switch (pool->type) {
597 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
598 emit_begin_xfb_query(cmdbuf, pool, query, index);
599 break;
600 default:
601 assert(!"Invalid query type");
602 }
603
604 tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
605 }
606
607 static void
608 emit_end_occlusion_query(struct tu_cmd_buffer *cmdbuf,
609 struct tu_query_pool *pool,
610 uint32_t query)
611 {
612 /* Ending an occlusion query happens in a few steps:
613 * 1) Set the slot->end to UINT64_MAX.
614 * 2) Set up the SAMPLE_COUNT registers and trigger a CP_EVENT_WRITE to
615 * write the current sample count value into slot->end.
616 * 3) Since (2) is asynchronous, wait until slot->end is not equal to
617 * UINT64_MAX before continuing via CP_WAIT_REG_MEM.
618 * 4) Accumulate the results of the query (slot->end - slot->begin) into
619 * slot->result.
620 * 5) If vkCmdEndQuery is *not* called from within the scope of a render
621 * pass, set the slot's available bit since the query is now done.
622 * 6) If vkCmdEndQuery *is* called from within the scope of a render
623 * pass, we cannot mark as available yet since the commands in
624 * draw_cs are not run until vkCmdEndRenderPass.
625 */
626 const struct tu_render_pass *pass = cmdbuf->state.pass;
627 struct tu_cs *cs = pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
628
629 uint64_t available_iova = query_available_iova(pool, query);
630 uint64_t begin_iova = occlusion_query_iova(pool, query, begin);
631 uint64_t end_iova = occlusion_query_iova(pool, query, end);
632 uint64_t result_iova = query_result_iova(pool, query, 0);
633 tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
634 tu_cs_emit_qw(cs, end_iova);
635 tu_cs_emit_qw(cs, 0xffffffffffffffffull);
636
637 tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
638
639 tu_cs_emit_regs(cs,
640 A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));
641
642 tu_cs_emit_regs(cs,
643 A6XX_RB_SAMPLE_COUNT_ADDR_LO(end_iova));
644
645 tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
646 tu_cs_emit(cs, ZPASS_DONE);
647
648 tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
649 tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_NE) |
650 CP_WAIT_REG_MEM_0_POLL_MEMORY);
651 tu_cs_emit_qw(cs, end_iova);
652 tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0xffffffff));
653 tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
654 tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));
655
656 /* result (dst) = result (srcA) + end (srcB) - begin (srcC) */
657 tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
658 tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C);
659 tu_cs_emit_qw(cs, result_iova);
660 tu_cs_emit_qw(cs, result_iova);
661 tu_cs_emit_qw(cs, end_iova);
662 tu_cs_emit_qw(cs, begin_iova);
663
664 tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
665
666 if (pass)
667 /* Technically, queries should be tracked per-subpass, but here we track
668 * at the render pass level to simplify the code a bit. This is safe
669 * because the only commands that use the available bit are
670 * vkCmdCopyQueryPoolResults and vkCmdResetQueryPool, both of which
671 * cannot be invoked from inside a render pass scope.
672 */
673 cs = &cmdbuf->draw_epilogue_cs;
674
675 tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
676 tu_cs_emit_qw(cs, available_iova);
677 tu_cs_emit_qw(cs, 0x1);
678 }
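/* Because draw_cs is replayed once per tile when this is recorded inside a
 * render pass, the accumulating CP_MEM_TO_MEM above runs once per tile and
 * slot->result sums (end - begin) across all tiles. As a purely illustrative
 * example: with four tiles whose visible-sample deltas are 10, 0, 3 and 7,
 * slot->result ends up at 20 after the last tile.
 */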
679
680 static void
681 emit_end_xfb_query(struct tu_cmd_buffer *cmdbuf,
682 struct tu_query_pool *pool,
683 uint32_t query,
684 uint32_t stream_id)
685 {
686 struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
687
688 uint64_t end_iova = primitive_query_iova(pool, query, end[0], 0);
689 uint64_t result_written_iova = query_result_iova(pool, query, 0);
690 uint64_t result_generated_iova = query_result_iova(pool, query, 1);
691 uint64_t begin_written_iova = primitive_query_iova(pool, query, begin[stream_id], 0);
692 uint64_t begin_generated_iova = primitive_query_iova(pool, query, begin[stream_id], 1);
693 uint64_t end_written_iova = primitive_query_iova(pool, query, end[stream_id], 0);
694 uint64_t end_generated_iova = primitive_query_iova(pool, query, end[stream_id], 1);
695 uint64_t available_iova = query_available_iova(pool, query);
696
697 tu_cs_emit_regs(cs, A6XX_VPC_SO_STREAM_COUNTS_LO(end_iova));
698 tu6_emit_event_write(cmdbuf, cs, WRITE_PRIMITIVE_COUNTS);
699
700 tu_cs_emit_wfi(cs);
701 tu6_emit_event_write(cmdbuf, cs, CACHE_FLUSH_TS);
702
703 /* Set the count of written primitives */
704 tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
705 tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C |
706 CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES | 0x80000000);
707 tu_cs_emit_qw(cs, result_written_iova);
708 tu_cs_emit_qw(cs, result_written_iova);
709 tu_cs_emit_qw(cs, end_written_iova);
710 tu_cs_emit_qw(cs, begin_written_iova);
711
712 tu6_emit_event_write(cmdbuf, cs, CACHE_FLUSH_TS);
713
714 /* Set the count of generated primitives */
715 tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
716 tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C |
717 CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES | 0x80000000);
718 tu_cs_emit_qw(cs, result_generated_iova);
719 tu_cs_emit_qw(cs, result_generated_iova);
720 tu_cs_emit_qw(cs, end_generated_iova);
721 tu_cs_emit_qw(cs, begin_generated_iova);
722
723 /* Set the availability to 1 */
724 tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
725 tu_cs_emit_qw(cs, available_iova);
726 tu_cs_emit_qw(cs, 0x1);
727 }
728
729 /* Implement this bit of spec text from section 17.2 "Query Operation":
730 *
731 * If queries are used while executing a render pass instance that has
732 * multiview enabled, the query uses N consecutive query indices in the
733 * query pool (starting at query) where N is the number of bits set in the
734 * view mask in the subpass the query is used in. How the numerical
735 * results of the query are distributed among the queries is
736 * implementation-dependent. For example, some implementations may write
737 * each view’s results to a distinct query, while other implementations
738 * may write the total result to the first query and write zero to the
739 * other queries. However, the sum of the results in all the queries must
740 * accurately reflect the total result of the query summed over all views.
741 * Applications can sum the results from all the queries to compute the
742 * total result.
743 *
744 * Since we execute all views at once, we write zero to the other queries.
745 * Furthermore, because queries must be reset before use, and we set the
746 * result to 0 in vkCmdResetQueryPool(), we just need to mark it as available.
747 */
748
749 static void
750 handle_multiview_queries(struct tu_cmd_buffer *cmd,
751 struct tu_query_pool *pool,
752 uint32_t query)
753 {
754 if (!cmd->state.pass || !cmd->state.subpass->multiview_mask)
755 return;
756
757 unsigned views = util_bitcount(cmd->state.subpass->multiview_mask);
758 struct tu_cs *cs = &cmd->draw_epilogue_cs;
759
760 for (uint32_t i = 1; i < views; i++) {
761 tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
762 tu_cs_emit_qw(cs, query_available_iova(pool, query + i));
763 tu_cs_emit_qw(cs, 0x1);
764 }
765 }
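/* Hypothetical example of the behaviour described above: with a subpass view
 * mask of 0b0111 (N = 3) and vkCmdEndQuery(query = 4), query 4 receives the
 * total result for all three views while queries 5 and 6 are simply marked
 * available; their result values stay at the 0 written by the preceding
 * reset, so summing queries 4..6 still yields the correct total.
 */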
766
767 void
768 tu_CmdEndQuery(VkCommandBuffer commandBuffer,
769 VkQueryPool queryPool,
770 uint32_t query)
771 {
772 TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
773 TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
774 assert(query < pool->size);
775
776 switch (pool->type) {
777 case VK_QUERY_TYPE_OCCLUSION:
778 emit_end_occlusion_query(cmdbuf, pool, query);
779 break;
780 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
781 emit_end_xfb_query(cmdbuf, pool, query, 0);
782 break;
783 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
784 case VK_QUERY_TYPE_TIMESTAMP:
785 unreachable("Unimplemented query type");
786 default:
787 assert(!"Invalid query type");
788 }
789
790 handle_multiview_queries(cmdbuf, pool, query);
791
792 tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
793 }
794
795 void
796 tu_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer,
797 VkQueryPool queryPool,
798 uint32_t query,
799 uint32_t index)
800 {
801 TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
802 TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
803 assert(query < pool->size);
804
805 switch (pool->type) {
806 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
807 assert(index < 4);
808 emit_end_xfb_query(cmdbuf, pool, query, index);
809 break;
810 default:
811 assert(!"Invalid query type");
812 }
813
814 tu_bo_list_add(&cmdbuf->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
815 }
816
817 void
818 tu_CmdWriteTimestamp(VkCommandBuffer commandBuffer,
819 VkPipelineStageFlagBits pipelineStage,
820 VkQueryPool queryPool,
821 uint32_t query)
822 {
823 TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
824 TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
825
826 tu_bo_list_add(&cmd->bo_list, &pool->bo, MSM_SUBMIT_BO_WRITE);
827
828 /* Inside a render pass, just write the timestamp multiple times so that
829 * the user gets the last one if we use GMEM. There isn't really much
830 * better we can do, and this seems to be what the blob does too.
831 */
832 struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs;
833
834 /* Stages that will already have been executed by the time the CP executes
835 * the REG_TO_MEM. DrawIndirect parameters are read by the CP, so the draw
836 * indirect stage counts as top-of-pipe too.
837 */
838 VkPipelineStageFlags top_of_pipe_flags =
839 VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT |
840 VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT;
841
842 if (pipelineStage & ~top_of_pipe_flags) {
843 /* Execute a WFI so that all commands complete. Note that CP_REG_TO_MEM
844 * does CP_WAIT_FOR_ME internally, which will wait for the WFI to
845 * complete.
846 *
847 * Stalling the CP like this is really unfortunate, but I don't think
848 * there's a better solution that allows all 48 bits of precision
849 * because CP_EVENT_WRITE doesn't support 64-bit timestamps.
850 */
851 tu_cs_emit_wfi(cs);
852 }
853
854 tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
855 tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_CP_ALWAYS_ON_COUNTER_LO) |
856 CP_REG_TO_MEM_0_CNT(2) |
857 CP_REG_TO_MEM_0_64B);
858 tu_cs_emit_qw(cs, query_result_iova(pool, query, 0));
859
860 /* Only flag availability once the entire renderpass is done, similar to
861 * the begin/end path.
862 */
863 cs = cmd->state.pass ? &cmd->draw_epilogue_cs : &cmd->cs;
864
865 tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
866 tu_cs_emit_qw(cs, query_available_iova(pool, query));
867 tu_cs_emit_qw(cs, 0x1);
868
869 /* From the spec for vkCmdWriteTimestamp:
870 *
871 * If vkCmdWriteTimestamp is called while executing a render pass
872 * instance that has multiview enabled, the timestamp uses N consecutive
873 * query indices in the query pool (starting at query) where N is the
874 * number of bits set in the view mask of the subpass the command is
875 * executed in. The resulting query values are determined by an
876 * implementation-dependent choice of one of the following behaviors:
877 *
878 * - The first query is a timestamp value and (if more than one bit is
879 * set in the view mask) zero is written to the remaining queries.
880 * If two timestamps are written in the same subpass, the sum of the
881 * execution time of all views between those commands is the
882 * difference between the first query written by each command.
883 *
884 * - All N queries are timestamp values. If two timestamps are written
885 * in the same subpass, the sum of the execution time of all views
886 * between those commands is the sum of the difference between
887 * corresponding queries written by each command. The difference
888 * between corresponding queries may be the execution time of a
889 * single view.
890 *
891 * We execute all views in the same draw call, so we implement the first
892 * option, the same as regular queries.
893 */
894 handle_multiview_queries(cmd, pool, query);
895 }
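/* The raw value captured above is a tick count of the always-on counter, not
 * nanoseconds; assuming the usual 19.2 MHz always-on clock on a6xx, converting
 * it to time is left to the timestampPeriod the driver reports elsewhere
 * (roughly 52 ns per tick under that assumption).
 */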