/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <assert.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>

#include "anv_private.h"

#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"

VkResult genX(CreateQueryPool)(
    VkDevice _device,
    const VkQueryPoolCreateInfo* pCreateInfo,
    const VkAllocationCallbacks* pAllocator,
    VkQueryPool* pQueryPool)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   const struct anv_physical_device *pdevice = &device->instance->physicalDevice;
   struct anv_query_pool *pool;
   VkResult result;

   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);

   /* Query pool slots are made up of some number of 64-bit values packed
    * tightly together. The first 64-bit value is always the "available" bit
    * which is 0 when the query is unavailable and 1 when it is available.
    * The 64-bit values that follow are determined by the type of query.
    */
   uint32_t uint64s_per_slot = 1;

   VkQueryPipelineStatisticFlags pipeline_statistics = 0;
   switch (pCreateInfo->queryType) {
   case VK_QUERY_TYPE_OCCLUSION:
      /* Occlusion queries have two values: begin and end. */
      uint64s_per_slot += 2;
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
      /* Timestamps just have the one timestamp value */
      uint64s_per_slot += 1;
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      pipeline_statistics = pCreateInfo->pipelineStatistics;
      /* We're going to trust this field implicitly so we need to ensure that
       * no unhandled extension bits leak in.
       */
      pipeline_statistics &= ANV_PIPELINE_STATISTICS_MASK;

      /* Statistics queries have a begin and end snapshot for every statistic */
      uint64s_per_slot += 2 * util_bitcount(pipeline_statistics);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      /* Transform feedback queries are 4 values: begin/end for primitives
       * written and begin/end for primitives needed.
       */
      uint64s_per_slot += 4;
      break;
   default:
      assert(!"Invalid query type");
   }
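
   /* For illustration, the resulting slot layouts (in 64-bit values) are:
    *
    *    OCCLUSION:           { available, begin, end }              stride 24 B
    *    TIMESTAMP:           { available, timestamp }               stride 16 B
    *    PIPELINE_STATISTICS: { available, begin/end per statistic } stride (1 + 2 * N) * 8 B
    *    TRANSFORM_FEEDBACK:  { available, written begin/end,
    *                           needed begin/end }                   stride 40 B
    *
    * where N is the number of statistics enabled in pipelineStatistics.
    */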

   pool = vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
                    VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (pool == NULL)
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

   pool->type = pCreateInfo->queryType;
   pool->pipeline_statistics = pipeline_statistics;
   pool->stride = uint64s_per_slot * sizeof(uint64_t);
   pool->slots = pCreateInfo->queryCount;

   uint64_t size = pool->slots * pool->stride;
   result = anv_bo_init_new(&pool->bo, device, size);
   if (result != VK_SUCCESS)
      goto fail;

   if (pdevice->supports_48bit_addresses)
      pool->bo.flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS;

   if (pdevice->use_softpin)
      pool->bo.flags |= EXEC_OBJECT_PINNED;

   if (pdevice->has_exec_async)
      pool->bo.flags |= EXEC_OBJECT_ASYNC;

   anv_vma_alloc(device, &pool->bo);

   /* For query pools, we set the caching mode to I915_CACHING_CACHED. On LLC
    * platforms, this does nothing. On non-LLC platforms, this means snooping
    * which comes at a slight cost. However, the buffers aren't big, won't be
    * written frequently, and trying to handle the flushing manually without
    * doing too much flushing is extremely painful.
    */
   anv_gem_set_caching(device, pool->bo.gem_handle, I915_CACHING_CACHED);
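
   /* The snooped/LLC-coherent caching requested above is what allows
    * GetQueryPoolResults() to read results straight through the CPU mapping
    * created below, without any explicit cache flushing.
    */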

   pool->bo.map = anv_gem_mmap(device, pool->bo.gem_handle, 0, size, 0);

   *pQueryPool = anv_query_pool_to_handle(pool);

   return VK_SUCCESS;

 fail:
   vk_free2(&device->alloc, pAllocator, pool);

   return result;
}

void genX(DestroyQueryPool)(
    VkDevice _device,
    VkQueryPool _pool,
    const VkAllocationCallbacks* pAllocator)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_query_pool, pool, _pool);

   if (!pool)
      return;

   anv_gem_munmap(pool->bo.map, pool->bo.size);
   anv_vma_free(device, &pool->bo);
   anv_gem_close(device, pool->bo.gem_handle);
   vk_free2(&device->alloc, pAllocator, pool);
}

static struct anv_address
anv_query_address(struct anv_query_pool *pool, uint32_t query)
{
   return (struct anv_address) {
      .bo = &pool->bo,
      .offset = query * pool->stride,
   };
}

static void
cpu_write_query_result(void *dst_slot, VkQueryResultFlags flags,
                       uint32_t value_index, uint64_t result)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      uint64_t *dst64 = dst_slot;
      dst64[value_index] = result;
   } else {
      uint32_t *dst32 = dst_slot;
      dst32[value_index] = result;
   }
}

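/* The availability word is written by the GPU (PIPE_CONTROL with an
 * immediate-data post-sync op, or MI_STORE_DATA_IMM from CmdResetQueryPool)
 * while the CPU polls it through the coherent mapping, hence the volatile
 * read below.
 */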
static bool
query_is_available(uint64_t *slot)
{
   return *(volatile uint64_t *)slot;
}

static VkResult
wait_for_available(struct anv_device *device,
                   struct anv_query_pool *pool, uint64_t *slot)
{
   while (true) {
      if (query_is_available(slot))
         return VK_SUCCESS;

      int ret = anv_gem_busy(device, pool->bo.gem_handle);
      if (ret == 1) {
         /* The BO is still busy, keep waiting. */
         continue;
      } else if (ret == -1) {
         /* We don't know the real error. */
         return anv_device_set_lost(device, "gem wait failed: %m");
      } else {
         assert(ret == 0);
         /* The BO is no longer busy. */
         if (query_is_available(slot)) {
            return VK_SUCCESS;
         } else {
            VkResult status = anv_device_query_status(device);
            if (status != VK_SUCCESS)
               return status;

            /* If we haven't seen availability yet, then we never will. This
             * can only happen if we have a client error where they call
             * GetQueryPoolResults on a query that they haven't submitted to
             * the GPU yet. The spec allows us to do anything in this case,
             * but returning VK_SUCCESS doesn't seem right and we shouldn't
             * just keep spinning.
             */
            return VK_NOT_READY;
         }
      }
   }
}

VkResult genX(GetQueryPoolResults)(
    VkDevice _device,
    VkQueryPool queryPool,
    uint32_t firstQuery,
    uint32_t queryCount,
    size_t dataSize,
    void* pData,
    VkDeviceSize stride,
    VkQueryResultFlags flags)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   assert(pool->type == VK_QUERY_TYPE_OCCLUSION ||
          pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
          pool->type == VK_QUERY_TYPE_TIMESTAMP ||
          pool->type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT);

   if (anv_device_is_lost(device))
      return VK_ERROR_DEVICE_LOST;

   if (pData == NULL)
      return VK_SUCCESS;

   void *data_end = pData + dataSize;

   VkResult status = VK_SUCCESS;
   for (uint32_t i = 0; i < queryCount; i++) {
      uint64_t *slot = pool->bo.map + (firstQuery + i) * pool->stride;

      /* Availability is always at the start of the slot */
      bool available = slot[0];

      if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) {
         status = wait_for_available(device, pool, slot);
         if (status != VK_SUCCESS)
            return status;

         available = true;
      }

      /* From the Vulkan 1.0.42 spec:
       *
       *    "If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
       *    both not set then no result values are written to pData for
       *    queries that are in the unavailable state at the time of the call,
       *    and vkGetQueryPoolResults returns VK_NOT_READY. However,
       *    availability state is still written to pData for those queries if
       *    VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set."
       */
      bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT);

      uint32_t idx = 0;
      switch (pool->type) {
      case VK_QUERY_TYPE_OCCLUSION:
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]);
         idx++;
         break;

      case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
         uint32_t statistics = pool->pipeline_statistics;
         while (statistics) {
            uint32_t stat = u_bit_scan(&statistics);
            if (write_results) {
               uint64_t result = slot[idx * 2 + 2] - slot[idx * 2 + 1];

               /* WaDividePSInvocationCountBy4:HSW,BDW */
               if ((device->info.gen == 8 || device->info.is_haswell) &&
                   (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT)
                  result >>= 2;

               cpu_write_query_result(pData, flags, idx, result);
            }
            idx++;
         }
         assert(idx == util_bitcount(pool->pipeline_statistics));
         break;
      }

      case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]);
         idx++;
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[4] - slot[3]);
         idx++;
         break;

      case VK_QUERY_TYPE_TIMESTAMP:
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[1]);
         idx++;
         break;

      default:
         unreachable("invalid pool type");
      }

      if (!write_results)
         status = VK_NOT_READY;

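      /* Per the spec language quoted above, the availability value (when
       * requested) is written as one extra value immediately after the
       * query's result values, i.e. at index idx, and it is written even
       * when the result values themselves were not.
       */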
      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
         cpu_write_query_result(pData, flags, idx, available);

      pData += stride;
      if (pData >= data_end)
         break;
   }

   return status;
}

static void
emit_srm32(struct anv_batch *batch, struct anv_address addr, uint32_t reg)
{
   anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
      srm.MemoryAddress = addr;
      srm.RegisterAddress = reg;
   }
}

static void
emit_srm64(struct anv_batch *batch, struct anv_address addr, uint32_t reg)
{
   emit_srm32(batch, anv_address_add(addr, 0), reg + 0);
   emit_srm32(batch, anv_address_add(addr, 4), reg + 4);
}

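/* Write a snapshot of the pipeline's PS_DEPTH_COUNT (the running count of
 * samples that have passed the depth/stencil tests) to the given address.
 * Occlusion query results are the difference between an end and a begin
 * snapshot.
 */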
static void
emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer,
                    struct anv_address addr)
{
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType = DAT_PPGTT;
      pc.PostSyncOperation = WritePSDepthCount;
      pc.DepthStallEnable = true;
      pc.Address = addr;

      if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
         pc.CommandStreamerStallEnable = true;
   }
}

static void
emit_query_availability(struct anv_cmd_buffer *cmd_buffer,
                        struct anv_address addr)
{
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType = DAT_PPGTT;
      pc.PostSyncOperation = WriteImmediateData;
      pc.Address = addr;
      pc.ImmediateData = 1;
   }
}

/**
 * Goes through a series of consecutive query indices in the given pool
 * setting all element values to 0 and emitting them as available.
 */
static void
emit_zero_queries(struct anv_cmd_buffer *cmd_buffer,
                  struct anv_query_pool *pool,
                  uint32_t first_index, uint32_t num_queries)
{
   for (uint32_t i = 0; i < num_queries; i++) {
      struct anv_address slot_addr =
         anv_query_address(pool, first_index + i);
      genX(cmd_buffer_mi_memset)(cmd_buffer, anv_address_add(slot_addr, 8),
                                 0, pool->stride - 8);
      emit_query_availability(cmd_buffer, slot_addr);
   }
}

void genX(CmdResetQueryPool)(
    VkCommandBuffer commandBuffer,
    VkQueryPool queryPool,
    uint32_t firstQuery,
    uint32_t queryCount)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   for (uint32_t i = 0; i < queryCount; i++) {
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdm) {
         sdm.Address = anv_query_address(pool, firstQuery + i);
         sdm.ImmediateData = 0;
      }
   }
}

static const uint32_t vk_pipeline_stat_to_reg[] = {
   GENX(IA_VERTICES_COUNT_num),
   GENX(IA_PRIMITIVES_COUNT_num),
   GENX(VS_INVOCATION_COUNT_num),
   GENX(GS_INVOCATION_COUNT_num),
   GENX(GS_PRIMITIVES_COUNT_num),
   GENX(CL_INVOCATION_COUNT_num),
   GENX(CL_PRIMITIVES_COUNT_num),
   GENX(PS_INVOCATION_COUNT_num),
   GENX(HS_INVOCATION_COUNT_num),
   GENX(DS_INVOCATION_COUNT_num),
   GENX(CS_INVOCATION_COUNT_num),
};
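
/* The entries above are indexed by the bit position of the corresponding
 * VkQueryPipelineStatisticFlagBits flag (see the u_bit_scan() walks in this
 * file), so they must stay in flag-bit order. The STATIC_ASSERT below checks
 * that the table covers exactly ANV_PIPELINE_STATISTICS_MASK.
 */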

static void
emit_pipeline_stat(struct anv_cmd_buffer *cmd_buffer, uint32_t stat,
                   struct anv_address addr)
{
   STATIC_ASSERT(ANV_PIPELINE_STATISTICS_MASK ==
                 (1 << ARRAY_SIZE(vk_pipeline_stat_to_reg)) - 1);

   assert(stat < ARRAY_SIZE(vk_pipeline_stat_to_reg));
   emit_srm64(&cmd_buffer->batch, addr, vk_pipeline_stat_to_reg[stat]);
}

static void
emit_xfb_query(struct anv_cmd_buffer *cmd_buffer, uint32_t stream,
               struct anv_address addr)
{
   assert(stream < MAX_XFB_STREAMS);

   anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), lrm) {
      lrm.RegisterAddress = GENX(SO_NUM_PRIMS_WRITTEN0_num) + 0 + stream * 8;
      lrm.MemoryAddress = anv_address_add(addr, 0);
   }
   anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), lrm) {
      lrm.RegisterAddress = GENX(SO_NUM_PRIMS_WRITTEN0_num) + 4 + stream * 8;
      lrm.MemoryAddress = anv_address_add(addr, 4);
   }

   anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), lrm) {
      lrm.RegisterAddress = GENX(SO_PRIM_STORAGE_NEEDED0_num) + 0 + stream * 8;
      lrm.MemoryAddress = anv_address_add(addr, 16);
   }
   anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), lrm) {
      lrm.RegisterAddress = GENX(SO_PRIM_STORAGE_NEEDED0_num) + 4 + stream * 8;
      lrm.MemoryAddress = anv_address_add(addr, 20);
   }
}
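
/* emit_xfb_query() stores SO_NUM_PRIMS_WRITTEN[stream] at addr + 0 and
 * SO_PRIM_STORAGE_NEEDED[stream] at addr + 16. With begin snapshots taken at
 * slot offset 8 and end snapshots at slot offset 16 (see CmdBegin/EndQuery
 * below), the slot ends up as { available, written begin, written end,
 * needed begin, needed end }, matching the readback in GetQueryPoolResults.
 */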

void genX(CmdBeginQuery)(
    VkCommandBuffer commandBuffer,
    VkQueryPool queryPool,
    uint32_t query,
    VkQueryControlFlags flags)
{
   genX(CmdBeginQueryIndexedEXT)(commandBuffer, queryPool, query, flags, 0);
}

void genX(CmdBeginQueryIndexedEXT)(
    VkCommandBuffer commandBuffer,
    VkQueryPool queryPool,
    uint32_t query,
    VkQueryControlFlags flags,
    uint32_t index)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   struct anv_address query_addr = anv_query_address(pool, query);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 8));
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      /* TODO: This might only be necessary for certain stats */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      uint32_t statistics = pool->pipeline_statistics;
      uint32_t offset = 8;
      while (statistics) {
         uint32_t stat = u_bit_scan(&statistics);
         emit_pipeline_stat(cmd_buffer, stat,
                            anv_address_add(query_addr, offset));
         offset += 16;
      }
      break;
   }

   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
      emit_xfb_query(cmd_buffer, index, anv_address_add(query_addr, 8));
      break;

   default:
      unreachable("");
   }
}

void genX(CmdEndQuery)(
    VkCommandBuffer commandBuffer,
    VkQueryPool queryPool,
    uint32_t query)
{
   genX(CmdEndQueryIndexedEXT)(commandBuffer, queryPool, query, 0);
}

void genX(CmdEndQueryIndexedEXT)(
    VkCommandBuffer commandBuffer,
    VkQueryPool queryPool,
    uint32_t query,
    uint32_t index)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   struct anv_address query_addr = anv_query_address(pool, query);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 16));
      emit_query_availability(cmd_buffer, query_addr);
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      /* TODO: This might only be necessary for certain stats */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      uint32_t statistics = pool->pipeline_statistics;
      uint32_t offset = 16;
      while (statistics) {
         uint32_t stat = u_bit_scan(&statistics);
         emit_pipeline_stat(cmd_buffer, stat,
                            anv_address_add(query_addr, offset));
         offset += 16;
      }

      emit_query_availability(cmd_buffer, query_addr);
      break;
   }

   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      emit_xfb_query(cmd_buffer, index, anv_address_add(query_addr, 16));
      emit_query_availability(cmd_buffer, query_addr);
      break;

   default:
      unreachable("");
   }

   /* When multiview is active the spec requires that N consecutive query
    * indices are used, where N is the number of active views in the subpass.
    * The spec allows that we only write the results to one of the queries
    * but we still need to manage result availability for all the query indices.
    * Since we only emit a single query for all active views in the
    * first index, mark the other query indices as being already available
    * with result 0.
    */
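   /* For example, with view_mask = 0x7 (three active views), ending query N
    * here also marks queries N + 1 and N + 2 as available with all-zero
    * results via emit_zero_queries() below.
    */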
   if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
      const uint32_t num_queries =
         util_bitcount(cmd_buffer->state.subpass->view_mask);
      if (num_queries > 1)
         emit_zero_queries(cmd_buffer, pool, query + 1, num_queries - 1);
   }
}

#define TIMESTAMP 0x2358

void genX(CmdWriteTimestamp)(
    VkCommandBuffer commandBuffer,
    VkPipelineStageFlagBits pipelineStage,
    VkQueryPool queryPool,
    uint32_t query)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   struct anv_address query_addr = anv_query_address(pool, query);

   assert(pool->type == VK_QUERY_TYPE_TIMESTAMP);

   switch (pipelineStage) {
   case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
      emit_srm64(&cmd_buffer->batch, anv_address_add(query_addr, 8), TIMESTAMP);
      break;

   default:
      /* Everything else is bottom-of-pipe */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.DestinationAddressType = DAT_PPGTT;
         pc.PostSyncOperation = WriteTimestamp;
         pc.Address = anv_address_add(query_addr, 8);

         if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
            pc.CommandStreamerStallEnable = true;
      }
      break;
   }

   emit_query_availability(cmd_buffer, query_addr);

   /* When multiview is active the spec requires that N consecutive query
    * indices are used, where N is the number of active views in the subpass.
    * The spec allows that we only write the results to one of the queries
    * but we still need to manage result availability for all the query indices.
    * Since we only emit a single query for all active views in the
    * first index, mark the other query indices as being already available
    * with result 0.
    */
   if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
      const uint32_t num_queries =
         util_bitcount(cmd_buffer->state.subpass->view_mask);
      if (num_queries > 1)
         emit_zero_queries(cmd_buffer, pool, query + 1, num_queries - 1);
   }
}

#if GEN_GEN > 7 || GEN_IS_HASWELL

static uint32_t
mi_alu(uint32_t opcode, uint32_t operand1, uint32_t operand2)
{
   struct GENX(MI_MATH_ALU_INSTRUCTION) instr = {
      .ALUOpcode = opcode,
      .Operand1 = operand1,
      .Operand2 = operand2,
   };

   uint32_t dw;
   GENX(MI_MATH_ALU_INSTRUCTION_pack)(NULL, &dw, &instr);

   return dw;
}

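/* The command streamer's general-purpose registers used as MI_MATH operands.
 * GPR n is 64 bits wide; its low dword sits at register offset 0x2600 + n * 8
 * and its high dword 4 bytes above that (hence the CS_GPR(n) + 4 accesses
 * below).
 */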
#define CS_GPR(n) (0x2600 + (n) * 8)

static void
emit_load_alu_reg_u64(struct anv_batch *batch, uint32_t reg,
                      struct anv_address addr)
{
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
      lrm.RegisterAddress = reg;
      lrm.MemoryAddress = anv_address_add(addr, 0);
   }
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
      lrm.RegisterAddress = reg + 4;
      lrm.MemoryAddress = anv_address_add(addr, 4);
   }
}

static void
emit_load_alu_reg_imm32(struct anv_batch *batch, uint32_t reg, uint32_t imm)
{
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
      lri.RegisterOffset = reg;
      lri.DataDWord = imm;
   }
}

static void
emit_load_alu_reg_imm64(struct anv_batch *batch, uint32_t reg, uint64_t imm)
{
   emit_load_alu_reg_imm32(batch, reg, (uint32_t)imm);
   emit_load_alu_reg_imm32(batch, reg + 4, (uint32_t)(imm >> 32));
}

static void
emit_load_alu_reg_reg32(struct anv_batch *batch, uint32_t src, uint32_t dst)
{
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_REG), lrr) {
      lrr.SourceRegisterAddress = src;
      lrr.DestinationRegisterAddress = dst;
   }
}

/*
 * GPR0 = GPR0 & ((1ull << n) - 1);
 */
static void
keep_gpr0_lower_n_bits(struct anv_batch *batch, uint32_t n)
{
   assert(n < 64);
   emit_load_alu_reg_imm64(batch, CS_GPR(1), (1ull << n) - 1);

   uint32_t *dw = anv_batch_emitn(batch, 5, GENX(MI_MATH));
   if (!dw) {
      anv_batch_set_error(batch, VK_ERROR_OUT_OF_HOST_MEMORY);
      return;
   }

   dw[1] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCA, MI_ALU_REG0);
   dw[2] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCB, MI_ALU_REG1);
   dw[3] = mi_alu(MI_ALU_AND, 0, 0);
   dw[4] = mi_alu(MI_ALU_STORE, MI_ALU_REG0, MI_ALU_ACCU);
}

/*
 * GPR0 = GPR0 << 30;
 */
static void
shl_gpr0_by_30_bits(struct anv_batch *batch)
{
   /* First we mask 34 bits of GPR0 to prevent overflow */
   keep_gpr0_lower_n_bits(batch, 34);

   const uint32_t outer_count = 5;
   const uint32_t inner_count = 6;
   STATIC_ASSERT(outer_count * inner_count == 30);
   const uint32_t cmd_len = 1 + inner_count * 4;

   /* We'll emit 5 commands, each shifting GPR0 left by 6 bits, for a total of
    * 30 left shifts.
    */
   for (int o = 0; o < outer_count; o++) {
      /* Submit one MI_MATH to shift left by 6 bits */
      uint32_t *dw = anv_batch_emitn(batch, cmd_len, GENX(MI_MATH));
      if (!dw) {
         anv_batch_set_error(batch, VK_ERROR_OUT_OF_HOST_MEMORY);
         return;
      }

      dw++;
      for (int i = 0; i < inner_count; i++, dw += 4) {
         dw[0] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCA, MI_ALU_REG0);
         dw[1] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCB, MI_ALU_REG0);
         dw[2] = mi_alu(MI_ALU_ADD, 0, 0);
         dw[3] = mi_alu(MI_ALU_STORE, MI_ALU_REG0, MI_ALU_ACCU);
      }
   }
}
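
/* Each LOAD/LOAD/ADD/STORE group above computes GPR0 = GPR0 + GPR0, i.e. it
 * doubles GPR0, which is a left shift by one bit. Six doublings per MI_MATH
 * times five MI_MATH commands yields the shift by 30.
 */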

/*
 * GPR0 = GPR0 >> 2;
 *
 * Note that the upper 30 bits of GPR are lost!
 */
static void
shr_gpr0_by_2_bits(struct anv_batch *batch)
{
   shl_gpr0_by_30_bits(batch);
   emit_load_alu_reg_reg32(batch, CS_GPR(0) + 4, CS_GPR(0));
   emit_load_alu_reg_imm32(batch, CS_GPR(0) + 4, 0);
}
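
/* Why this works: after the left shift by 30, bits [2..33] of the original
 * value end up in the high dword of GPR0, so high_dword == original >> 2 for
 * any value that fits in 34 bits. Copying that dword into the low dword and
 * zeroing the high dword therefore divides GPR0 by 4, which is what
 * WaDividePSInvocationCountBy4 needs in CmdCopyQueryPoolResults below.
 */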

static void
gpu_write_query_result(struct anv_batch *batch,
                       struct anv_address dst_addr,
                       VkQueryResultFlags flags,
                       uint32_t value_index, uint32_t reg)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      emit_srm64(batch, anv_address_add(dst_addr, value_index * 8), reg);
   } else {
      emit_srm32(batch, anv_address_add(dst_addr, value_index * 4), reg);
   }
}

static void
compute_query_result(struct anv_batch *batch, uint32_t dst_reg,
                     struct anv_address addr)
{
   emit_load_alu_reg_u64(batch, CS_GPR(0), anv_address_add(addr, 0));
   emit_load_alu_reg_u64(batch, CS_GPR(1), anv_address_add(addr, 8));

   /* FIXME: We need to clamp the result for 32 bit. */

   uint32_t *dw = anv_batch_emitn(batch, 5, GENX(MI_MATH));
   if (!dw) {
      anv_batch_set_error(batch, VK_ERROR_OUT_OF_HOST_MEMORY);
      return;
   }

   dw[1] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCA, MI_ALU_REG1);
   dw[2] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCB, MI_ALU_REG0);
   dw[3] = mi_alu(MI_ALU_SUB, 0, 0);
   dw[4] = mi_alu(MI_ALU_STORE, dst_reg, MI_ALU_ACCU);
}
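
/* compute_query_result() loads the begin value at addr into GPR0 and the end
 * value at addr + 8 into GPR1, then stores ACCU = GPR1 - GPR0 into dst_reg:
 * the same end - begin subtraction the CPU path does in GetQueryPoolResults.
 */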

void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer commandBuffer,
    VkQueryPool queryPool,
    uint32_t firstQuery,
    uint32_t queryCount,
    VkBuffer destBuffer,
    VkDeviceSize destOffset,
    VkDeviceSize destStride,
    VkQueryResultFlags flags)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);

   /* If render target writes are ongoing, request a render target cache flush
    * to ensure proper ordering of the commands from the 3d pipe and the
    * command streamer.
    */
   if (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_RENDER_TARGET_BUFFER_WRITES) {
      cmd_buffer->state.pending_pipe_bits |=
         ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
   }

   if ((flags & VK_QUERY_RESULT_WAIT_BIT) ||
       (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_FLUSH_BITS)) {
      cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT;
      genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
   }

   struct anv_address dest_addr = anv_address_add(buffer->address, destOffset);
   for (uint32_t i = 0; i < queryCount; i++) {
      struct anv_address query_addr = anv_query_address(pool, firstQuery + i);
      uint32_t idx = 0;
      switch (pool->type) {
      case VK_QUERY_TYPE_OCCLUSION:
         compute_query_result(&cmd_buffer->batch, MI_ALU_REG2,
                              anv_address_add(query_addr, 8));
         gpu_write_query_result(&cmd_buffer->batch, dest_addr,
                                flags, idx++, CS_GPR(2));
         break;

      case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
         uint32_t statistics = pool->pipeline_statistics;
         while (statistics) {
            uint32_t stat = u_bit_scan(&statistics);

            compute_query_result(&cmd_buffer->batch, MI_ALU_REG0,
                                 anv_address_add(query_addr, idx * 16 + 8));

            /* WaDividePSInvocationCountBy4:HSW,BDW */
            if ((cmd_buffer->device->info.gen == 8 ||
                 cmd_buffer->device->info.is_haswell) &&
                (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) {
               shr_gpr0_by_2_bits(&cmd_buffer->batch);
            }

            gpu_write_query_result(&cmd_buffer->batch, dest_addr,
                                   flags, idx++, CS_GPR(0));
         }
         assert(idx == util_bitcount(pool->pipeline_statistics));
         break;
      }

      case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
         compute_query_result(&cmd_buffer->batch, MI_ALU_REG2,
                              anv_address_add(query_addr, 8));
         gpu_write_query_result(&cmd_buffer->batch, dest_addr,
                                flags, idx++, CS_GPR(2));
         compute_query_result(&cmd_buffer->batch, MI_ALU_REG2,
                              anv_address_add(query_addr, 24));
         gpu_write_query_result(&cmd_buffer->batch, dest_addr,
                                flags, idx++, CS_GPR(2));
         break;

      case VK_QUERY_TYPE_TIMESTAMP:
         emit_load_alu_reg_u64(&cmd_buffer->batch,
                               CS_GPR(2), anv_address_add(query_addr, 8));
         gpu_write_query_result(&cmd_buffer->batch, dest_addr,
                                flags, 0, CS_GPR(2));
         break;

      default:
         unreachable("unhandled query type");
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         emit_load_alu_reg_u64(&cmd_buffer->batch, CS_GPR(0), query_addr);
         gpu_write_query_result(&cmd_buffer->batch, dest_addr,
                                flags, idx, CS_GPR(0));
      }

      dest_addr = anv_address_add(dest_addr, destStride);
   }
}

#else
void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer commandBuffer,
    VkQueryPool queryPool,
    uint32_t firstQuery,
    uint32_t queryCount,
    VkBuffer destBuffer,
    VkDeviceSize destOffset,
    VkDeviceSize destStride,
    VkQueryResultFlags flags)
{
   anv_finishme("Queries not yet supported on Ivy Bridge");
}
#endif