anv/query: handle more cases of 'out of host memory'
src/intel/vulkan/genX_query.c
/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <assert.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>

#include "anv_private.h"

#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"

VkResult genX(CreateQueryPool)(
    VkDevice _device,
    const VkQueryPoolCreateInfo* pCreateInfo,
    const VkAllocationCallbacks* pAllocator,
    VkQueryPool* pQueryPool)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   struct anv_query_pool *pool;
   VkResult result;

   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);

   /* Query pool slots are made up of some number of 64-bit values packed
    * tightly together. The first 64-bit value is always the "available" bit
    * which is 0 when the query is unavailable and 1 when it is available.
    * The 64-bit values that follow are determined by the type of query.
    */
   uint32_t uint64s_per_slot = 1;

   VkQueryPipelineStatisticFlags pipeline_statistics = 0;
   switch (pCreateInfo->queryType) {
   case VK_QUERY_TYPE_OCCLUSION:
      /* Occlusion queries have two values: begin and end. */
      uint64s_per_slot += 2;
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
      /* Timestamps just have the one timestamp value */
      uint64s_per_slot += 1;
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      pipeline_statistics = pCreateInfo->pipelineStatistics;
      /* We're going to trust this field implicitly so we need to ensure that
       * no unhandled extension bits leak in.
       */
      pipeline_statistics &= ANV_PIPELINE_STATISTICS_MASK;

      /* Statistics queries have a begin and an end value for every statistic */
      uint64s_per_slot += 2 * _mesa_bitcount(pipeline_statistics);
      break;
   default:
      assert(!"Invalid query type");
   }
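   /* Resulting slot layouts, as 64-bit words in order:
    *   occlusion:  { available, begin depth count, end depth count }
    *   timestamp:  { available, timestamp }
    *   statistics: { available, then a (begin, end) pair for each enabled
    *                 statistic, in ascending bit order }
    */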

   pool = vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
                    VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (pool == NULL)
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

   pool->type = pCreateInfo->queryType;
   pool->pipeline_statistics = pipeline_statistics;
   pool->stride = uint64s_per_slot * sizeof(uint64_t);
   pool->slots = pCreateInfo->queryCount;

   uint64_t size = pool->slots * pool->stride;
   result = anv_bo_init_new(&pool->bo, device, size);
   if (result != VK_SUCCESS)
      goto fail;

   /* For query pools, we set the caching mode to I915_CACHING_CACHED. On LLC
    * platforms, this does nothing. On non-LLC platforms, this means snooping
    * which comes at a slight cost. However, the buffers aren't big, won't be
    * written frequently, and trying to handle the flushing manually without
    * doing too much flushing is extremely painful.
    */
   anv_gem_set_caching(device, pool->bo.gem_handle, I915_CACHING_CACHED);

   pool->bo.map = anv_gem_mmap(device, pool->bo.gem_handle, 0, size, 0);

   *pQueryPool = anv_query_pool_to_handle(pool);

   return VK_SUCCESS;

fail:
   vk_free2(&device->alloc, pAllocator, pool);

   return result;
}

void genX(DestroyQueryPool)(
    VkDevice _device,
    VkQueryPool _pool,
    const VkAllocationCallbacks* pAllocator)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_query_pool, pool, _pool);

   if (!pool)
      return;

   anv_gem_munmap(pool->bo.map, pool->bo.size);
   anv_gem_close(device, pool->bo.gem_handle);
   vk_free2(&device->alloc, pAllocator, pool);
}

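/* Write a single query result value into the client-provided buffer, as
 * either a 32-bit or a 64-bit element depending on VK_QUERY_RESULT_64_BIT.
 */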
static void
cpu_write_query_result(void *dst_slot, VkQueryResultFlags flags,
                       uint32_t value_index, uint64_t result)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      uint64_t *dst64 = dst_slot;
      dst64[value_index] = result;
   } else {
      uint32_t *dst32 = dst_slot;
      dst32[value_index] = result;
   }
}

static bool
query_is_available(uint64_t *slot)
{
   return *(volatile uint64_t *)slot;
}

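/* Spin until the query's availability word becomes non-zero.  If the pool's
 * BO goes idle without availability being written, report device loss or
 * VK_NOT_READY rather than spinning forever.
 */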
static VkResult
wait_for_available(struct anv_device *device,
                   struct anv_query_pool *pool, uint64_t *slot)
{
   while (true) {
      if (query_is_available(slot))
         return VK_SUCCESS;

      int ret = anv_gem_busy(device, pool->bo.gem_handle);
      if (ret == 1) {
         /* The BO is still busy, keep waiting. */
         continue;
      } else if (ret == -1) {
         /* We don't know the real error. */
         device->lost = true;
         return vk_errorf(VK_ERROR_DEVICE_LOST, "gem busy failed: %m");
      } else {
         assert(ret == 0);
         /* The BO is no longer busy. */
         if (query_is_available(slot)) {
            return VK_SUCCESS;
         } else {
            VkResult status = anv_device_query_status(device);
            if (status != VK_SUCCESS)
               return status;

            /* If we haven't seen availability yet, then we never will.  This
             * can only happen if we have a client error where they call
             * GetQueryPoolResults on a query that they haven't submitted to
             * the GPU yet.  The spec allows us to do anything in this case,
             * but returning VK_SUCCESS doesn't seem right and we shouldn't
             * just keep spinning.
             */
            return VK_NOT_READY;
         }
      }
   }
}

VkResult genX(GetQueryPoolResults)(
    VkDevice _device,
    VkQueryPool queryPool,
    uint32_t firstQuery,
    uint32_t queryCount,
    size_t dataSize,
    void* pData,
    VkDeviceSize stride,
    VkQueryResultFlags flags)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   assert(pool->type == VK_QUERY_TYPE_OCCLUSION ||
          pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
          pool->type == VK_QUERY_TYPE_TIMESTAMP);

   if (unlikely(device->lost))
      return VK_ERROR_DEVICE_LOST;

   if (pData == NULL)
      return VK_SUCCESS;

   void *data_end = pData + dataSize;

   VkResult status = VK_SUCCESS;
   for (uint32_t i = 0; i < queryCount; i++) {
      uint64_t *slot = pool->bo.map + (firstQuery + i) * pool->stride;

      /* Availability is always at the start of the slot */
      bool available = slot[0];

      if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) {
         status = wait_for_available(device, pool, slot);
         if (status != VK_SUCCESS)
            return status;

         available = true;
      }

      /* From the Vulkan 1.0.42 spec:
       *
       *    "If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
       *    both not set then no result values are written to pData for
       *    queries that are in the unavailable state at the time of the call,
       *    and vkGetQueryPoolResults returns VK_NOT_READY. However,
       *    availability state is still written to pData for those queries if
       *    VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set."
       */
      bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT);

      if (write_results) {
         switch (pool->type) {
         case VK_QUERY_TYPE_OCCLUSION: {
            cpu_write_query_result(pData, flags, 0, slot[2] - slot[1]);
            break;
         }

         case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
            uint32_t statistics = pool->pipeline_statistics;
            uint32_t idx = 0;
            while (statistics) {
               uint32_t stat = u_bit_scan(&statistics);
               uint64_t result = slot[idx * 2 + 2] - slot[idx * 2 + 1];

               /* WaDividePSInvocationCountBy4:HSW,BDW */
               if ((device->info.gen == 8 || device->info.is_haswell) &&
                   (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT)
                  result >>= 2;

               cpu_write_query_result(pData, flags, idx, result);

               idx++;
            }
            assert(idx == _mesa_bitcount(pool->pipeline_statistics));
            break;
         }

         case VK_QUERY_TYPE_TIMESTAMP: {
            cpu_write_query_result(pData, flags, 0, slot[1]);
            break;
         }
         default:
            unreachable("invalid pool type");
         }
      } else {
         status = VK_NOT_READY;
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         uint32_t idx = (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) ?
                        _mesa_bitcount(pool->pipeline_statistics) : 1;
         cpu_write_query_result(pData, flags, idx, available);
      }

      pData += stride;
      if (pData >= data_end)
         break;
   }

   return status;
}

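/* Snapshot the current PS_DEPTH_COUNT (depth-test pass count) into the given
 * BO offset via a depth-stalling PIPE_CONTROL.  Used for the begin and end
 * values of occlusion queries.
 */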
static void
emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer,
                    struct anv_bo *bo, uint32_t offset)
{
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType = DAT_PPGTT;
      pc.PostSyncOperation = WritePSDepthCount;
      pc.DepthStallEnable = true;
      pc.Address = (struct anv_address) { bo, offset };

      if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
         pc.CommandStreamerStallEnable = true;
   }
}

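/* Mark a query slot as available by writing 1 to its availability word via a
 * PIPE_CONTROL post-sync write.
 */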
static void
emit_query_availability(struct anv_cmd_buffer *cmd_buffer,
                        struct anv_bo *bo, uint32_t offset)
{
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType = DAT_PPGTT;
      pc.PostSyncOperation = WriteImmediateData;
      pc.Address = (struct anv_address) { bo, offset };
      pc.ImmediateData = 1;
   }
}

void genX(CmdResetQueryPool)(
    VkCommandBuffer commandBuffer,
    VkQueryPool queryPool,
    uint32_t firstQuery,
    uint32_t queryCount)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

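   /* Resetting only needs to clear the availability word at the start of
    * each slot; the result values are rewritten by the next Begin/End.
    */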
   for (uint32_t i = 0; i < queryCount; i++) {
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdm) {
         sdm.Address = (struct anv_address) {
            .bo = &pool->bo,
            .offset = (firstQuery + i) * pool->stride,
         };
         sdm.ImmediateData = 0;
      }
   }
}

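/* MMIO addresses of the pipeline-statistics counter registers, indexed by
 * the bit position of the corresponding VkQueryPipelineStatisticFlagBits
 * flag (see the STATIC_ASSERT against ANV_PIPELINE_STATISTICS_MASK below).
 */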
static const uint32_t vk_pipeline_stat_to_reg[] = {
   GENX(IA_VERTICES_COUNT_num),
   GENX(IA_PRIMITIVES_COUNT_num),
   GENX(VS_INVOCATION_COUNT_num),
   GENX(GS_INVOCATION_COUNT_num),
   GENX(GS_PRIMITIVES_COUNT_num),
   GENX(CL_INVOCATION_COUNT_num),
   GENX(CL_PRIMITIVES_COUNT_num),
   GENX(PS_INVOCATION_COUNT_num),
   GENX(HS_INVOCATION_COUNT_num),
   GENX(DS_INVOCATION_COUNT_num),
   GENX(CS_INVOCATION_COUNT_num),
};

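/* Capture the 64-bit counter for the statistic at bit index `stat` into
 * bo+offset as two 32-bit MI_STORE_REGISTER_MEM writes (low then high dword).
 */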
static void
emit_pipeline_stat(struct anv_cmd_buffer *cmd_buffer, uint32_t stat,
                   struct anv_bo *bo, uint32_t offset)
{
   STATIC_ASSERT(ANV_PIPELINE_STATISTICS_MASK ==
                 (1 << ARRAY_SIZE(vk_pipeline_stat_to_reg)) - 1);

   assert(stat < ARRAY_SIZE(vk_pipeline_stat_to_reg));
   uint32_t reg = vk_pipeline_stat_to_reg[stat];

   anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), lrm) {
      lrm.RegisterAddress = reg;
      lrm.MemoryAddress = (struct anv_address) { bo, offset };
   }
   anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), lrm) {
      lrm.RegisterAddress = reg + 4;
      lrm.MemoryAddress = (struct anv_address) { bo, offset + 4 };
   }
}

void genX(CmdBeginQuery)(
    VkCommandBuffer commandBuffer,
    VkQueryPool queryPool,
    uint32_t query,
    VkQueryControlFlags flags)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   /* Workaround: When meta uses the pipeline with the VS disabled, it seems
    * that the pipelining of the depth write breaks.  What we see is that
    * samples from the render-pass clear leak into the first query
    * immediately after the clear.  Doing a PIPE_CONTROL with a post-sync
    * operation and DepthStallEnable seems to work around the issue.
    */
   if (cmd_buffer->state.need_query_wa) {
      cmd_buffer->state.need_query_wa = false;
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.DepthCacheFlushEnable = true;
         pc.DepthStallEnable = true;
      }
   }

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_ps_depth_count(cmd_buffer, &pool->bo, query * pool->stride + 8);
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      /* TODO: This might only be necessary for certain stats */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      uint32_t statistics = pool->pipeline_statistics;
      uint32_t offset = query * pool->stride + 8;
      while (statistics) {
         uint32_t stat = u_bit_scan(&statistics);
         emit_pipeline_stat(cmd_buffer, stat, &pool->bo, offset);
         offset += 16;
      }
      break;
   }

   default:
      unreachable("");
   }
}

void genX(CmdEndQuery)(
    VkCommandBuffer commandBuffer,
    VkQueryPool queryPool,
    uint32_t query)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_ps_depth_count(cmd_buffer, &pool->bo, query * pool->stride + 16);
      emit_query_availability(cmd_buffer, &pool->bo, query * pool->stride);
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      /* TODO: This might only be necessary for certain stats */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      uint32_t statistics = pool->pipeline_statistics;
      uint32_t offset = query * pool->stride + 16;
      while (statistics) {
         uint32_t stat = u_bit_scan(&statistics);
         emit_pipeline_stat(cmd_buffer, stat, &pool->bo, offset);
         offset += 16;
      }

      emit_query_availability(cmd_buffer, &pool->bo, query * pool->stride);
      break;
   }

   default:
      unreachable("");
   }
}

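/* MMIO address of the command streamer's TIMESTAMP register, read directly
 * with MI_STORE_REGISTER_MEM for top-of-pipe timestamps below.
 */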
#define TIMESTAMP 0x2358

void genX(CmdWriteTimestamp)(
    VkCommandBuffer commandBuffer,
    VkPipelineStageFlagBits pipelineStage,
    VkQueryPool queryPool,
    uint32_t query)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   uint32_t offset = query * pool->stride;

   assert(pool->type == VK_QUERY_TYPE_TIMESTAMP);

   switch (pipelineStage) {
   case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
         srm.RegisterAddress = TIMESTAMP;
         srm.MemoryAddress = (struct anv_address) { &pool->bo, offset + 8 };
      }
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
         srm.RegisterAddress = TIMESTAMP + 4;
         srm.MemoryAddress = (struct anv_address) { &pool->bo, offset + 12 };
      }
      break;

   default:
      /* Everything else is bottom-of-pipe */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.DestinationAddressType = DAT_PPGTT;
         pc.PostSyncOperation = WriteTimestamp;
         pc.Address = (struct anv_address) { &pool->bo, offset + 8 };

         if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
            pc.CommandStreamerStallEnable = true;
      }
      break;
   }

   emit_query_availability(cmd_buffer, &pool->bo, offset);
}

#if GEN_GEN > 7 || GEN_IS_HASWELL

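/* Pack a single MI_MATH ALU instruction (opcode plus two operands) into the
 * DWord layout expected by the MI_MATH command.
 */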
static inline uint32_t
mi_alu(uint32_t opcode, uint32_t operand1, uint32_t operand2)
{
   struct GENX(MI_MATH_ALU_INSTRUCTION) instr = {
      .ALUOpcode = opcode,
      .Operand1 = operand1,
      .Operand2 = operand2,
   };

   uint32_t dw;
   GENX(MI_MATH_ALU_INSTRUCTION_pack)(NULL, &dw, &instr);

   return dw;
}

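/* MMIO address of command streamer general-purpose register n.  These are
 * 64-bit registers starting at 0x2600 and serve as the operands of MI_MATH.
 */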
#define CS_GPR(n) (0x2600 + (n) * 8)

static void
emit_load_alu_reg_u64(struct anv_batch *batch, uint32_t reg,
                      struct anv_bo *bo, uint32_t offset)
{
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
      lrm.RegisterAddress = reg;
      lrm.MemoryAddress = (struct anv_address) { bo, offset };
   }
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
      lrm.RegisterAddress = reg + 4;
      lrm.MemoryAddress = (struct anv_address) { bo, offset + 4 };
   }
}

static void
emit_load_alu_reg_imm32(struct anv_batch *batch, uint32_t reg, uint32_t imm)
{
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
      lri.RegisterOffset = reg;
      lri.DataDWord = imm;
   }
}

static void
emit_load_alu_reg_imm64(struct anv_batch *batch, uint32_t reg, uint64_t imm)
{
   emit_load_alu_reg_imm32(batch, reg, (uint32_t)imm);
   emit_load_alu_reg_imm32(batch, reg + 4, (uint32_t)(imm >> 32));
}

static void
emit_load_alu_reg_reg32(struct anv_batch *batch, uint32_t src, uint32_t dst)
{
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_REG), lrr) {
      lrr.SourceRegisterAddress = src;
      lrr.DestinationRegisterAddress = dst;
   }
}

/*
 * GPR0 = GPR0 & ((1ull << n) - 1);
 */
static void
keep_gpr0_lower_n_bits(struct anv_batch *batch, uint32_t n)
{
   assert(n < 64);
   emit_load_alu_reg_imm64(batch, CS_GPR(1), (1ull << n) - 1);

   uint32_t *dw = anv_batch_emitn(batch, 5, GENX(MI_MATH));
   if (!dw) {
      anv_batch_set_error(batch, VK_ERROR_OUT_OF_HOST_MEMORY);
      return;
   }

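   /* dw[0] is the MI_MATH command header written by anv_batch_emitn(); the
    * four-instruction ALU program starts at dw[1].
    */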
   dw[1] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCA, MI_ALU_REG0);
   dw[2] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCB, MI_ALU_REG1);
   dw[3] = mi_alu(MI_ALU_AND, 0, 0);
   dw[4] = mi_alu(MI_ALU_STORE, MI_ALU_REG0, MI_ALU_ACCU);
}

/*
 * GPR0 = GPR0 << 30;
 */
static void
shl_gpr0_by_30_bits(struct anv_batch *batch)
{
   /* First keep only the lower 34 bits of GPR0 to prevent overflow */
   keep_gpr0_lower_n_bits(batch, 34);

   const uint32_t outer_count = 5;
   const uint32_t inner_count = 6;
   STATIC_ASSERT(outer_count * inner_count == 30);
   const uint32_t cmd_len = 1 + inner_count * 4;

   /* We'll emit 5 commands, each shifting GPR0 left by 6 bits, for a total of
    * 30 left shifts.
    */
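   /* Each ADD below adds GPR0 to itself and stores the result back, which
    * doubles the value, i.e. shifts it left by one bit.  Six ADDs per
    * MI_MATH give a shift by 6; five MI_MATH commands give the full 30.
    */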
   for (int o = 0; o < outer_count; o++) {
      /* Submit one MI_MATH to shift left by 6 bits */
      uint32_t *dw = anv_batch_emitn(batch, cmd_len, GENX(MI_MATH));
      if (!dw) {
         anv_batch_set_error(batch, VK_ERROR_OUT_OF_HOST_MEMORY);
         return;
      }

      dw++;
      for (int i = 0; i < inner_count; i++, dw += 4) {
         dw[0] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCA, MI_ALU_REG0);
         dw[1] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCB, MI_ALU_REG0);
         dw[2] = mi_alu(MI_ALU_ADD, 0, 0);
         dw[3] = mi_alu(MI_ALU_STORE, MI_ALU_REG0, MI_ALU_ACCU);
      }
   }
}

/*
 * GPR0 = GPR0 >> 2;
 *
 * Note that the upper 30 bits of GPR0 are lost!
 */
static void
shr_gpr0_by_2_bits(struct anv_batch *batch)
{
   shl_gpr0_by_30_bits(batch);
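   /* Moving the high dword of GPR0 (now holding the value shifted up by 30)
    * into the low dword and zeroing the high dword is a shift right by 32,
    * for a net shift of 30 - 32 = -2, i.e. GPR0 >> 2.
    */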
   emit_load_alu_reg_reg32(batch, CS_GPR(0) + 4, CS_GPR(0));
   emit_load_alu_reg_imm32(batch, CS_GPR(0) + 4, 0);
}

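/* Store the query result currently held in `reg` as the value_index-th
 * element of the destination, as a 32- or 64-bit value depending on
 * VK_QUERY_RESULT_64_BIT.
 */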
static void
gpu_write_query_result(struct anv_batch *batch,
                       struct anv_buffer *dst_buffer, uint32_t dst_offset,
                       VkQueryResultFlags flags,
                       uint32_t value_index, uint32_t reg)
{
   if (flags & VK_QUERY_RESULT_64_BIT)
      dst_offset += value_index * 8;
   else
      dst_offset += value_index * 4;

   anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
      srm.RegisterAddress = reg;
      srm.MemoryAddress = (struct anv_address) {
         .bo = dst_buffer->bo,
         .offset = dst_buffer->offset + dst_offset,
      };
   }

   if (flags & VK_QUERY_RESULT_64_BIT) {
      anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
         srm.RegisterAddress = reg + 4;
         srm.MemoryAddress = (struct anv_address) {
            .bo = dst_buffer->bo,
            .offset = dst_buffer->offset + dst_offset + 4,
         };
      }
   }
}

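/* Compute end - begin for the (begin, end) pair stored at bo+offset and
 * leave the 64-bit difference in the ALU register dst_reg (readable through
 * the corresponding CS_GPR).
 */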
static void
compute_query_result(struct anv_batch *batch, uint32_t dst_reg,
                     struct anv_bo *bo, uint32_t offset)
{
   emit_load_alu_reg_u64(batch, CS_GPR(0), bo, offset);
   emit_load_alu_reg_u64(batch, CS_GPR(1), bo, offset + 8);

   /* FIXME: We need to clamp the result for 32 bit. */

   uint32_t *dw = anv_batch_emitn(batch, 5, GENX(MI_MATH));
   if (!dw) {
      anv_batch_set_error(batch, VK_ERROR_OUT_OF_HOST_MEMORY);
      return;
   }

   dw[1] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCA, MI_ALU_REG1);
   dw[2] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCB, MI_ALU_REG0);
   dw[3] = mi_alu(MI_ALU_SUB, 0, 0);
   dw[4] = mi_alu(MI_ALU_STORE, dst_reg, MI_ALU_ACCU);
}

void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer commandBuffer,
    VkQueryPool queryPool,
    uint32_t firstQuery,
    uint32_t queryCount,
    VkBuffer destBuffer,
    VkDeviceSize destOffset,
    VkDeviceSize destStride,
    VkQueryResultFlags flags)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);
   uint32_t slot_offset;

   if (flags & VK_QUERY_RESULT_WAIT_BIT) {
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
   }

   for (uint32_t i = 0; i < queryCount; i++) {
      slot_offset = (firstQuery + i) * pool->stride;
      switch (pool->type) {
      case VK_QUERY_TYPE_OCCLUSION:
         compute_query_result(&cmd_buffer->batch, MI_ALU_REG2,
                              &pool->bo, slot_offset + 8);
         gpu_write_query_result(&cmd_buffer->batch, buffer, destOffset,
                                flags, 0, CS_GPR(2));
         break;

      case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
         uint32_t statistics = pool->pipeline_statistics;
         uint32_t idx = 0;
         while (statistics) {
            uint32_t stat = u_bit_scan(&statistics);

            compute_query_result(&cmd_buffer->batch, MI_ALU_REG0,
                                 &pool->bo, slot_offset + idx * 16 + 8);

            /* WaDividePSInvocationCountBy4:HSW,BDW */
            if ((cmd_buffer->device->info.gen == 8 ||
                 cmd_buffer->device->info.is_haswell) &&
                (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) {
               shr_gpr0_by_2_bits(&cmd_buffer->batch);
            }

            gpu_write_query_result(&cmd_buffer->batch, buffer, destOffset,
                                   flags, idx, CS_GPR(0));

            idx++;
         }
         assert(idx == _mesa_bitcount(pool->pipeline_statistics));
         break;
      }

      case VK_QUERY_TYPE_TIMESTAMP:
         emit_load_alu_reg_u64(&cmd_buffer->batch,
                               CS_GPR(2), &pool->bo, slot_offset + 8);
         gpu_write_query_result(&cmd_buffer->batch, buffer, destOffset,
                                flags, 0, CS_GPR(2));
         break;

      default:
         unreachable("unhandled query type");
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         uint32_t idx = (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) ?
                        _mesa_bitcount(pool->pipeline_statistics) : 1;

         emit_load_alu_reg_u64(&cmd_buffer->batch, CS_GPR(0),
                               &pool->bo, slot_offset);
         gpu_write_query_result(&cmd_buffer->batch, buffer, destOffset,
                                flags, idx, CS_GPR(0));
      }

      destOffset += destStride;
   }
}

#else
void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer commandBuffer,
    VkQueryPool queryPool,
    uint32_t firstQuery,
    uint32_t queryCount,
    VkBuffer destBuffer,
    VkDeviceSize destOffset,
    VkDeviceSize destStride,
    VkQueryResultFlags flags)
{
   anv_finishme("Queries not yet supported on Ivy Bridge");
}
#endif