/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <assert.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>

#include "anv_private.h"

#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"

VkResult genX(CreateQueryPool)(
    VkDevice                                    _device,
    const VkQueryPoolCreateInfo*                pCreateInfo,
    const VkAllocationCallbacks*                pAllocator,
    VkQueryPool*                                pQueryPool)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   struct anv_query_pool *pool;
   VkResult result;

   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);

   /* Query pool slots are made up of some number of 64-bit values packed
    * tightly together. The first 64-bit value is always the "available" bit
    * which is 0 when the query is unavailable and 1 when it is available.
    * The 64-bit values that follow are determined by the type of query.
    */
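   /* For example, an occlusion query slot ends up as three uint64s,
    *    { available, begin depth count, end depth count },
    * a timestamp slot as { available, timestamp }, and a pipeline
    * statistics slot as the availability value followed by a begin/end
    * pair for each enabled statistic.
    */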
   uint32_t uint64s_per_slot = 1;

   VkQueryPipelineStatisticFlags pipeline_statistics = 0;
   switch (pCreateInfo->queryType) {
   case VK_QUERY_TYPE_OCCLUSION:
      /* Occlusion queries have two values: begin and end. */
      uint64s_per_slot += 2;
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
      /* Timestamps just have the one timestamp value */
      uint64s_per_slot += 1;
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      pipeline_statistics = pCreateInfo->pipelineStatistics;
      /* We're going to trust this field implicitly so we need to ensure that
       * no unhandled extension bits leak in.
       */
      pipeline_statistics &= ANV_PIPELINE_STATISTICS_MASK;

      /* Statistics queries record a begin and an end value for every statistic */
      uint64s_per_slot += 2 * _mesa_bitcount(pipeline_statistics);
      break;
   default:
      assert(!"Invalid query type");
   }

   pool = vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
                    VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (pool == NULL)
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

   pool->type = pCreateInfo->queryType;
   pool->pipeline_statistics = pipeline_statistics;
   pool->stride = uint64s_per_slot * sizeof(uint64_t);
   pool->slots = pCreateInfo->queryCount;

   uint64_t size = pool->slots * pool->stride;
   result = anv_bo_init_new(&pool->bo, device, size);
   if (result != VK_SUCCESS)
      goto fail;

   /* For query pools, we set the caching mode to I915_CACHING_CACHED. On LLC
    * platforms, this does nothing. On non-LLC platforms, this means snooping
    * which comes at a slight cost. However, the buffers aren't big, won't be
    * written frequently, and trying to handle the flushing manually without
    * doing too much flushing is extremely painful.
    */
   anv_gem_set_caching(device, pool->bo.gem_handle, I915_CACHING_CACHED);

   pool->bo.map = anv_gem_mmap(device, pool->bo.gem_handle, 0, size, 0);

   *pQueryPool = anv_query_pool_to_handle(pool);

   return VK_SUCCESS;

 fail:
   vk_free2(&device->alloc, pAllocator, pool);

   return result;
}

void genX(DestroyQueryPool)(
    VkDevice                                    _device,
    VkQueryPool                                 _pool,
    const VkAllocationCallbacks*                pAllocator)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_query_pool, pool, _pool);

   if (!pool)
      return;

   anv_gem_munmap(pool->bo.map, pool->bo.size);
   anv_gem_close(device, pool->bo.gem_handle);
   vk_free2(&device->alloc, pAllocator, pool);
}

static void
cpu_write_query_result(void *dst_slot, VkQueryResultFlags flags,
                       uint32_t value_index, uint64_t result)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      uint64_t *dst64 = dst_slot;
      dst64[value_index] = result;
   } else {
      uint32_t *dst32 = dst_slot;
      dst32[value_index] = result;
   }
}

static bool
query_is_available(uint64_t *slot)
{
   return *(volatile uint64_t *)slot;
}

static VkResult
wait_for_available(struct anv_device *device,
                   struct anv_query_pool *pool, uint64_t *slot)
{
   while (true) {
      if (query_is_available(slot))
         return VK_SUCCESS;

      int ret = anv_gem_busy(device, pool->bo.gem_handle);
      if (ret == 1) {
         /* The BO is still busy, keep waiting. */
         continue;
      } else if (ret == -1) {
         /* We don't know the real error. */
         device->lost = true;
         return vk_errorf(VK_ERROR_DEVICE_LOST, "gem wait failed: %m");
      } else {
         assert(ret == 0);
         /* The BO is no longer busy. */
         if (query_is_available(slot)) {
            return VK_SUCCESS;
         } else {
            VkResult status = anv_device_query_status(device);
            if (status != VK_SUCCESS)
               return status;

            /* If we haven't seen availability yet, then we never will. This
             * can only happen if we have a client error where they call
             * GetQueryPoolResults on a query that they haven't submitted to
             * the GPU yet. The spec allows us to do anything in this case,
             * but returning VK_SUCCESS doesn't seem right and we shouldn't
             * just keep spinning.
             */
            return VK_NOT_READY;
         }
      }
   }
}

VkResult genX(GetQueryPoolResults)(
    VkDevice                                    _device,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    size_t                                      dataSize,
    void*                                       pData,
    VkDeviceSize                                stride,
    VkQueryResultFlags                          flags)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   assert(pool->type == VK_QUERY_TYPE_OCCLUSION ||
          pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
          pool->type == VK_QUERY_TYPE_TIMESTAMP);

   if (unlikely(device->lost))
      return VK_ERROR_DEVICE_LOST;

   if (pData == NULL)
      return VK_SUCCESS;

   void *data_end = pData + dataSize;

   VkResult status = VK_SUCCESS;
   for (uint32_t i = 0; i < queryCount; i++) {
      uint64_t *slot = pool->bo.map + (firstQuery + i) * pool->stride;

      /* Availability is always at the start of the slot */
      bool available = slot[0];

      if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) {
         status = wait_for_available(device, pool, slot);
         if (status != VK_SUCCESS)
            return status;

         available = true;
      }

      /* From the Vulkan 1.0.42 spec:
       *
       *    "If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
       *    both not set then no result values are written to pData for
       *    queries that are in the unavailable state at the time of the call,
       *    and vkGetQueryPoolResults returns VK_NOT_READY. However,
       *    availability state is still written to pData for those queries if
       *    VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set."
       */
      bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT);

      if (write_results) {
         switch (pool->type) {
         case VK_QUERY_TYPE_OCCLUSION: {
            cpu_write_query_result(pData, flags, 0, slot[2] - slot[1]);
            break;
         }

         case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
            uint32_t statistics = pool->pipeline_statistics;
            uint32_t idx = 0;
            while (statistics) {
               uint32_t stat = u_bit_scan(&statistics);
               uint64_t result = slot[idx * 2 + 2] - slot[idx * 2 + 1];

               /* WaDividePSInvocationCountBy4:HSW,BDW */
               if ((device->info.gen == 8 || device->info.is_haswell) &&
                   (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT)
                  result >>= 2;

               cpu_write_query_result(pData, flags, idx, result);

               idx++;
            }
            assert(idx == _mesa_bitcount(pool->pipeline_statistics));
            break;
         }

         case VK_QUERY_TYPE_TIMESTAMP: {
            cpu_write_query_result(pData, flags, 0, slot[1]);
            break;
         }
         default:
            unreachable("invalid pool type");
         }
      } else {
         status = VK_NOT_READY;
      }

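      /* The availability value is written after the query's result values:
       * one result for occlusion and timestamp queries, and one per enabled
       * statistic for pipeline statistics queries.
       */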
      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         uint32_t idx = (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) ?
                        _mesa_bitcount(pool->pipeline_statistics) : 1;
         cpu_write_query_result(pData, flags, idx, available);
      }

      pData += stride;
      if (pData >= data_end)
         break;
   }

   return status;
}

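/* Occlusion queries are implemented with the pixel pipe's depth count: a
 * depth-stalled PIPE_CONTROL writes the current PS_DEPTH_COUNT value to the
 * given address, and the query result is the difference between the end and
 * begin snapshots.
 */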
static void
emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer,
                    struct anv_bo *bo, uint32_t offset)
{
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType = DAT_PPGTT;
      pc.PostSyncOperation = WritePSDepthCount;
      pc.DepthStallEnable = true;
      pc.Address = (struct anv_address) { bo, offset };

      if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
         pc.CommandStreamerStallEnable = true;
   }
}

static void
emit_query_availability(struct anv_cmd_buffer *cmd_buffer,
                        struct anv_bo *bo, uint32_t offset)
{
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType = DAT_PPGTT;
      pc.PostSyncOperation = WriteImmediateData;
      pc.Address = (struct anv_address) { bo, offset };
      pc.ImmediateData = 1;
   }
}

void genX(CmdResetQueryPool)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

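   /* Resetting a query only needs to clear the availability value at the
    * start of each slot; the begin/end values are overwritten the next time
    * the query is used.
    */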
   for (uint32_t i = 0; i < queryCount; i++) {
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdm) {
         sdm.Address = (struct anv_address) {
            .bo = &pool->bo,
            .offset = (firstQuery + i) * pool->stride,
         };
         sdm.ImmediateData = 0;
      }
   }
}

static const uint32_t vk_pipeline_stat_to_reg[] = {
   GENX(IA_VERTICES_COUNT_num),
   GENX(IA_PRIMITIVES_COUNT_num),
   GENX(VS_INVOCATION_COUNT_num),
   GENX(GS_INVOCATION_COUNT_num),
   GENX(GS_PRIMITIVES_COUNT_num),
   GENX(CL_INVOCATION_COUNT_num),
   GENX(CL_PRIMITIVES_COUNT_num),
   GENX(PS_INVOCATION_COUNT_num),
   GENX(HS_INVOCATION_COUNT_num),
   GENX(DS_INVOCATION_COUNT_num),
   GENX(CS_INVOCATION_COUNT_num),
};

static void
emit_pipeline_stat(struct anv_cmd_buffer *cmd_buffer, uint32_t stat,
                   struct anv_bo *bo, uint32_t offset)
{
   STATIC_ASSERT(ANV_PIPELINE_STATISTICS_MASK ==
                 (1 << ARRAY_SIZE(vk_pipeline_stat_to_reg)) - 1);

   assert(stat < ARRAY_SIZE(vk_pipeline_stat_to_reg));
   uint32_t reg = vk_pipeline_stat_to_reg[stat];

   anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), lrm) {
      lrm.RegisterAddress = reg;
      lrm.MemoryAddress = (struct anv_address) { bo, offset };
   }
   anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), lrm) {
      lrm.RegisterAddress = reg + 4;
      lrm.MemoryAddress = (struct anv_address) { bo, offset + 4 };
   }
}

void genX(CmdBeginQuery)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query,
    VkQueryControlFlags                         flags)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   /* Workaround: When meta uses the pipeline with the VS disabled, it seems
    * that the pipelining of the depth write breaks. What we see is that
    * samples from the render-pass clear leak into the first query issued
    * immediately after the clear. Emitting a PIPE_CONTROL with a post-sync
    * operation and DepthStallEnable set seems to work around the issue.
    */
   if (cmd_buffer->state.need_query_wa) {
      cmd_buffer->state.need_query_wa = false;
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.DepthCacheFlushEnable = true;
         pc.DepthStallEnable = true;
      }
   }

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_ps_depth_count(cmd_buffer, &pool->bo, query * pool->stride + 8);
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      /* TODO: This might only be necessary for certain stats */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      uint32_t statistics = pool->pipeline_statistics;
      uint32_t offset = query * pool->stride + 8;
      while (statistics) {
         uint32_t stat = u_bit_scan(&statistics);
         emit_pipeline_stat(cmd_buffer, stat, &pool->bo, offset);
         offset += 16;
      }
      break;
   }

   default:
      unreachable("");
   }
}

void genX(CmdEndQuery)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_ps_depth_count(cmd_buffer, &pool->bo, query * pool->stride + 16);
      emit_query_availability(cmd_buffer, &pool->bo, query * pool->stride);
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      /* TODO: This might only be necessary for certain stats */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      uint32_t statistics = pool->pipeline_statistics;
      uint32_t offset = query * pool->stride + 16;
      while (statistics) {
         uint32_t stat = u_bit_scan(&statistics);
         emit_pipeline_stat(cmd_buffer, stat, &pool->bo, offset);
         offset += 16;
      }

      emit_query_availability(cmd_buffer, &pool->bo, query * pool->stride);
      break;
   }

   default:
      unreachable("");
   }
}

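/* MMIO address of the command streamer TIMESTAMP register; the 64-bit value
 * is stored below as two 32-bit halves (TIMESTAMP and TIMESTAMP + 4).
 */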
#define TIMESTAMP 0x2358

void genX(CmdWriteTimestamp)(
    VkCommandBuffer                             commandBuffer,
    VkPipelineStageFlagBits                     pipelineStage,
    VkQueryPool                                 queryPool,
    uint32_t                                    query)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   uint32_t offset = query * pool->stride;

   assert(pool->type == VK_QUERY_TYPE_TIMESTAMP);

   switch (pipelineStage) {
   case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
         srm.RegisterAddress = TIMESTAMP;
         srm.MemoryAddress = (struct anv_address) { &pool->bo, offset + 8 };
      }
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
         srm.RegisterAddress = TIMESTAMP + 4;
         srm.MemoryAddress = (struct anv_address) { &pool->bo, offset + 12 };
      }
      break;

   default:
      /* Everything else is bottom-of-pipe */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.DestinationAddressType = DAT_PPGTT;
         pc.PostSyncOperation = WriteTimestamp;
         pc.Address = (struct anv_address) { &pool->bo, offset + 8 };

         if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
            pc.CommandStreamerStallEnable = true;
      }
      break;
   }

   emit_query_availability(cmd_buffer, &pool->bo, offset);
}

#if GEN_GEN > 7 || GEN_IS_HASWELL

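/* The GPU-side result computation below relies on MI_MATH and the command
 * streamer general-purpose registers, which are only available on Haswell
 * and later; earlier gen7 parts fall back to the stub at the end of this
 * file.
 */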
#define alu_opcode(v)   __gen_uint((v), 20, 31)
#define alu_operand1(v) __gen_uint((v), 10, 19)
#define alu_operand2(v) __gen_uint((v), 0, 9)
#define alu(opcode, operand1, operand2) \
   alu_opcode(opcode) | alu_operand1(operand1) | alu_operand2(operand2)

#define OPCODE_NOOP     0x000
#define OPCODE_LOAD     0x080
#define OPCODE_LOADINV  0x480
#define OPCODE_LOAD0    0x081
#define OPCODE_LOAD1    0x481
#define OPCODE_ADD      0x100
#define OPCODE_SUB      0x101
#define OPCODE_AND      0x102
#define OPCODE_OR       0x103
#define OPCODE_XOR      0x104
#define OPCODE_STORE    0x180
#define OPCODE_STOREINV 0x580

#define OPERAND_R0   0x00
#define OPERAND_R1   0x01
#define OPERAND_R2   0x02
#define OPERAND_R3   0x03
#define OPERAND_R4   0x04
#define OPERAND_SRCA 0x20
#define OPERAND_SRCB 0x21
#define OPERAND_ACCU 0x31
#define OPERAND_ZF   0x32
#define OPERAND_CF   0x33

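/* The command streamer general-purpose registers are 64 bits wide and spaced
 * 8 bytes apart; they are what MI_MATH's R0..Rn operands refer to.
 */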
#define CS_GPR(n) (0x2600 + (n) * 8)

static void
emit_load_alu_reg_u64(struct anv_batch *batch, uint32_t reg,
                      struct anv_bo *bo, uint32_t offset)
{
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
      lrm.RegisterAddress = reg;
      lrm.MemoryAddress = (struct anv_address) { bo, offset };
   }
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
      lrm.RegisterAddress = reg + 4;
      lrm.MemoryAddress = (struct anv_address) { bo, offset + 4 };
   }
}

static void
emit_load_alu_reg_imm32(struct anv_batch *batch, uint32_t reg, uint32_t imm)
{
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
      lri.RegisterOffset = reg;
      lri.DataDWord = imm;
   }
}

static void
emit_load_alu_reg_imm64(struct anv_batch *batch, uint32_t reg, uint64_t imm)
{
   emit_load_alu_reg_imm32(batch, reg, (uint32_t)imm);
   emit_load_alu_reg_imm32(batch, reg + 4, (uint32_t)(imm >> 32));
}

static void
emit_load_alu_reg_reg32(struct anv_batch *batch, uint32_t src, uint32_t dst)
{
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_REG), lrr) {
      lrr.SourceRegisterAddress = src;
      lrr.DestinationRegisterAddress = dst;
   }
}

/*
 * GPR0 = GPR0 & ((1ull << n) - 1);
 */
static void
keep_gpr0_lower_n_bits(struct anv_batch *batch, uint32_t n)
{
   assert(n < 64);
   emit_load_alu_reg_imm64(batch, CS_GPR(1), (1ull << n) - 1);

   uint32_t *dw = anv_batch_emitn(batch, 5, GENX(MI_MATH));
   dw[1] = alu(OPCODE_LOAD, OPERAND_SRCA, OPERAND_R0);
   dw[2] = alu(OPCODE_LOAD, OPERAND_SRCB, OPERAND_R1);
   dw[3] = alu(OPCODE_AND, 0, 0);
   dw[4] = alu(OPCODE_STORE, OPERAND_R0, OPERAND_ACCU);
}

/*
 * GPR0 = GPR0 << 30;
 */
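/* The MI_MATH ALU has no shift opcode, so the shift is built out of
 * additions: loading GPR0 into both ALU sources and adding doubles the
 * value, i.e. shifts it left by one bit.
 */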
static void
shl_gpr0_by_30_bits(struct anv_batch *batch)
{
   /* First keep only the lower 34 bits of GPR0 so the shift below cannot
    * overflow 64 bits.
    */
   keep_gpr0_lower_n_bits(batch, 34);

   const uint32_t outer_count = 5;
   const uint32_t inner_count = 6;
   STATIC_ASSERT(outer_count * inner_count == 30);
   const uint32_t cmd_len = 1 + inner_count * 4;

   /* We'll emit 5 MI_MATH commands, each shifting GPR0 left by 6 bits, for a
    * total shift of 30 bits.
    */
   for (int o = 0; o < outer_count; o++) {
      /* Submit one MI_MATH to shift left by 6 bits */
      uint32_t *dw = anv_batch_emitn(batch, cmd_len, GENX(MI_MATH));
      dw++;
      for (int i = 0; i < inner_count; i++, dw += 4) {
         dw[0] = alu(OPCODE_LOAD, OPERAND_SRCA, OPERAND_R0);
         dw[1] = alu(OPCODE_LOAD, OPERAND_SRCB, OPERAND_R0);
         dw[2] = alu(OPCODE_ADD, 0, 0);
         dw[3] = alu(OPCODE_STORE, OPERAND_R0, OPERAND_ACCU);
      }
   }
}

/*
 * GPR0 = GPR0 >> 2;
 *
 * Note that the upper 30 bits of GPR0 are lost!
 */
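/* Implemented by shifting left by 30 and then moving the high dword of GPR0
 * into its low dword, which amounts to an additional shift right by 32.
 */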
static void
shr_gpr0_by_2_bits(struct anv_batch *batch)
{
   shl_gpr0_by_30_bits(batch);
   emit_load_alu_reg_reg32(batch, CS_GPR(0) + 4, CS_GPR(0));
   emit_load_alu_reg_imm32(batch, CS_GPR(0) + 4, 0);
}

static void
gpu_write_query_result(struct anv_batch *batch,
                       struct anv_buffer *dst_buffer, uint32_t dst_offset,
                       VkQueryResultFlags flags,
                       uint32_t value_index, uint32_t reg)
{
   if (flags & VK_QUERY_RESULT_64_BIT)
      dst_offset += value_index * 8;
   else
      dst_offset += value_index * 4;

   anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
      srm.RegisterAddress = reg;
      srm.MemoryAddress = (struct anv_address) {
         .bo = dst_buffer->bo,
         .offset = dst_buffer->offset + dst_offset,
      };
   }

   if (flags & VK_QUERY_RESULT_64_BIT) {
      anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
         srm.RegisterAddress = reg + 4;
         srm.MemoryAddress = (struct anv_address) {
            .bo = dst_buffer->bo,
            .offset = dst_buffer->offset + dst_offset + 4,
         };
      }
   }
}

static void
compute_query_result(struct anv_batch *batch, uint32_t dst_reg,
                     struct anv_bo *bo, uint32_t offset)
{
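   /* Load the begin value of the slot into GPR0 and the end value into GPR1,
    * then use the ALU to compute dst_reg = GPR1 - GPR0.
    */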
   emit_load_alu_reg_u64(batch, CS_GPR(0), bo, offset);
   emit_load_alu_reg_u64(batch, CS_GPR(1), bo, offset + 8);

   /* FIXME: We need to clamp the result for 32 bit. */

   uint32_t *dw = anv_batch_emitn(batch, 5, GENX(MI_MATH));
   if (!dw) {
      anv_batch_set_error(batch, VK_ERROR_OUT_OF_HOST_MEMORY);
      return;
   }

   dw[1] = alu(OPCODE_LOAD, OPERAND_SRCA, OPERAND_R1);
   dw[2] = alu(OPCODE_LOAD, OPERAND_SRCB, OPERAND_R0);
   dw[3] = alu(OPCODE_SUB, 0, 0);
   dw[4] = alu(OPCODE_STORE, dst_reg, OPERAND_ACCU);
}

void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    VkBuffer                                    destBuffer,
    VkDeviceSize                                destOffset,
    VkDeviceSize                                destStride,
    VkQueryResultFlags                          flags)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);
   uint32_t slot_offset;

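   /* With VK_QUERY_RESULT_WAIT_BIT, stall the command streamer up front so
    * that the register loads below are not executed until earlier work
    * writing the query results has completed.
    */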
   if (flags & VK_QUERY_RESULT_WAIT_BIT) {
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
   }

   for (uint32_t i = 0; i < queryCount; i++) {
      slot_offset = (firstQuery + i) * pool->stride;
      switch (pool->type) {
      case VK_QUERY_TYPE_OCCLUSION:
         compute_query_result(&cmd_buffer->batch, OPERAND_R2,
                              &pool->bo, slot_offset + 8);
         gpu_write_query_result(&cmd_buffer->batch, buffer, destOffset,
                                flags, 0, CS_GPR(2));
         break;

      case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
         uint32_t statistics = pool->pipeline_statistics;
         uint32_t idx = 0;
         while (statistics) {
            uint32_t stat = u_bit_scan(&statistics);

            compute_query_result(&cmd_buffer->batch, OPERAND_R0,
                                 &pool->bo, slot_offset + idx * 16 + 8);

            /* WaDividePSInvocationCountBy4:HSW,BDW */
            if ((cmd_buffer->device->info.gen == 8 ||
                 cmd_buffer->device->info.is_haswell) &&
                (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) {
               shr_gpr0_by_2_bits(&cmd_buffer->batch);
            }

            gpu_write_query_result(&cmd_buffer->batch, buffer, destOffset,
                                   flags, idx, CS_GPR(0));

            idx++;
         }
         assert(idx == _mesa_bitcount(pool->pipeline_statistics));
         break;
      }

      case VK_QUERY_TYPE_TIMESTAMP:
         emit_load_alu_reg_u64(&cmd_buffer->batch,
                               CS_GPR(2), &pool->bo, slot_offset + 8);
         gpu_write_query_result(&cmd_buffer->batch, buffer, destOffset,
                                flags, 0, CS_GPR(2));
         break;

      default:
         unreachable("unhandled query type");
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         uint32_t idx = (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) ?
                        _mesa_bitcount(pool->pipeline_statistics) : 1;

         emit_load_alu_reg_u64(&cmd_buffer->batch, CS_GPR(0),
                               &pool->bo, slot_offset);
         gpu_write_query_result(&cmd_buffer->batch, buffer, destOffset,
                                flags, idx, CS_GPR(0));
      }

      destOffset += destStride;
   }
}

#else
void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    VkBuffer                                    destBuffer,
    VkDeviceSize                                destOffset,
    VkDeviceSize                                destStride,
    VkQueryResultFlags                          flags)
{
   anv_finishme("Queries not yet supported on Ivy Bridge");
}
#endif