/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
30 #include "anv_private.h"
32 #include "genxml/gen_macros.h"
33 #include "genxml/genX_pack.h"
35 VkResult
genX(CreateQueryPool
)(
37 const VkQueryPoolCreateInfo
* pCreateInfo
,
38 const VkAllocationCallbacks
* pAllocator
,
39 VkQueryPool
* pQueryPool
)
41 ANV_FROM_HANDLE(anv_device
, device
, _device
);
42 struct anv_query_pool
*pool
;
45 assert(pCreateInfo
->sType
== VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO
);
47 /* Query pool slots are made up of some number of 64-bit values packed
48 * tightly together. The first 64-bit value is always the "available" bit
49 * which is 0 when the query is unavailable and 1 when it is available.
50 * The 64-bit values that follow are determined by the type of query.
52 uint32_t uint64s_per_slot
= 1;
54 VkQueryPipelineStatisticFlags pipeline_statistics
= 0;
55 switch (pCreateInfo
->queryType
) {
56 case VK_QUERY_TYPE_OCCLUSION
:
57 /* Occlusion queries have two values: begin and end. */
58 uint64s_per_slot
+= 2;
60 case VK_QUERY_TYPE_TIMESTAMP
:
61 /* Timestamps just have the one timestamp value */
62 uint64s_per_slot
+= 1;
64 case VK_QUERY_TYPE_PIPELINE_STATISTICS
:
65 pipeline_statistics
= pCreateInfo
->pipelineStatistics
;
66 /* We're going to trust this field implicitly so we need to ensure that
67 * no unhandled extension bits leak in.
69 pipeline_statistics
&= ANV_PIPELINE_STATISTICS_MASK
;
71 /* Statistics queries have a min and max for every statistic */
72 uint64s_per_slot
+= 2 * _mesa_bitcount(pipeline_statistics
);
75 assert(!"Invalid query type");
78 pool
= vk_alloc2(&device
->alloc
, pAllocator
, sizeof(*pool
), 8,
79 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT
);
81 return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY
);
83 pool
->type
= pCreateInfo
->queryType
;
84 pool
->pipeline_statistics
= pipeline_statistics
;
85 pool
->stride
= uint64s_per_slot
* sizeof(uint64_t);
86 pool
->slots
= pCreateInfo
->queryCount
;
88 uint64_t size
= pool
->slots
* pool
->stride
;
89 result
= anv_bo_init_new(&pool
->bo
, device
, size
);
90 if (result
!= VK_SUCCESS
)
93 /* For query pools, we set the caching mode to I915_CACHING_CACHED. On LLC
94 * platforms, this does nothing. On non-LLC platforms, this means snooping
95 * which comes at a slight cost. However, the buffers aren't big, won't be
96 * written frequently, and trying to handle the flushing manually without
97 * doing too much flushing is extremely painful.
99 anv_gem_set_caching(device
, pool
->bo
.gem_handle
, I915_CACHING_CACHED
);
101 pool
->bo
.map
= anv_gem_mmap(device
, pool
->bo
.gem_handle
, 0, size
, 0);
103 *pQueryPool
= anv_query_pool_to_handle(pool
);
108 vk_free2(&device
->alloc
, pAllocator
, pool
);
113 void genX(DestroyQueryPool
)(
116 const VkAllocationCallbacks
* pAllocator
)
118 ANV_FROM_HANDLE(anv_device
, device
, _device
);
119 ANV_FROM_HANDLE(anv_query_pool
, pool
, _pool
);
124 anv_gem_munmap(pool
->bo
.map
, pool
->bo
.size
);
125 anv_gem_close(device
, pool
->bo
.gem_handle
);
126 vk_free2(&device
->alloc
, pAllocator
, pool
);
130 cpu_write_query_result(void *dst_slot
, VkQueryResultFlags flags
,
131 uint32_t value_index
, uint64_t result
)
133 if (flags
& VK_QUERY_RESULT_64_BIT
) {
134 uint64_t *dst64
= dst_slot
;
135 dst64
[value_index
] = result
;
137 uint32_t *dst32
= dst_slot
;
138 dst32
[value_index
] = result
;
143 query_is_available(uint64_t *slot
)
145 return *(volatile uint64_t *)slot
;
149 wait_for_available(struct anv_device
*device
,
150 struct anv_query_pool
*pool
, uint64_t *slot
)
153 if (query_is_available(slot
))
156 int ret
= anv_gem_busy(device
, pool
->bo
.gem_handle
);
158 /* The BO is still busy, keep waiting. */
160 } else if (ret
== -1) {
161 /* We don't know the real error. */
163 return vk_errorf(VK_ERROR_DEVICE_LOST
, "gem wait failed: %m");
166 /* The BO is no longer busy. */
167 if (query_is_available(slot
)) {
170 VkResult status
= anv_device_query_status(device
);
171 if (status
!= VK_SUCCESS
)
174 /* If we haven't seen availability yet, then we never will. This
175 * can only happen if we have a client error where they call
176 * GetQueryPoolResults on a query that they haven't submitted to
177 * the GPU yet. The spec allows us to do anything in this case,
178 * but returning VK_SUCCESS doesn't seem right and we shouldn't
179 * just keep spinning.
187 VkResult
genX(GetQueryPoolResults
)(
189 VkQueryPool queryPool
,
195 VkQueryResultFlags flags
)
197 ANV_FROM_HANDLE(anv_device
, device
, _device
);
198 ANV_FROM_HANDLE(anv_query_pool
, pool
, queryPool
);
200 assert(pool
->type
== VK_QUERY_TYPE_OCCLUSION
||
201 pool
->type
== VK_QUERY_TYPE_PIPELINE_STATISTICS
||
202 pool
->type
== VK_QUERY_TYPE_TIMESTAMP
);
204 if (unlikely(device
->lost
))
205 return VK_ERROR_DEVICE_LOST
;
210 void *data_end
= pData
+ dataSize
;
212 VkResult status
= VK_SUCCESS
;
213 for (uint32_t i
= 0; i
< queryCount
; i
++) {
214 uint64_t *slot
= pool
->bo
.map
+ (firstQuery
+ i
) * pool
->stride
;
216 /* Availability is always at the start of the slot */
217 bool available
= slot
[0];
219 if (!available
&& (flags
& VK_QUERY_RESULT_WAIT_BIT
)) {
220 status
= wait_for_available(device
, pool
, slot
);
221 if (status
!= VK_SUCCESS
)
227 /* From the Vulkan 1.0.42 spec:
229 * "If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
230 * both not set then no result values are written to pData for
231 * queries that are in the unavailable state at the time of the call,
232 * and vkGetQueryPoolResults returns VK_NOT_READY. However,
233 * availability state is still written to pData for those queries if
234 * VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set."
236 bool write_results
= available
|| (flags
& VK_QUERY_RESULT_PARTIAL_BIT
);
239 switch (pool
->type
) {
240 case VK_QUERY_TYPE_OCCLUSION
: {
241 cpu_write_query_result(pData
, flags
, 0, slot
[2] - slot
[1]);
245 case VK_QUERY_TYPE_PIPELINE_STATISTICS
: {
246 uint32_t statistics
= pool
->pipeline_statistics
;
249 uint32_t stat
= u_bit_scan(&statistics
);
250 uint64_t result
= slot
[idx
* 2 + 2] - slot
[idx
* 2 + 1];
252 /* WaDividePSInvocationCountBy4:HSW,BDW */
253 if ((device
->info
.gen
== 8 || device
->info
.is_haswell
) &&
254 (1 << stat
) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT
)
257 cpu_write_query_result(pData
, flags
, idx
, result
);
261 assert(idx
== _mesa_bitcount(pool
->pipeline_statistics
));
265 case VK_QUERY_TYPE_TIMESTAMP
: {
266 cpu_write_query_result(pData
, flags
, 0, slot
[1]);
270 unreachable("invalid pool type");
273 status
= VK_NOT_READY
;
276 if (flags
& VK_QUERY_RESULT_WITH_AVAILABILITY_BIT
) {
277 uint32_t idx
= (pool
->type
== VK_QUERY_TYPE_PIPELINE_STATISTICS
) ?
278 _mesa_bitcount(pool
->pipeline_statistics
) : 1;
279 cpu_write_query_result(pData
, flags
, idx
, available
);
283 if (pData
>= data_end
)
291 emit_ps_depth_count(struct anv_cmd_buffer
*cmd_buffer
,
292 struct anv_bo
*bo
, uint32_t offset
)
294 anv_batch_emit(&cmd_buffer
->batch
, GENX(PIPE_CONTROL
), pc
) {
295 pc
.DestinationAddressType
= DAT_PPGTT
;
296 pc
.PostSyncOperation
= WritePSDepthCount
;
297 pc
.DepthStallEnable
= true;
298 pc
.Address
= (struct anv_address
) { bo
, offset
};
300 if (GEN_GEN
== 9 && cmd_buffer
->device
->info
.gt
== 4)
301 pc
.CommandStreamerStallEnable
= true;
306 emit_query_availability(struct anv_cmd_buffer
*cmd_buffer
,
307 struct anv_bo
*bo
, uint32_t offset
)
309 anv_batch_emit(&cmd_buffer
->batch
, GENX(PIPE_CONTROL
), pc
) {
310 pc
.DestinationAddressType
= DAT_PPGTT
;
311 pc
.PostSyncOperation
= WriteImmediateData
;
312 pc
.Address
= (struct anv_address
) { bo
, offset
};
313 pc
.ImmediateData
= 1;
317 void genX(CmdResetQueryPool
)(
318 VkCommandBuffer commandBuffer
,
319 VkQueryPool queryPool
,
323 ANV_FROM_HANDLE(anv_cmd_buffer
, cmd_buffer
, commandBuffer
);
324 ANV_FROM_HANDLE(anv_query_pool
, pool
, queryPool
);
326 for (uint32_t i
= 0; i
< queryCount
; i
++) {
327 anv_batch_emit(&cmd_buffer
->batch
, GENX(MI_STORE_DATA_IMM
), sdm
) {
328 sdm
.Address
= (struct anv_address
) {
330 .offset
= (firstQuery
+ i
) * pool
->stride
,
332 sdm
.ImmediateData
= 0;
337 static const uint32_t vk_pipeline_stat_to_reg
[] = {
338 GENX(IA_VERTICES_COUNT_num
),
339 GENX(IA_PRIMITIVES_COUNT_num
),
340 GENX(VS_INVOCATION_COUNT_num
),
341 GENX(GS_INVOCATION_COUNT_num
),
342 GENX(GS_PRIMITIVES_COUNT_num
),
343 GENX(CL_INVOCATION_COUNT_num
),
344 GENX(CL_PRIMITIVES_COUNT_num
),
345 GENX(PS_INVOCATION_COUNT_num
),
346 GENX(HS_INVOCATION_COUNT_num
),
347 GENX(DS_INVOCATION_COUNT_num
),
348 GENX(CS_INVOCATION_COUNT_num
),
352 emit_pipeline_stat(struct anv_cmd_buffer
*cmd_buffer
, uint32_t stat
,
353 struct anv_bo
*bo
, uint32_t offset
)
355 STATIC_ASSERT(ANV_PIPELINE_STATISTICS_MASK
==
356 (1 << ARRAY_SIZE(vk_pipeline_stat_to_reg
)) - 1);
358 assert(stat
< ARRAY_SIZE(vk_pipeline_stat_to_reg
));
359 uint32_t reg
= vk_pipeline_stat_to_reg
[stat
];
361 anv_batch_emit(&cmd_buffer
->batch
, GENX(MI_STORE_REGISTER_MEM
), lrm
) {
362 lrm
.RegisterAddress
= reg
,
363 lrm
.MemoryAddress
= (struct anv_address
) { bo
, offset
};
365 anv_batch_emit(&cmd_buffer
->batch
, GENX(MI_STORE_REGISTER_MEM
), lrm
) {
366 lrm
.RegisterAddress
= reg
+ 4,
367 lrm
.MemoryAddress
= (struct anv_address
) { bo
, offset
+ 4 };
371 void genX(CmdBeginQuery
)(
372 VkCommandBuffer commandBuffer
,
373 VkQueryPool queryPool
,
375 VkQueryControlFlags flags
)
377 ANV_FROM_HANDLE(anv_cmd_buffer
, cmd_buffer
, commandBuffer
);
378 ANV_FROM_HANDLE(anv_query_pool
, pool
, queryPool
);
380 /* Workaround: When meta uses the pipeline with the VS disabled, it seems
381 * that the pipelining of the depth write breaks. What we see is that
382 * samples from the render pass clear leaks into the first query
383 * immediately after the clear. Doing a pipecontrol with a post-sync
384 * operation and DepthStallEnable seems to work around the issue.
386 if (cmd_buffer
->state
.need_query_wa
) {
387 cmd_buffer
->state
.need_query_wa
= false;
388 anv_batch_emit(&cmd_buffer
->batch
, GENX(PIPE_CONTROL
), pc
) {
389 pc
.DepthCacheFlushEnable
= true;
390 pc
.DepthStallEnable
= true;
394 switch (pool
->type
) {
395 case VK_QUERY_TYPE_OCCLUSION
:
396 emit_ps_depth_count(cmd_buffer
, &pool
->bo
, query
* pool
->stride
+ 8);
399 case VK_QUERY_TYPE_PIPELINE_STATISTICS
: {
400 /* TODO: This might only be necessary for certain stats */
401 anv_batch_emit(&cmd_buffer
->batch
, GENX(PIPE_CONTROL
), pc
) {
402 pc
.CommandStreamerStallEnable
= true;
403 pc
.StallAtPixelScoreboard
= true;
406 uint32_t statistics
= pool
->pipeline_statistics
;
407 uint32_t offset
= query
* pool
->stride
+ 8;
409 uint32_t stat
= u_bit_scan(&statistics
);
410 emit_pipeline_stat(cmd_buffer
, stat
, &pool
->bo
, offset
);
421 void genX(CmdEndQuery
)(
422 VkCommandBuffer commandBuffer
,
423 VkQueryPool queryPool
,
426 ANV_FROM_HANDLE(anv_cmd_buffer
, cmd_buffer
, commandBuffer
);
427 ANV_FROM_HANDLE(anv_query_pool
, pool
, queryPool
);
429 switch (pool
->type
) {
430 case VK_QUERY_TYPE_OCCLUSION
:
431 emit_ps_depth_count(cmd_buffer
, &pool
->bo
, query
* pool
->stride
+ 16);
432 emit_query_availability(cmd_buffer
, &pool
->bo
, query
* pool
->stride
);
435 case VK_QUERY_TYPE_PIPELINE_STATISTICS
: {
436 /* TODO: This might only be necessary for certain stats */
437 anv_batch_emit(&cmd_buffer
->batch
, GENX(PIPE_CONTROL
), pc
) {
438 pc
.CommandStreamerStallEnable
= true;
439 pc
.StallAtPixelScoreboard
= true;
442 uint32_t statistics
= pool
->pipeline_statistics
;
443 uint32_t offset
= query
* pool
->stride
+ 16;
445 uint32_t stat
= u_bit_scan(&statistics
);
446 emit_pipeline_stat(cmd_buffer
, stat
, &pool
->bo
, offset
);
450 emit_query_availability(cmd_buffer
, &pool
->bo
, query
* pool
->stride
);
459 #define TIMESTAMP 0x2358
461 void genX(CmdWriteTimestamp
)(
462 VkCommandBuffer commandBuffer
,
463 VkPipelineStageFlagBits pipelineStage
,
464 VkQueryPool queryPool
,
467 ANV_FROM_HANDLE(anv_cmd_buffer
, cmd_buffer
, commandBuffer
);
468 ANV_FROM_HANDLE(anv_query_pool
, pool
, queryPool
);
469 uint32_t offset
= query
* pool
->stride
;
471 assert(pool
->type
== VK_QUERY_TYPE_TIMESTAMP
);
473 switch (pipelineStage
) {
474 case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT
:
475 anv_batch_emit(&cmd_buffer
->batch
, GENX(MI_STORE_REGISTER_MEM
), srm
) {
476 srm
.RegisterAddress
= TIMESTAMP
;
477 srm
.MemoryAddress
= (struct anv_address
) { &pool
->bo
, offset
+ 8 };
479 anv_batch_emit(&cmd_buffer
->batch
, GENX(MI_STORE_REGISTER_MEM
), srm
) {
480 srm
.RegisterAddress
= TIMESTAMP
+ 4;
481 srm
.MemoryAddress
= (struct anv_address
) { &pool
->bo
, offset
+ 12 };
486 /* Everything else is bottom-of-pipe */
487 anv_batch_emit(&cmd_buffer
->batch
, GENX(PIPE_CONTROL
), pc
) {
488 pc
.DestinationAddressType
= DAT_PPGTT
;
489 pc
.PostSyncOperation
= WriteTimestamp
;
490 pc
.Address
= (struct anv_address
) { &pool
->bo
, offset
+ 8 };
492 if (GEN_GEN
== 9 && cmd_buffer
->device
->info
.gt
== 4)
493 pc
.CommandStreamerStallEnable
= true;
498 emit_query_availability(cmd_buffer
, &pool
->bo
, offset
);
501 #if GEN_GEN > 7 || GEN_IS_HASWELL
503 static inline uint32_t
504 mi_alu(uint32_t opcode
, uint32_t operand1
, uint32_t operand2
)
506 struct GENX(MI_MATH_ALU_INSTRUCTION
) instr
= {
508 .Operand1
= operand1
,
509 .Operand2
= operand2
,
513 GENX(MI_MATH_ALU_INSTRUCTION_pack
)(NULL
, &dw
, &instr
);
518 #define CS_GPR(n) (0x2600 + (n) * 8)
521 emit_load_alu_reg_u64(struct anv_batch
*batch
, uint32_t reg
,
522 struct anv_bo
*bo
, uint32_t offset
)
524 anv_batch_emit(batch
, GENX(MI_LOAD_REGISTER_MEM
), lrm
) {
525 lrm
.RegisterAddress
= reg
,
526 lrm
.MemoryAddress
= (struct anv_address
) { bo
, offset
};
528 anv_batch_emit(batch
, GENX(MI_LOAD_REGISTER_MEM
), lrm
) {
529 lrm
.RegisterAddress
= reg
+ 4;
530 lrm
.MemoryAddress
= (struct anv_address
) { bo
, offset
+ 4 };
535 emit_load_alu_reg_imm32(struct anv_batch
*batch
, uint32_t reg
, uint32_t imm
)
537 anv_batch_emit(batch
, GENX(MI_LOAD_REGISTER_IMM
), lri
) {
538 lri
.RegisterOffset
= reg
;
544 emit_load_alu_reg_imm64(struct anv_batch
*batch
, uint32_t reg
, uint64_t imm
)
546 emit_load_alu_reg_imm32(batch
, reg
, (uint32_t)imm
);
547 emit_load_alu_reg_imm32(batch
, reg
+ 4, (uint32_t)(imm
>> 32));
551 emit_load_alu_reg_reg32(struct anv_batch
*batch
, uint32_t src
, uint32_t dst
)
553 anv_batch_emit(batch
, GENX(MI_LOAD_REGISTER_REG
), lrr
) {
554 lrr
.SourceRegisterAddress
= src
;
555 lrr
.DestinationRegisterAddress
= dst
;
560 * GPR0 = GPR0 & ((1ull << n) - 1);
563 keep_gpr0_lower_n_bits(struct anv_batch
*batch
, uint32_t n
)
566 emit_load_alu_reg_imm64(batch
, CS_GPR(1), (1ull << n
) - 1);
568 uint32_t *dw
= anv_batch_emitn(batch
, 5, GENX(MI_MATH
));
570 anv_batch_set_error(batch
, VK_ERROR_OUT_OF_HOST_MEMORY
);
574 dw
[1] = mi_alu(MI_ALU_LOAD
, MI_ALU_SRCA
, MI_ALU_REG0
);
575 dw
[2] = mi_alu(MI_ALU_LOAD
, MI_ALU_SRCB
, MI_ALU_REG1
);
576 dw
[3] = mi_alu(MI_ALU_AND
, 0, 0);
577 dw
[4] = mi_alu(MI_ALU_STORE
, MI_ALU_REG0
, MI_ALU_ACCU
);
584 shl_gpr0_by_30_bits(struct anv_batch
*batch
)
586 /* First we mask 34 bits of GPR0 to prevent overflow */
587 keep_gpr0_lower_n_bits(batch
, 34);
589 const uint32_t outer_count
= 5;
590 const uint32_t inner_count
= 6;
591 STATIC_ASSERT(outer_count
* inner_count
== 30);
592 const uint32_t cmd_len
= 1 + inner_count
* 4;
594 /* We'll emit 5 commands, each shifting GPR0 left by 6 bits, for a total of
597 for (int o
= 0; o
< outer_count
; o
++) {
598 /* Submit one MI_MATH to shift left by 6 bits */
599 uint32_t *dw
= anv_batch_emitn(batch
, cmd_len
, GENX(MI_MATH
));
601 anv_batch_set_error(batch
, VK_ERROR_OUT_OF_HOST_MEMORY
);
606 for (int i
= 0; i
< inner_count
; i
++, dw
+= 4) {
607 dw
[0] = mi_alu(MI_ALU_LOAD
, MI_ALU_SRCA
, MI_ALU_REG0
);
608 dw
[1] = mi_alu(MI_ALU_LOAD
, MI_ALU_SRCB
, MI_ALU_REG0
);
609 dw
[2] = mi_alu(MI_ALU_ADD
, 0, 0);
610 dw
[3] = mi_alu(MI_ALU_STORE
, MI_ALU_REG0
, MI_ALU_ACCU
);
/*
 * GPR0 = GPR0 >> 2;
 *
 * Note that the upper 30 bits of GPR are lost!
 *
 * Implemented as a left shift by 30 followed by moving the high dword into
 * the low dword and zeroing the high dword — a net right shift by 2.
 */
static void
shr_gpr0_by_2_bits(struct anv_batch *batch)
{
   shl_gpr0_by_30_bits(batch);
   emit_load_alu_reg_reg32(batch, CS_GPR(0) + 4, CS_GPR(0));
   emit_load_alu_reg_imm32(batch, CS_GPR(0) + 4, 0);
}
629 gpu_write_query_result(struct anv_batch
*batch
,
630 struct anv_buffer
*dst_buffer
, uint32_t dst_offset
,
631 VkQueryResultFlags flags
,
632 uint32_t value_index
, uint32_t reg
)
634 if (flags
& VK_QUERY_RESULT_64_BIT
)
635 dst_offset
+= value_index
* 8;
637 dst_offset
+= value_index
* 4;
639 anv_batch_emit(batch
, GENX(MI_STORE_REGISTER_MEM
), srm
) {
640 srm
.RegisterAddress
= reg
;
641 srm
.MemoryAddress
= (struct anv_address
) {
642 .bo
= dst_buffer
->bo
,
643 .offset
= dst_buffer
->offset
+ dst_offset
,
647 if (flags
& VK_QUERY_RESULT_64_BIT
) {
648 anv_batch_emit(batch
, GENX(MI_STORE_REGISTER_MEM
), srm
) {
649 srm
.RegisterAddress
= reg
+ 4;
650 srm
.MemoryAddress
= (struct anv_address
) {
651 .bo
= dst_buffer
->bo
,
652 .offset
= dst_buffer
->offset
+ dst_offset
+ 4,
659 compute_query_result(struct anv_batch
*batch
, uint32_t dst_reg
,
660 struct anv_bo
*bo
, uint32_t offset
)
662 emit_load_alu_reg_u64(batch
, CS_GPR(0), bo
, offset
);
663 emit_load_alu_reg_u64(batch
, CS_GPR(1), bo
, offset
+ 8);
665 /* FIXME: We need to clamp the result for 32 bit. */
667 uint32_t *dw
= anv_batch_emitn(batch
, 5, GENX(MI_MATH
));
669 anv_batch_set_error(batch
, VK_ERROR_OUT_OF_HOST_MEMORY
);
673 dw
[1] = mi_alu(MI_ALU_LOAD
, MI_ALU_SRCA
, MI_ALU_REG1
);
674 dw
[2] = mi_alu(MI_ALU_LOAD
, MI_ALU_SRCB
, MI_ALU_REG0
);
675 dw
[3] = mi_alu(MI_ALU_SUB
, 0, 0);
676 dw
[4] = mi_alu(MI_ALU_STORE
, dst_reg
, MI_ALU_ACCU
);
679 void genX(CmdCopyQueryPoolResults
)(
680 VkCommandBuffer commandBuffer
,
681 VkQueryPool queryPool
,
685 VkDeviceSize destOffset
,
686 VkDeviceSize destStride
,
687 VkQueryResultFlags flags
)
689 ANV_FROM_HANDLE(anv_cmd_buffer
, cmd_buffer
, commandBuffer
);
690 ANV_FROM_HANDLE(anv_query_pool
, pool
, queryPool
);
691 ANV_FROM_HANDLE(anv_buffer
, buffer
, destBuffer
);
692 uint32_t slot_offset
;
694 if (flags
& VK_QUERY_RESULT_WAIT_BIT
) {
695 anv_batch_emit(&cmd_buffer
->batch
, GENX(PIPE_CONTROL
), pc
) {
696 pc
.CommandStreamerStallEnable
= true;
697 pc
.StallAtPixelScoreboard
= true;
701 for (uint32_t i
= 0; i
< queryCount
; i
++) {
702 slot_offset
= (firstQuery
+ i
) * pool
->stride
;
703 switch (pool
->type
) {
704 case VK_QUERY_TYPE_OCCLUSION
:
705 compute_query_result(&cmd_buffer
->batch
, MI_ALU_REG2
,
706 &pool
->bo
, slot_offset
+ 8);
707 gpu_write_query_result(&cmd_buffer
->batch
, buffer
, destOffset
,
708 flags
, 0, CS_GPR(2));
711 case VK_QUERY_TYPE_PIPELINE_STATISTICS
: {
712 uint32_t statistics
= pool
->pipeline_statistics
;
715 uint32_t stat
= u_bit_scan(&statistics
);
717 compute_query_result(&cmd_buffer
->batch
, MI_ALU_REG0
,
718 &pool
->bo
, slot_offset
+ idx
* 16 + 8);
720 /* WaDividePSInvocationCountBy4:HSW,BDW */
721 if ((cmd_buffer
->device
->info
.gen
== 8 ||
722 cmd_buffer
->device
->info
.is_haswell
) &&
723 (1 << stat
) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT
) {
724 shr_gpr0_by_2_bits(&cmd_buffer
->batch
);
727 gpu_write_query_result(&cmd_buffer
->batch
, buffer
, destOffset
,
728 flags
, idx
, CS_GPR(0));
732 assert(idx
== _mesa_bitcount(pool
->pipeline_statistics
));
736 case VK_QUERY_TYPE_TIMESTAMP
:
737 emit_load_alu_reg_u64(&cmd_buffer
->batch
,
738 CS_GPR(2), &pool
->bo
, slot_offset
+ 8);
739 gpu_write_query_result(&cmd_buffer
->batch
, buffer
, destOffset
,
740 flags
, 0, CS_GPR(2));
744 unreachable("unhandled query type");
747 if (flags
& VK_QUERY_RESULT_WITH_AVAILABILITY_BIT
) {
748 uint32_t idx
= (pool
->type
== VK_QUERY_TYPE_PIPELINE_STATISTICS
) ?
749 _mesa_bitcount(pool
->pipeline_statistics
) : 1;
751 emit_load_alu_reg_u64(&cmd_buffer
->batch
, CS_GPR(0),
752 &pool
->bo
, slot_offset
);
753 gpu_write_query_result(&cmd_buffer
->batch
, buffer
, destOffset
,
754 flags
, idx
, CS_GPR(0));
757 destOffset
+= destStride
;
762 void genX(CmdCopyQueryPoolResults
)(
763 VkCommandBuffer commandBuffer
,
764 VkQueryPool queryPool
,
768 VkDeviceSize destOffset
,
769 VkDeviceSize destStride
,
770 VkQueryResultFlags flags
)
772 anv_finishme("Queries not yet supported on Ivy Bridge");