/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <assert.h>
#include <stdbool.h>
#include <string.h>

#include "anv_private.h"

#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"

VkResult genX(CreateQueryPool)(
    VkDevice                                    _device,
    const VkQueryPoolCreateInfo*                pCreateInfo,
    const VkAllocationCallbacks*                pAllocator,
    VkQueryPool*                                pQueryPool)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   struct anv_query_pool *pool;
   VkResult result;

   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);

   /* Query pool slots are made up of some number of 64-bit values packed
    * tightly together.  The first 64-bit value is always the "available" bit
    * which is 0 when the query is unavailable and 1 when it is available.
    * The 64-bit values that follow are determined by the type of query.
    */
   uint32_t uint64s_per_slot = 1;
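
   /* As a worked example of the layout described above (illustrative only):
    *
    *    occlusion:  slot[0] = available, slot[1]/slot[2] = begin/end depth
    *                counts, so uint64s_per_slot == 3 and stride == 24 bytes;
    *    timestamp:  slot[0] = available, slot[1] = timestamp, stride == 16;
    *    statistics: slot[0] = available, then a (begin, end) pair per
    *                enabled counter, e.g. 3 counters -> 1 + 2 * 3 = 7
    *                uint64s and a 56-byte stride.
    */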

   VkQueryPipelineStatisticFlags pipeline_statistics = 0;
   switch (pCreateInfo->queryType) {
   case VK_QUERY_TYPE_OCCLUSION:
      /* Occlusion queries have two values: begin and end. */
      uint64s_per_slot += 2;
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
      /* Timestamps just have the one timestamp value */
      uint64s_per_slot += 1;
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      pipeline_statistics = pCreateInfo->pipelineStatistics;
      /* We're going to trust this field implicitly so we need to ensure that
       * no unhandled extension bits leak in.
       */
      pipeline_statistics &= ANV_PIPELINE_STATISTICS_MASK;

      /* Statistics queries have a min and max for every statistic */
      uint64s_per_slot += 2 * _mesa_bitcount(pipeline_statistics);
      break;
   default:
      assert(!"Invalid query type");
   }

   pool = vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
                    VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (pool == NULL)
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

   pool->type = pCreateInfo->queryType;
   pool->pipeline_statistics = pipeline_statistics;
   pool->stride = uint64s_per_slot * sizeof(uint64_t);
   pool->slots = pCreateInfo->queryCount;

   uint64_t size = pool->slots * pool->stride;
   result = anv_bo_init_new(&pool->bo, device, size);
   if (result != VK_SUCCESS)
      goto fail;

   pool->bo.map = anv_gem_mmap(device, pool->bo.gem_handle, 0, size, 0);

   *pQueryPool = anv_query_pool_to_handle(pool);

   return VK_SUCCESS;

 fail:
   vk_free2(&device->alloc, pAllocator, pool);

   return result;
}
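
/* Illustrative usage from the application side (not driver code): a pool of
 * occlusion queries created through the entry point above might look like
 *
 *    VkQueryPoolCreateInfo info = {
 *       .sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO,
 *       .queryType = VK_QUERY_TYPE_OCCLUSION,
 *       .queryCount = 64,
 *    };
 *    VkQueryPool pool;
 *    vkCreateQueryPool(device, &info, NULL, &pool);
 *
 * which lands here and allocates a 64-slot, 24-byte-per-slot BO.
 */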

void genX(DestroyQueryPool)(
    VkDevice                                    _device,
    VkQueryPool                                 _pool,
    const VkAllocationCallbacks*                pAllocator)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_query_pool, pool, _pool);

   if (!pool)
      return;

   anv_gem_munmap(pool->bo.map, pool->bo.size);
   anv_gem_close(device, pool->bo.gem_handle);
   vk_free2(&device->alloc, pAllocator, pool);
}

static void
cpu_write_query_result(void *dst_slot, VkQueryResultFlags flags,
                       uint32_t value_index, uint64_t result)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      uint64_t *dst64 = dst_slot;
      dst64[value_index] = result;
   } else {
      uint32_t *dst32 = dst_slot;
      dst32[value_index] = result;
   }
}

VkResult genX(GetQueryPoolResults)(
    VkDevice                                    _device,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    size_t                                      dataSize,
    void*                                       pData,
    VkDeviceSize                                stride,
    VkQueryResultFlags                          flags)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   int64_t timeout = INT64_MAX;
   int ret;

   assert(pool->type == VK_QUERY_TYPE_OCCLUSION ||
          pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
          pool->type == VK_QUERY_TYPE_TIMESTAMP);

   if (unlikely(device->lost))
      return VK_ERROR_DEVICE_LOST;

   if (pData == NULL)
      return VK_SUCCESS;

   if (flags & VK_QUERY_RESULT_WAIT_BIT) {
      ret = anv_gem_wait(device, pool->bo.gem_handle, &timeout);
      if (ret == -1) {
         /* We don't know the real error. */
         return vk_errorf(VK_ERROR_OUT_OF_DEVICE_MEMORY,
                          "gem_wait failed %m");
      }
   }

   void *data_end = pData + dataSize;

   if (!device->info.has_llc) {
      uint64_t offset = firstQuery * pool->stride;
      uint64_t size = queryCount * pool->stride;
      anv_invalidate_range(pool->bo.map + offset,
                           MIN2(size, pool->bo.size - offset));
   }

   VkResult status = VK_SUCCESS;
   for (uint32_t i = 0; i < queryCount; i++) {
      uint64_t *slot = pool->bo.map + (firstQuery + i) * pool->stride;

      /* Availability is always at the start of the slot */
      bool available = slot[0];

      /* From the Vulkan 1.0.42 spec:
       *
       *    "If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
       *    both not set then no result values are written to pData for
       *    queries that are in the unavailable state at the time of the call,
       *    and vkGetQueryPoolResults returns VK_NOT_READY. However,
       *    availability state is still written to pData for those queries if
       *    VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set."
       */
      bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT);

      if (write_results) {
         switch (pool->type) {
         case VK_QUERY_TYPE_OCCLUSION: {
            cpu_write_query_result(pData, flags, 0, slot[2] - slot[1]);
            break;
         }

         case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
            uint32_t statistics = pool->pipeline_statistics;
            uint32_t idx = 0;
            while (statistics) {
               uint32_t stat = u_bit_scan(&statistics);
               uint64_t result = slot[idx * 2 + 2] - slot[idx * 2 + 1];

               /* WaDividePSInvocationCountBy4:HSW,BDW */
               if ((device->info.gen == 8 || device->info.is_haswell) &&
                   (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT)
                  result >>= 2;

               cpu_write_query_result(pData, flags, idx, result);

               idx++;
            }
            assert(idx == _mesa_bitcount(pool->pipeline_statistics));
            break;
         }

         case VK_QUERY_TYPE_TIMESTAMP: {
            cpu_write_query_result(pData, flags, 0, slot[1]);
            break;
         }

         default:
            unreachable("invalid pool type");
         }
      } else {
         status = VK_NOT_READY;
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         uint32_t idx = (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) ?
                        _mesa_bitcount(pool->pipeline_statistics) : 1;
         cpu_write_query_result(pData, flags, idx, available);
      }

      pData += stride;
      if (pData >= data_end)
         break;
   }

   return status;
}
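
/* For example (illustrative): reading back one occlusion query with
 * VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WITH_AVAILABILITY_BIT writes two
 * uint64s per query (the result at value index 0, availability at index 1),
 * so the caller-provided stride must be at least 16 bytes.
 */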

static void
emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer,
                    struct anv_bo *bo, uint32_t offset)
{
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType  = DAT_PPGTT;
      pc.PostSyncOperation       = WritePSDepthCount;
      pc.DepthStallEnable        = true;
      pc.Address                 = (struct anv_address) { bo, offset };

      if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
         pc.CommandStreamerStallEnable = true;
   }
}

static void
emit_query_availability(struct anv_cmd_buffer *cmd_buffer,
                        struct anv_bo *bo, uint32_t offset)
{
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType  = DAT_PPGTT;
      pc.PostSyncOperation       = WriteImmediateData;
      pc.Address                 = (struct anv_address) { bo, offset };
      pc.ImmediateData           = 1;
   }
}

void genX(CmdResetQueryPool)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   for (uint32_t i = 0; i < queryCount; i++) {
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdm) {
         sdm.Address = (struct anv_address) {
            .bo = &pool->bo,
            .offset = (firstQuery + i) * pool->stride,
         };
         sdm.ImmediateData = 0;
      }
   }
}

static const uint32_t vk_pipeline_stat_to_reg[] = {
   GENX(IA_VERTICES_COUNT_num),
   GENX(IA_PRIMITIVES_COUNT_num),
   GENX(VS_INVOCATION_COUNT_num),
   GENX(GS_INVOCATION_COUNT_num),
   GENX(GS_PRIMITIVES_COUNT_num),
   GENX(CL_INVOCATION_COUNT_num),
   GENX(CL_PRIMITIVES_COUNT_num),
   GENX(PS_INVOCATION_COUNT_num),
   GENX(HS_INVOCATION_COUNT_num),
   GENX(DS_INVOCATION_COUNT_num),
   GENX(CS_INVOCATION_COUNT_num),
};
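
/* The table above is indexed by the bit position of the corresponding
 * VkQueryPipelineStatisticFlagBits flag: e.g. bit 0
 * (INPUT_ASSEMBLY_VERTICES) maps to IA_VERTICES_COUNT and bit 7
 * (FRAGMENT_SHADER_INVOCATIONS) maps to PS_INVOCATION_COUNT, which is why
 * the indices returned by u_bit_scan() below can be used directly.
 */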

static void
emit_pipeline_stat(struct anv_cmd_buffer *cmd_buffer, uint32_t stat,
                   struct anv_bo *bo, uint32_t offset)
{
   STATIC_ASSERT(ANV_PIPELINE_STATISTICS_MASK ==
                 (1 << ARRAY_SIZE(vk_pipeline_stat_to_reg)) - 1);

   assert(stat < ARRAY_SIZE(vk_pipeline_stat_to_reg));
   uint32_t reg = vk_pipeline_stat_to_reg[stat];

   /* Store both halves of the 64-bit counter register. */
   anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), lrm) {
      lrm.RegisterAddress  = reg,
      lrm.MemoryAddress    = (struct anv_address) { bo, offset };
   }
   anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), lrm) {
      lrm.RegisterAddress  = reg + 4,
      lrm.MemoryAddress    = (struct anv_address) { bo, offset + 4 };
   }
}

void genX(CmdBeginQuery)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query,
    VkQueryControlFlags                         flags)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   /* Workaround: When meta uses the pipeline with the VS disabled, it seems
    * that the pipelining of the depth write breaks. What we see is that
    * samples from the render pass clear leak into the first query
    * immediately after the clear. Doing a pipecontrol with a post-sync
    * operation and DepthStallEnable seems to work around the issue.
    */
   if (cmd_buffer->state.need_query_wa) {
      cmd_buffer->state.need_query_wa = false;
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.DepthCacheFlushEnable   = true;
         pc.DepthStallEnable        = true;
      }
   }

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_ps_depth_count(cmd_buffer, &pool->bo, query * pool->stride + 8);
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      /* TODO: This might only be necessary for certain stats */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard     = true;
      }

      uint32_t statistics = pool->pipeline_statistics;
      uint32_t offset = query * pool->stride + 8;
      while (statistics) {
         uint32_t stat = u_bit_scan(&statistics);
         emit_pipeline_stat(cmd_buffer, stat, &pool->bo, offset);
         offset += 16;
      }
      break;
   }

   default:
      unreachable("");
   }
}

void genX(CmdEndQuery)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_ps_depth_count(cmd_buffer, &pool->bo, query * pool->stride + 16);
      emit_query_availability(cmd_buffer, &pool->bo, query * pool->stride);
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      /* TODO: This might only be necessary for certain stats */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard     = true;
      }

      uint32_t statistics = pool->pipeline_statistics;
      uint32_t offset = query * pool->stride + 16;
      while (statistics) {
         uint32_t stat = u_bit_scan(&statistics);
         emit_pipeline_stat(cmd_buffer, stat, &pool->bo, offset);
         offset += 16;
      }

      emit_query_availability(cmd_buffer, &pool->bo, query * pool->stride);
      break;
   }

   default:
      unreachable("");
   }
}
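
/* Putting Begin/End together for the occlusion case (illustrative): the slot
 * layout is { available, begin, end }, so vkCmdBeginQuery writes the depth
 * count at slot offset +8 (begin), vkCmdEndQuery writes it at +16 (end) and
 * then sets the availability word at +0, and vkGetQueryPoolResults reports
 * slot[2] - slot[1].
 */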

#define TIMESTAMP 0x2358

void genX(CmdWriteTimestamp)(
    VkCommandBuffer                             commandBuffer,
    VkPipelineStageFlagBits                     pipelineStage,
    VkQueryPool                                 queryPool,
    uint32_t                                    query)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   uint32_t offset = query * pool->stride;

   assert(pool->type == VK_QUERY_TYPE_TIMESTAMP);

   switch (pipelineStage) {
   case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
         srm.RegisterAddress  = TIMESTAMP;
         srm.MemoryAddress    = (struct anv_address) { &pool->bo, offset + 8 };
      }
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
         srm.RegisterAddress  = TIMESTAMP + 4;
         srm.MemoryAddress    = (struct anv_address) { &pool->bo, offset + 12 };
      }
      break;

   default:
      /* Everything else is bottom-of-pipe */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.DestinationAddressType  = DAT_PPGTT;
         pc.PostSyncOperation       = WriteTimestamp;
         pc.Address                 = (struct anv_address) { &pool->bo, offset + 8 };

         if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
            pc.CommandStreamerStallEnable = true;
      }
      break;
   }

   emit_query_availability(cmd_buffer, &pool->bo, offset);
}

#if GEN_GEN > 7 || GEN_IS_HASWELL

#define alu_opcode(v)   __gen_uint((v), 20, 31)
#define alu_operand1(v) __gen_uint((v), 10, 19)
#define alu_operand2(v) __gen_uint((v),  0,  9)
#define alu(opcode, operand1, operand2) \
   alu_opcode(opcode) | alu_operand1(operand1) | alu_operand2(operand2)

#define OPCODE_NOOP      0x000
#define OPCODE_LOAD      0x080
#define OPCODE_LOADINV   0x480
#define OPCODE_LOAD0     0x081
#define OPCODE_LOAD1     0x481
#define OPCODE_ADD       0x100
#define OPCODE_SUB       0x101
#define OPCODE_AND       0x102
#define OPCODE_OR        0x103
#define OPCODE_XOR       0x104
#define OPCODE_STORE     0x180
#define OPCODE_STOREINV  0x580

#define OPERAND_R0   0x00
#define OPERAND_R1   0x01
#define OPERAND_R2   0x02
#define OPERAND_R3   0x03
#define OPERAND_R4   0x04
#define OPERAND_SRCA 0x20
#define OPERAND_SRCB 0x21
#define OPERAND_ACCU 0x31
#define OPERAND_ZF   0x32
#define OPERAND_CF   0x33

#define CS_GPR(n) (0x2600 + (n) * 8)
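
/* Example encoding (illustrative): each MI_MATH ALU instruction is a single
 * DWord with the opcode in bits 31:20 and the two operands in bits 19:10 and
 * 9:0, so
 *
 *    alu(OPCODE_LOAD, OPERAND_SRCA, OPERAND_R0)
 *       == (0x080 << 20) | (0x20 << 10) | 0x00 == 0x08008000
 *
 * i.e. "load ALU source A from CS GPR0".
 */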

static void
emit_load_alu_reg_u64(struct anv_batch *batch, uint32_t reg,
                      struct anv_bo *bo, uint32_t offset)
{
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
      lrm.RegisterAddress  = reg,
      lrm.MemoryAddress    = (struct anv_address) { bo, offset };
   }
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
      lrm.RegisterAddress  = reg + 4;
      lrm.MemoryAddress    = (struct anv_address) { bo, offset + 4 };
   }
}

static void
emit_load_alu_reg_imm32(struct anv_batch *batch, uint32_t reg, uint32_t imm)
{
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
      lri.RegisterOffset   = reg;
      lri.DataDWord        = imm;
   }
}

static void
emit_load_alu_reg_imm64(struct anv_batch *batch, uint32_t reg, uint64_t imm)
{
   emit_load_alu_reg_imm32(batch, reg, (uint32_t)imm);
   emit_load_alu_reg_imm32(batch, reg + 4, (uint32_t)(imm >> 32));
}

static void
emit_load_alu_reg_reg32(struct anv_batch *batch, uint32_t src, uint32_t dst)
{
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_REG), lrr) {
      lrr.SourceRegisterAddress      = src;
      lrr.DestinationRegisterAddress = dst;
   }
}

/*
 * GPR0 = GPR0 & ((1ull << n) - 1);
 */
static void
keep_gpr0_lower_n_bits(struct anv_batch *batch, uint32_t n)
{
   /* Load the mask into GPR1, then AND it with GPR0 in the ALU. */
   emit_load_alu_reg_imm64(batch, CS_GPR(1), (1ull << n) - 1);

   uint32_t *dw = anv_batch_emitn(batch, 5, GENX(MI_MATH));
   dw[1] = alu(OPCODE_LOAD, OPERAND_SRCA, OPERAND_R0);
   dw[2] = alu(OPCODE_LOAD, OPERAND_SRCB, OPERAND_R1);
   dw[3] = alu(OPCODE_AND, 0, 0);
   dw[4] = alu(OPCODE_STORE, OPERAND_R0, OPERAND_ACCU);
}

/*
 * GPR0 = GPR0 << 30;
 */
static void
shl_gpr0_by_30_bits(struct anv_batch *batch)
{
   /* First we mask 34 bits of GPR0 to prevent overflow */
   keep_gpr0_lower_n_bits(batch, 34);

   const uint32_t outer_count = 5;
   const uint32_t inner_count = 6;
   STATIC_ASSERT(outer_count * inner_count == 30);
   const uint32_t cmd_len = 1 + inner_count * 4;

   /* We'll emit 5 commands, each shifting GPR0 left by 6 bits, for a total
    * of 30 left shifts.
    */
   for (int o = 0; o < outer_count; o++) {
      /* Submit one MI_MATH to shift left by 6 bits */
      uint32_t *dw = anv_batch_emitn(batch, cmd_len, GENX(MI_MATH));
      dw++;   /* Skip over the MI_MATH header DWord. */
      for (int i = 0; i < inner_count; i++, dw += 4) {
         dw[0] = alu(OPCODE_LOAD, OPERAND_SRCA, OPERAND_R0);
         dw[1] = alu(OPCODE_LOAD, OPERAND_SRCB, OPERAND_R0);
         dw[2] = alu(OPCODE_ADD, 0, 0);
         dw[3] = alu(OPCODE_STORE, OPERAND_R0, OPERAND_ACCU);
      }
   }
}
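
/* The ALU has no shifter, so the shift above is built from adds
 * (illustrative): each LOAD/LOAD/ADD/STORE quartet computes
 * GPR0 = GPR0 + GPR0, i.e. one left shift; 6 quartets per MI_MATH times 5
 * MI_MATH packets gives GPR0 << 30. Masking to 34 bits first keeps the
 * final value within 64 bits (34 + 30 == 64), so nothing is lost to
 * overflow.
 */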

/*
 * GPR0 = GPR0 >> 2;
 *
 * Note that the upper 30 bits of GPR0 are lost!
 */
static void
shr_gpr0_by_2_bits(struct anv_batch *batch)
{
   shl_gpr0_by_30_bits(batch);
   emit_load_alu_reg_reg32(batch, CS_GPR(0) + 4, CS_GPR(0));
   emit_load_alu_reg_imm32(batch, CS_GPR(0) + 4, 0);
}
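
/* This relies on the identity (x << 30) >> 32 == x >> 2 (illustrative): the
 * shift right by 32 costs nothing because it is just a register move,
 * copying the upper DWord of GPR0 into the lower DWord and then zeroing the
 * upper DWord.
 */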

static void
gpu_write_query_result(struct anv_batch *batch,
                       struct anv_buffer *dst_buffer, uint32_t dst_offset,
                       VkQueryResultFlags flags,
                       uint32_t value_index, uint32_t reg)
{
   if (flags & VK_QUERY_RESULT_64_BIT)
      dst_offset += value_index * 8;
   else
      dst_offset += value_index * 4;

   anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
      srm.RegisterAddress = reg;
      srm.MemoryAddress = (struct anv_address) {
         .bo = dst_buffer->bo,
         .offset = dst_buffer->offset + dst_offset,
      };
   }

   if (flags & VK_QUERY_RESULT_64_BIT) {
      anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
         srm.RegisterAddress = reg + 4;
         srm.MemoryAddress = (struct anv_address) {
            .bo = dst_buffer->bo,
            .offset = dst_buffer->offset + dst_offset + 4,
         };
      }
   }
}

static void
compute_query_result(struct anv_batch *batch, uint32_t dst_reg,
                     struct anv_bo *bo, uint32_t offset)
{
   /* Load the begin value into GPR0 and the end value into GPR1. */
   emit_load_alu_reg_u64(batch, CS_GPR(0), bo, offset);
   emit_load_alu_reg_u64(batch, CS_GPR(1), bo, offset + 8);

   /* FIXME: We need to clamp the result for 32 bit. */
   uint32_t *dw = anv_batch_emitn(batch, 5, GENX(MI_MATH));
   if (!dw) {
      anv_batch_set_error(batch, VK_ERROR_OUT_OF_HOST_MEMORY);
      return;
   }

   /* dst_reg = end (GPR1) - begin (GPR0) */
   dw[1] = alu(OPCODE_LOAD, OPERAND_SRCA, OPERAND_R1);
   dw[2] = alu(OPCODE_LOAD, OPERAND_SRCB, OPERAND_R0);
   dw[3] = alu(OPCODE_SUB, 0, 0);
   dw[4] = alu(OPCODE_STORE, dst_reg, OPERAND_ACCU);
}

void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    VkBuffer                                    destBuffer,
    VkDeviceSize                                destOffset,
    VkDeviceSize                                destStride,
    VkQueryResultFlags                          flags)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);
   uint32_t slot_offset;

   if (flags & VK_QUERY_RESULT_WAIT_BIT) {
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard     = true;
      }
   }

   for (uint32_t i = 0; i < queryCount; i++) {
      slot_offset = (firstQuery + i) * pool->stride;
      switch (pool->type) {
      case VK_QUERY_TYPE_OCCLUSION:
         compute_query_result(&cmd_buffer->batch, OPERAND_R2,
                              &pool->bo, slot_offset + 8);
         gpu_write_query_result(&cmd_buffer->batch, buffer, destOffset,
                                flags, 0, CS_GPR(2));
         break;

      case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
         uint32_t statistics = pool->pipeline_statistics;
         uint32_t idx = 0;
         while (statistics) {
            uint32_t stat = u_bit_scan(&statistics);

            compute_query_result(&cmd_buffer->batch, OPERAND_R0,
                                 &pool->bo, slot_offset + idx * 16 + 8);

            /* WaDividePSInvocationCountBy4:HSW,BDW */
            if ((cmd_buffer->device->info.gen == 8 ||
                 cmd_buffer->device->info.is_haswell) &&
                (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) {
               shr_gpr0_by_2_bits(&cmd_buffer->batch);
            }

            gpu_write_query_result(&cmd_buffer->batch, buffer, destOffset,
                                   flags, idx, CS_GPR(0));

            idx++;
         }
         assert(idx == _mesa_bitcount(pool->pipeline_statistics));
         break;
      }

      case VK_QUERY_TYPE_TIMESTAMP:
         emit_load_alu_reg_u64(&cmd_buffer->batch,
                               CS_GPR(2), &pool->bo, slot_offset + 8);
         gpu_write_query_result(&cmd_buffer->batch, buffer, destOffset,
                                flags, 0, CS_GPR(2));
         break;

      default:
         unreachable("unhandled query type");
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         uint32_t idx = (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) ?
                        _mesa_bitcount(pool->pipeline_statistics) : 1;

         emit_load_alu_reg_u64(&cmd_buffer->batch, CS_GPR(0),
                               &pool->bo, slot_offset);
         gpu_write_query_result(&cmd_buffer->batch, buffer, destOffset,
                                flags, idx, CS_GPR(0));
      }

      destOffset += destStride;
   }
}

#else

void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    VkBuffer                                    destBuffer,
    VkDeviceSize                                destOffset,
    VkDeviceSize                                destStride,
    VkQueryResultFlags                          flags)
{
   anv_finishme("Queries not yet supported on Ivy Bridge");
}

#endif