/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "anv_private.h"

#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"

VkResult genX(CreateQueryPool)(
    VkDevice                                    _device,
    const VkQueryPoolCreateInfo*                pCreateInfo,
    const VkAllocationCallbacks*                pAllocator,
    VkQueryPool*                                pQueryPool)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   const struct anv_physical_device *pdevice = &device->instance->physicalDevice;
   struct anv_query_pool *pool;
   VkResult result;

   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);

   /* Query pool slots are made up of some number of 64-bit values packed
    * tightly together. The first 64-bit value is always the "available" bit
    * which is 0 when the query is unavailable and 1 when it is available.
    * The 64-bit values that follow are determined by the type of query.
    */
   uint32_t uint64s_per_slot = 1;

   VkQueryPipelineStatisticFlags pipeline_statistics = 0;
   switch (pCreateInfo->queryType) {
   case VK_QUERY_TYPE_OCCLUSION:
      /* Occlusion queries have two values: begin and end. */
      uint64s_per_slot += 2;
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
      /* Timestamps just have the one timestamp value */
      uint64s_per_slot += 1;
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      pipeline_statistics = pCreateInfo->pipelineStatistics;
      /* We're going to trust this field implicitly so we need to ensure that
       * no unhandled extension bits leak in.
       */
      pipeline_statistics &= ANV_PIPELINE_STATISTICS_MASK;

      /* Statistics queries have a min and max for every statistic */
      uint64s_per_slot += 2 * _mesa_bitcount(pipeline_statistics);
      break;
   default:
      assert(!"Invalid query type");
   }

   pool = vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
                    VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (pool == NULL)
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

   pool->type = pCreateInfo->queryType;
   pool->pipeline_statistics = pipeline_statistics;
   pool->stride = uint64s_per_slot * sizeof(uint64_t);
   pool->slots = pCreateInfo->queryCount;

   uint64_t size = pool->slots * pool->stride;
   result = anv_bo_init_new(&pool->bo, device, size);
   if (result != VK_SUCCESS)
      goto fail;

   if (pdevice->supports_48bit_addresses)
      pool->bo.flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS;

   if (pdevice->use_softpin)
      pool->bo.flags |= EXEC_OBJECT_PINNED;

   if (pdevice->has_exec_async)
      pool->bo.flags |= EXEC_OBJECT_ASYNC;

   anv_vma_alloc(device, &pool->bo);

   /* For query pools, we set the caching mode to I915_CACHING_CACHED. On LLC
    * platforms, this does nothing. On non-LLC platforms, this means snooping
    * which comes at a slight cost. However, the buffers aren't big, won't be
    * written frequently, and trying to handle the flushing manually without
    * doing too much flushing is extremely painful.
    */
   anv_gem_set_caching(device, pool->bo.gem_handle, I915_CACHING_CACHED);

   pool->bo.map = anv_gem_mmap(device, pool->bo.gem_handle, 0, size, 0);

   *pQueryPool = anv_query_pool_to_handle(pool);

   return VK_SUCCESS;

 fail:
   vk_free2(&device->alloc, pAllocator, pool);

   return result;
}

void genX(DestroyQueryPool)(
    VkDevice                                    _device,
    VkQueryPool                                 _pool,
    const VkAllocationCallbacks*                pAllocator)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_query_pool, pool, _pool);

   if (!pool)
      return;

   anv_gem_munmap(pool->bo.map, pool->bo.size);
   anv_vma_free(device, &pool->bo);
   anv_gem_close(device, pool->bo.gem_handle);
   vk_free2(&device->alloc, pAllocator, pool);
}

static void
cpu_write_query_result(void *dst_slot, VkQueryResultFlags flags,
                       uint32_t value_index, uint64_t result)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      uint64_t *dst64 = dst_slot;
      dst64[value_index] = result;
   } else {
      uint32_t *dst32 = dst_slot;
      dst32[value_index] = result;
   }
}
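
/* Availability values are written by the GPU while the CPU polls them, so
 * query_is_available() below reads through a volatile pointer to force a
 * fresh load from the coherent mapping on every poll.
 */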

static bool
query_is_available(uint64_t *slot)
{
   return *(volatile uint64_t *)slot;
}

static VkResult
wait_for_available(struct anv_device *device,
                   struct anv_query_pool *pool, uint64_t *slot)
{
   while (true) {
      if (query_is_available(slot))
         return VK_SUCCESS;

      int ret = anv_gem_busy(device, pool->bo.gem_handle);
      if (ret == 1) {
         /* The BO is still busy, keep waiting. */
         continue;
      } else if (ret == -1) {
         /* We don't know the real error. */
         device->lost = true;
         return vk_errorf(device->instance, device, VK_ERROR_DEVICE_LOST,
                          "gem wait failed: %m");
      } else {
         assert(ret == 0);
         /* The BO is no longer busy. */
         if (query_is_available(slot)) {
            return VK_SUCCESS;
         } else {
            VkResult status = anv_device_query_status(device);
            if (status != VK_SUCCESS)
               return status;

            /* If we haven't seen availability yet, then we never will. This
             * can only happen if we have a client error where they call
             * GetQueryPoolResults on a query that they haven't submitted to
             * the GPU yet. The spec allows us to do anything in this case,
             * but returning VK_SUCCESS doesn't seem right and we shouldn't
             * just keep spinning.
             */
            return VK_NOT_READY;
         }
      }
   }
}

VkResult genX(GetQueryPoolResults)(
    VkDevice                                    _device,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    size_t                                      dataSize,
    void*                                       pData,
    VkDeviceSize                                stride,
    VkQueryResultFlags                          flags)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   assert(pool->type == VK_QUERY_TYPE_OCCLUSION ||
          pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
          pool->type == VK_QUERY_TYPE_TIMESTAMP);

   if (unlikely(device->lost))
      return VK_ERROR_DEVICE_LOST;

   if (pData == NULL)
      return VK_SUCCESS;

   void *data_end = pData + dataSize;

   VkResult status = VK_SUCCESS;
   for (uint32_t i = 0; i < queryCount; i++) {
      uint64_t *slot = pool->bo.map + (firstQuery + i) * pool->stride;

      /* Availability is always at the start of the slot */
      bool available = slot[0];

      if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) {
         status = wait_for_available(device, pool, slot);
         if (status != VK_SUCCESS)
            return status;

         available = true;
      }

      /* From the Vulkan 1.0.42 spec:
       *
       *    "If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
       *    both not set then no result values are written to pData for
       *    queries that are in the unavailable state at the time of the call,
       *    and vkGetQueryPoolResults returns VK_NOT_READY. However,
       *    availability state is still written to pData for those queries if
       *    VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set."
       */
      bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT);

      if (write_results) {
         switch (pool->type) {
         case VK_QUERY_TYPE_OCCLUSION: {
            cpu_write_query_result(pData, flags, 0, slot[2] - slot[1]);
            break;
         }

         case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
            uint32_t statistics = pool->pipeline_statistics;
            uint32_t idx = 0;
            while (statistics) {
               uint32_t stat = u_bit_scan(&statistics);
               uint64_t result = slot[idx * 2 + 2] - slot[idx * 2 + 1];

               /* WaDividePSInvocationCountBy4:HSW,BDW */
               if ((device->info.gen == 8 || device->info.is_haswell) &&
                   (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT)
                  result >>= 2;

               cpu_write_query_result(pData, flags, idx, result);

               idx++;
            }
            assert(idx == _mesa_bitcount(pool->pipeline_statistics));
            break;
         }

         case VK_QUERY_TYPE_TIMESTAMP: {
            cpu_write_query_result(pData, flags, 0, slot[1]);
            break;
         }
         default:
            unreachable("invalid pool type");
         }
      } else {
         status = VK_NOT_READY;
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         uint32_t idx = (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) ?
                        _mesa_bitcount(pool->pipeline_statistics) : 1;
         cpu_write_query_result(pData, flags, idx, available);
      }

      pData += stride;
      if (pData >= data_end)
         break;
   }

   return status;
}
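
/* Writes the accumulated PS depth count (samples passed) to bo+offset with a
 * PIPE_CONTROL post-sync write. The depth stall orders the write behind prior
 * depth work, so a begin/end pair of these writes brackets exactly the draws
 * recorded between them; the GEN9 GT4 command streamer stall appears to
 * mirror the one applied to the timestamp path later in this file.
 */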

static void
emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer,
                    struct anv_bo *bo, uint32_t offset)
{
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType  = DAT_PPGTT;
      pc.PostSyncOperation       = WritePSDepthCount;
      pc.DepthStallEnable        = true;
      pc.Address                 = (struct anv_address) { bo, offset };

      if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
         pc.CommandStreamerStallEnable = true;
   }
}
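
/* Marks a query slot available by writing 1 to its first qword. Using a
 * PIPE_CONTROL post-sync write rather than a plain MI_STORE_DATA_IMM is
 * presumably what keeps the availability write ordered behind the pipeline
 * work that produces the query values themselves.
 */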

static void
emit_query_availability(struct anv_cmd_buffer *cmd_buffer,
                        struct anv_bo *bo, uint32_t offset)
{
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType  = DAT_PPGTT;
      pc.PostSyncOperation       = WriteImmediateData;
      pc.Address                 = (struct anv_address) { bo, offset };
      pc.ImmediateData           = 1;
   }
}

/**
 * Goes through a series of consecutive query indices in the given pool
 * setting all element values to 0 and emitting them as available.
 */
static void
emit_zero_queries(struct anv_cmd_buffer *cmd_buffer,
                  struct anv_query_pool *pool,
                  uint32_t first_index, uint32_t num_queries)
{
   const uint32_t num_elements = pool->stride / sizeof(uint64_t);

   for (uint32_t i = 0; i < num_queries; i++) {
      uint32_t slot_offset = (first_index + i) * pool->stride;
      for (uint32_t j = 1; j < num_elements; j++) {
         anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
            sdi.Address.bo = &pool->bo;
            sdi.Address.offset = slot_offset + j * sizeof(uint64_t);
            sdi.ImmediateData = 0ull;
         }
      }
      emit_query_availability(cmd_buffer, &pool->bo, slot_offset);
   }
}

void genX(CmdResetQueryPool)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   for (uint32_t i = 0; i < queryCount; i++) {
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdm) {
         sdm.Address = (struct anv_address) {
            .bo = &pool->bo,
            .offset = (firstQuery + i) * pool->stride,
         };
         sdm.ImmediateData = 0;
      }
   }
}

static const uint32_t vk_pipeline_stat_to_reg[] = {
   GENX(IA_VERTICES_COUNT_num),
   GENX(IA_PRIMITIVES_COUNT_num),
   GENX(VS_INVOCATION_COUNT_num),
   GENX(GS_INVOCATION_COUNT_num),
   GENX(GS_PRIMITIVES_COUNT_num),
   GENX(CL_INVOCATION_COUNT_num),
   GENX(CL_PRIMITIVES_COUNT_num),
   GENX(PS_INVOCATION_COUNT_num),
   GENX(HS_INVOCATION_COUNT_num),
   GENX(DS_INVOCATION_COUNT_num),
   GENX(CS_INVOCATION_COUNT_num),
};
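
/* The table above is indexed by the bit position of each
 * VkQueryPipelineStatisticFlagBits value, which is what lets u_bit_scan()
 * positions be used directly as indices; the STATIC_ASSERT below ties the
 * supported statistics mask to the table size.
 */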

static void
emit_pipeline_stat(struct anv_cmd_buffer *cmd_buffer, uint32_t stat,
                   struct anv_bo *bo, uint32_t offset)
{
   STATIC_ASSERT(ANV_PIPELINE_STATISTICS_MASK ==
                 (1 << ARRAY_SIZE(vk_pipeline_stat_to_reg)) - 1);

   assert(stat < ARRAY_SIZE(vk_pipeline_stat_to_reg));
   uint32_t reg = vk_pipeline_stat_to_reg[stat];

   anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), lrm) {
      lrm.RegisterAddress  = reg,
      lrm.MemoryAddress    = (struct anv_address) { bo, offset };
   }
   anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), lrm) {
      lrm.RegisterAddress  = reg + 4,
      lrm.MemoryAddress    = (struct anv_address) { bo, offset + 4 };
   }
}

void genX(CmdBeginQuery)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query,
    VkQueryControlFlags                         flags)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_ps_depth_count(cmd_buffer, &pool->bo, query * pool->stride + 8);
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      /* TODO: This might only be necessary for certain stats */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      uint32_t statistics = pool->pipeline_statistics;
      uint32_t offset = query * pool->stride + 8;
      while (statistics) {
         uint32_t stat = u_bit_scan(&statistics);
         emit_pipeline_stat(cmd_buffer, stat, &pool->bo, offset);
         offset += 16;
      }
      break;
   }

   default:
      unreachable("");
   }
}
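
/* Note the "+ 8" in the offsets above: result values start after the
 * availability qword, so begin values land at slot offset 8 and, for
 * pipeline statistics, each statistic occupies a 16-byte (begin, end) pair.
 * CmdEndQuery below writes the matching end values at "+ 16".
 */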

void genX(CmdEndQuery)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_ps_depth_count(cmd_buffer, &pool->bo, query * pool->stride + 16);
      emit_query_availability(cmd_buffer, &pool->bo, query * pool->stride);
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      /* TODO: This might only be necessary for certain stats */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      uint32_t statistics = pool->pipeline_statistics;
      uint32_t offset = query * pool->stride + 16;
      while (statistics) {
         uint32_t stat = u_bit_scan(&statistics);
         emit_pipeline_stat(cmd_buffer, stat, &pool->bo, offset);
         offset += 16;
      }

      emit_query_availability(cmd_buffer, &pool->bo, query * pool->stride);
      break;
   }

   default:
      unreachable("");
   }

   /* When multiview is active the spec requires that N consecutive query
    * indices are used, where N is the number of active views in the subpass.
    * The spec allows that we only write the results to one of the queries
    * but we still need to manage result availability for all the query indices.
    * Since we only emit a single query for all active views in the
    * first index, mark the other query indices as being already available
    * with result 0.
    */
   if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
      const uint32_t num_queries =
         _mesa_bitcount(cmd_buffer->state.subpass->view_mask);
      if (num_queries > 1)
         emit_zero_queries(cmd_buffer, pool, query + 1, num_queries - 1);
   }
}

#define TIMESTAMP 0x2358
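
/* 0x2358 is the render command streamer's TIMESTAMP register. Reading it with
 * MI_STORE_REGISTER_MEM yields the top-of-pipe timestamp used below, while
 * the PIPE_CONTROL WriteTimestamp path yields a bottom-of-pipe one.
 */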

void genX(CmdWriteTimestamp)(
    VkCommandBuffer                             commandBuffer,
    VkPipelineStageFlagBits                     pipelineStage,
    VkQueryPool                                 queryPool,
    uint32_t                                    query)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   uint32_t offset = query * pool->stride;

   assert(pool->type == VK_QUERY_TYPE_TIMESTAMP);

   switch (pipelineStage) {
   case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
         srm.RegisterAddress  = TIMESTAMP;
         srm.MemoryAddress    = (struct anv_address) { &pool->bo, offset + 8 };
      }
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
         srm.RegisterAddress  = TIMESTAMP + 4;
         srm.MemoryAddress    = (struct anv_address) { &pool->bo, offset + 12 };
      }
      break;

   default:
      /* Everything else is bottom-of-pipe */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.DestinationAddressType  = DAT_PPGTT;
         pc.PostSyncOperation       = WriteTimestamp;
         pc.Address                 = (struct anv_address) { &pool->bo, offset + 8 };

         if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
            pc.CommandStreamerStallEnable = true;
      }
      break;
   }

   emit_query_availability(cmd_buffer, &pool->bo, offset);

   /* When multiview is active the spec requires that N consecutive query
    * indices are used, where N is the number of active views in the subpass.
    * The spec allows that we only write the results to one of the queries
    * but we still need to manage result availability for all the query indices.
    * Since we only emit a single query for all active views in the
    * first index, mark the other query indices as being already available
    * with result 0.
    */
   if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
      const uint32_t num_queries =
         _mesa_bitcount(cmd_buffer->state.subpass->view_mask);
      if (num_queries > 1)
         emit_zero_queries(cmd_buffer, pool, query + 1, num_queries - 1);
   }
}

#if GEN_GEN > 7 || GEN_IS_HASWELL

static uint32_t
mi_alu(uint32_t opcode, uint32_t operand1, uint32_t operand2)
{
   struct GENX(MI_MATH_ALU_INSTRUCTION) instr = {
      .ALUOpcode = opcode,
      .Operand1 = operand1,
      .Operand2 = operand2,
   };

   uint32_t dw;
   GENX(MI_MATH_ALU_INSTRUCTION_pack)(NULL, &dw, &instr);

   return dw;
}

#define CS_GPR(n) (0x2600 + (n) * 8)
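
/* MI_MATH operates on the command streamer's sixteen 64-bit general-purpose
 * registers, which live at MMIO offset 0x2600 on the render engine.
 * CS_GPR(n) names the lower dword of GPR n; CS_GPR(n) + 4 addresses the
 * upper dword.
 */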

static void
emit_load_alu_reg_u64(struct anv_batch *batch, uint32_t reg,
                      struct anv_bo *bo, uint32_t offset)
{
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
      lrm.RegisterAddress  = reg,
      lrm.MemoryAddress    = (struct anv_address) { bo, offset };
   }
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
      lrm.RegisterAddress  = reg + 4;
      lrm.MemoryAddress    = (struct anv_address) { bo, offset + 4 };
   }
}

static void
emit_load_alu_reg_imm32(struct anv_batch *batch, uint32_t reg, uint32_t imm)
{
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
      lri.RegisterOffset   = reg;
      lri.DataDWord        = imm;
   }
}

static void
emit_load_alu_reg_imm64(struct anv_batch *batch, uint32_t reg, uint64_t imm)
{
   emit_load_alu_reg_imm32(batch, reg, (uint32_t)imm);
   emit_load_alu_reg_imm32(batch, reg + 4, (uint32_t)(imm >> 32));
}

static void
emit_load_alu_reg_reg32(struct anv_batch *batch, uint32_t src, uint32_t dst)
{
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_REG), lrr) {
      lrr.SourceRegisterAddress      = src;
      lrr.DestinationRegisterAddress = dst;
   }
}

/*
 * GPR0 = GPR0 & ((1ull << n) - 1);
 */
static void
keep_gpr0_lower_n_bits(struct anv_batch *batch, uint32_t n)
{
   /* Load the mask into GPR1 so MI_MATH can AND it with GPR0. */
   emit_load_alu_reg_imm64(batch, CS_GPR(1), (1ull << n) - 1);

   uint32_t *dw = anv_batch_emitn(batch, 5, GENX(MI_MATH));
   if (!dw) {
      anv_batch_set_error(batch, VK_ERROR_OUT_OF_HOST_MEMORY);
      return;
   }

   dw[1] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCA, MI_ALU_REG0);
   dw[2] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCB, MI_ALU_REG1);
   dw[3] = mi_alu(MI_ALU_AND, 0, 0);
   dw[4] = mi_alu(MI_ALU_STORE, MI_ALU_REG0, MI_ALU_ACCU);
}

/*
 * GPR0 = GPR0 << 30;
 */
static void
shl_gpr0_by_30_bits(struct anv_batch *batch)
{
   /* First we mask 34 bits of GPR0 to prevent overflow */
   keep_gpr0_lower_n_bits(batch, 34);

   const uint32_t outer_count = 5;
   const uint32_t inner_count = 6;
   STATIC_ASSERT(outer_count * inner_count == 30);
   const uint32_t cmd_len = 1 + inner_count * 4;

   /* We'll emit 5 commands, each shifting GPR0 left by 6 bits, for a total of
    * 30 left shifts.
    */
   for (int o = 0; o < outer_count; o++) {
      /* Submit one MI_MATH to shift left by 6 bits */
      uint32_t *dw = anv_batch_emitn(batch, cmd_len, GENX(MI_MATH));
      if (!dw) {
         anv_batch_set_error(batch, VK_ERROR_OUT_OF_HOST_MEMORY);
         return;
      }

      /* Skip the MI_MATH header dword, then emit six ADD-to-self sequences,
       * each of which doubles GPR0 (one left shift).
       */
      dw++;
      for (int i = 0; i < inner_count; i++, dw += 4) {
         dw[0] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCA, MI_ALU_REG0);
         dw[1] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCB, MI_ALU_REG0);
         dw[2] = mi_alu(MI_ALU_ADD, 0, 0);
         dw[3] = mi_alu(MI_ALU_STORE, MI_ALU_REG0, MI_ALU_ACCU);
      }
   }
}

/*
 * GPR0 = GPR0 >> 2;
 *
 * Note that the upper 30 bits of GPR are lost!
 */
static void
shr_gpr0_by_2_bits(struct anv_batch *batch)
{
   shl_gpr0_by_30_bits(batch);
   emit_load_alu_reg_reg32(batch, CS_GPR(0) + 4, CS_GPR(0));
   emit_load_alu_reg_imm32(batch, CS_GPR(0) + 4, 0);
}
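
/* MI_MATH on these gens has no shift opcode, so shifts are synthesized from
 * adds: each ADD-to-self in shl_gpr0_by_30_bits() doubles GPR0 (one left
 * shift), and moving the upper dword of GPR0 into the lower dword afterwards
 * is a free right shift by 32, for a net (x << 30) >> 32 == x >> 2.
 */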

static void
gpu_write_query_result(struct anv_batch *batch,
                       struct anv_buffer *dst_buffer, uint32_t dst_offset,
                       VkQueryResultFlags flags,
                       uint32_t value_index, uint32_t reg)
{
   if (flags & VK_QUERY_RESULT_64_BIT)
      dst_offset += value_index * 8;
   else
      dst_offset += value_index * 4;

   anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
      srm.RegisterAddress = reg;
      srm.MemoryAddress   = anv_address_add(dst_buffer->address, dst_offset);
   }

   if (flags & VK_QUERY_RESULT_64_BIT) {
      anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
         srm.RegisterAddress = reg + 4;
         srm.MemoryAddress   = anv_address_add(dst_buffer->address,
                                               dst_offset + 4);
      }
   }
}

static void
compute_query_result(struct anv_batch *batch, uint32_t dst_reg,
                     struct anv_bo *bo, uint32_t offset)
{
   emit_load_alu_reg_u64(batch, CS_GPR(0), bo, offset);
   emit_load_alu_reg_u64(batch, CS_GPR(1), bo, offset + 8);

   /* FIXME: We need to clamp the result for 32 bit. */
   uint32_t *dw = anv_batch_emitn(batch, 5, GENX(MI_MATH));
   if (!dw) {
      anv_batch_set_error(batch, VK_ERROR_OUT_OF_HOST_MEMORY);
      return;
   }

   dw[1] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCA, MI_ALU_REG1);
   dw[2] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCB, MI_ALU_REG0);
   dw[3] = mi_alu(MI_ALU_SUB, 0, 0);
   dw[4] = mi_alu(MI_ALU_STORE, dst_reg, MI_ALU_ACCU);
}
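
/* compute_query_result() leaves (end - begin) in dst_reg: GPR0 is loaded
 * with the begin value, GPR1 with the end value, and the MI_MATH above
 * computes SRCA - SRCB = REG1 - REG0.
 */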

void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    VkBuffer                                    destBuffer,
    VkDeviceSize                                destOffset,
    VkDeviceSize                                destStride,
    VkQueryResultFlags                          flags)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);
   uint32_t slot_offset;

   if (flags & VK_QUERY_RESULT_WAIT_BIT) {
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard     = true;
      }
   }

   for (uint32_t i = 0; i < queryCount; i++) {
      slot_offset = (firstQuery + i) * pool->stride;
      switch (pool->type) {
      case VK_QUERY_TYPE_OCCLUSION:
         compute_query_result(&cmd_buffer->batch, MI_ALU_REG2,
                              &pool->bo, slot_offset + 8);
         gpu_write_query_result(&cmd_buffer->batch, buffer, destOffset,
                                flags, 0, CS_GPR(2));
         break;

      case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
         uint32_t statistics = pool->pipeline_statistics;
         uint32_t idx = 0;
         while (statistics) {
            uint32_t stat = u_bit_scan(&statistics);

            compute_query_result(&cmd_buffer->batch, MI_ALU_REG0,
                                 &pool->bo, slot_offset + idx * 16 + 8);

            /* WaDividePSInvocationCountBy4:HSW,BDW */
            if ((cmd_buffer->device->info.gen == 8 ||
                 cmd_buffer->device->info.is_haswell) &&
                (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) {
               shr_gpr0_by_2_bits(&cmd_buffer->batch);
            }

            gpu_write_query_result(&cmd_buffer->batch, buffer, destOffset,
                                   flags, idx, CS_GPR(0));

            idx++;
         }
         assert(idx == _mesa_bitcount(pool->pipeline_statistics));
         break;
      }

      case VK_QUERY_TYPE_TIMESTAMP:
         emit_load_alu_reg_u64(&cmd_buffer->batch,
                               CS_GPR(2), &pool->bo, slot_offset + 8);
         gpu_write_query_result(&cmd_buffer->batch, buffer, destOffset,
                                flags, 0, CS_GPR(2));
         break;

      default:
         unreachable("unhandled query type");
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         uint32_t idx = (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) ?
                        _mesa_bitcount(pool->pipeline_statistics) : 1;

         emit_load_alu_reg_u64(&cmd_buffer->batch, CS_GPR(0),
                               &pool->bo, slot_offset);
         gpu_write_query_result(&cmd_buffer->batch, buffer, destOffset,
                                flags, idx, CS_GPR(0));
      }

      destOffset += destStride;
   }
}

#else

void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    VkBuffer                                    destBuffer,
    VkDeviceSize                                destOffset,
    VkDeviceSize                                destStride,
    VkQueryResultFlags                          flags)
{
   anv_finishme("Queries not yet supported on Ivy Bridge");
}

#endif