/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "anv_private.h"

#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"
VkResult genX(CreateQueryPool)(
    VkDevice                                    _device,
    const VkQueryPoolCreateInfo*                pCreateInfo,
    const VkAllocationCallbacks*                pAllocator,
    VkQueryPool*                                pQueryPool)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   const struct anv_physical_device *pdevice = &device->instance->physicalDevice;
   struct anv_query_pool *pool;
   VkResult result;

   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
   /* Query pool slots are made up of some number of 64-bit values packed
    * tightly together. The first 64-bit value is always the "available" bit
    * which is 0 when the query is unavailable and 1 when it is available.
    * The 64-bit values that follow are determined by the type of query.
    */
   uint32_t uint64s_per_slot = 1;

   VkQueryPipelineStatisticFlags pipeline_statistics = 0;
   switch (pCreateInfo->queryType) {
   case VK_QUERY_TYPE_OCCLUSION:
      /* Occlusion queries have two values: begin and end. */
      uint64s_per_slot += 2;
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
      /* Timestamps just have the one timestamp value */
      uint64s_per_slot += 1;
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      pipeline_statistics = pCreateInfo->pipelineStatistics;
      /* We're going to trust this field implicitly so we need to ensure that
       * no unhandled extension bits leak in.
       */
      pipeline_statistics &= ANV_PIPELINE_STATISTICS_MASK;

      /* Statistics queries have a min and max for every statistic */
      uint64s_per_slot += 2 * util_bitcount(pipeline_statistics);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      /* Transform feedback queries are 4 values, begin/end for
       * primitives written and primitives needed.
       */
      uint64s_per_slot += 4;
      break;
   default:
      assert(!"Invalid query type");
   }
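
   /* Illustrative sketch (added for clarity, not part of the original
    * source): the slot layouts implied by the code above and by the read
    * path in GetQueryPoolResults(), in 64-bit words from the slot start:
    *
    *    occlusion:  [0] available  [1] begin depth count  [2] end depth count
    *    timestamp:  [0] available  [1] timestamp
    *    statistics: [0] available  [1..2] begin/end of stat 0, [3..4] stat 1, ...
    *    xfb:        [0] available  [1..2] begin/end prims written
    *                               [3..4] begin/end prims needed
    *
    * An occlusion pool, for example, ends up with a 24-byte stride per query.
    */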
   pool = vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
                    VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (pool == NULL)
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
   pool->type = pCreateInfo->queryType;
   pool->pipeline_statistics = pipeline_statistics;
   pool->stride = uint64s_per_slot * sizeof(uint64_t);
   pool->slots = pCreateInfo->queryCount;
   uint64_t size = pool->slots * pool->stride;
   result = anv_bo_init_new(&pool->bo, device, size);
   if (result != VK_SUCCESS)
      goto fail;
   if (pdevice->supports_48bit_addresses)
      pool->bo.flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS;

   if (pdevice->use_softpin)
      pool->bo.flags |= EXEC_OBJECT_PINNED;

   if (pdevice->has_exec_async)
      pool->bo.flags |= EXEC_OBJECT_ASYNC;

   anv_vma_alloc(device, &pool->bo);
   /* For query pools, we set the caching mode to I915_CACHING_CACHED.  On LLC
    * platforms, this does nothing.  On non-LLC platforms, this means snooping
    * which comes at a slight cost.  However, the buffers aren't big, won't be
    * written frequently, and trying to handle the flushing manually without
    * doing too much flushing is extremely painful.
    */
   anv_gem_set_caching(device, pool->bo.gem_handle, I915_CACHING_CACHED);

   pool->bo.map = anv_gem_mmap(device, pool->bo.gem_handle, 0, size, 0);

   *pQueryPool = anv_query_pool_to_handle(pool);

   return VK_SUCCESS;

 fail:
   vk_free2(&device->alloc, pAllocator, pool);

   return result;
}
void genX(DestroyQueryPool)(
    VkDevice                                    _device,
    VkQueryPool                                 _pool,
    const VkAllocationCallbacks*                pAllocator)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_query_pool, pool, _pool);

   if (!pool)
      return;

   anv_gem_munmap(pool->bo.map, pool->bo.size);
   anv_vma_free(device, &pool->bo);
   anv_gem_close(device, pool->bo.gem_handle);
   vk_free2(&device->alloc, pAllocator, pool);
}
static struct anv_address
anv_query_address(struct anv_query_pool *pool, uint32_t query)
{
   return (struct anv_address) {
      .bo = &pool->bo,
      .offset = query * pool->stride,
   };
}
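
/* Usage sketch (added for clarity, not part of the original source): for an
 * occlusion pool, whose stride works out to 24 bytes, anv_query_address(pool, 3)
 * points at byte offset 72 in the pool BO, i.e. the availability word of
 * query 3; that query's begin/end depth counts follow at +8 and +16.
 */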
static void
cpu_write_query_result(void *dst_slot, VkQueryResultFlags flags,
                       uint32_t value_index, uint64_t result)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      uint64_t *dst64 = dst_slot;
      dst64[value_index] = result;
   } else {
      uint32_t *dst32 = dst_slot;
      dst32[value_index] = result;
   }
}
static bool
query_is_available(uint64_t *slot)
{
   return *(volatile uint64_t *)slot;
}
static VkResult
wait_for_available(struct anv_device *device,
                   struct anv_query_pool *pool, uint64_t *slot)
{
   while (true) {
      if (query_is_available(slot))
         return VK_SUCCESS;

      int ret = anv_gem_busy(device, pool->bo.gem_handle);
      if (ret == 1) {
         /* The BO is still busy, keep waiting. */
         continue;
      } else if (ret == -1) {
         /* We don't know the real error. */
         return anv_device_set_lost(device, "gem wait failed: %m");
      } else {
         assert(ret == 0);
         /* The BO is no longer busy. */
         if (query_is_available(slot)) {
            return VK_SUCCESS;
         } else {
            VkResult status = anv_device_query_status(device);
            if (status != VK_SUCCESS)
               return status;

            /* If we haven't seen availability yet, then we never will.  This
             * can only happen if we have a client error where they call
             * GetQueryPoolResults on a query that they haven't submitted to
             * the GPU yet.  The spec allows us to do anything in this case,
             * but returning VK_SUCCESS doesn't seem right and we shouldn't
             * just keep spinning.
             */
            return VK_NOT_READY;
         }
      }
   }
}
VkResult genX(GetQueryPoolResults)(
    VkDevice                                    _device,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    size_t                                      dataSize,
    void*                                       pData,
    VkDeviceSize                                stride,
    VkQueryResultFlags                          flags)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   assert(pool->type == VK_QUERY_TYPE_OCCLUSION ||
          pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
          pool->type == VK_QUERY_TYPE_TIMESTAMP ||
          pool->type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT);

   if (anv_device_is_lost(device))
      return VK_ERROR_DEVICE_LOST;

   if (pData == NULL)
      return VK_SUCCESS;

   void *data_end = pData + dataSize;

   VkResult status = VK_SUCCESS;
   for (uint32_t i = 0; i < queryCount; i++) {
      uint64_t *slot = pool->bo.map + (firstQuery + i) * pool->stride;

      /* Availability is always at the start of the slot */
      bool available = slot[0];

      if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) {
         status = wait_for_available(device, pool, slot);
         if (status != VK_SUCCESS)
            return status;

         available = true;
      }

      /* From the Vulkan 1.0.42 spec:
       *
       *    "If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
       *    both not set then no result values are written to pData for
       *    queries that are in the unavailable state at the time of the call,
       *    and vkGetQueryPoolResults returns VK_NOT_READY. However,
       *    availability state is still written to pData for those queries if
       *    VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set."
       */
      bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT);
      uint32_t idx = 0;
      switch (pool->type) {
      case VK_QUERY_TYPE_OCCLUSION:
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]);
         idx++;
         break;

      case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
         uint32_t statistics = pool->pipeline_statistics;
         while (statistics) {
            uint32_t stat = u_bit_scan(&statistics);
            if (write_results) {
               uint64_t result = slot[idx * 2 + 2] - slot[idx * 2 + 1];

               /* WaDividePSInvocationCountBy4:HSW,BDW */
               if ((device->info.gen == 8 || device->info.is_haswell) &&
                   (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT)
                  result >>= 2;

               cpu_write_query_result(pData, flags, idx, result);
            }
            idx++;
         }
         assert(idx == util_bitcount(pool->pipeline_statistics));
         break;
      }

      case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]);
         idx++;
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[4] - slot[3]);
         idx++;
         break;

      case VK_QUERY_TYPE_TIMESTAMP:
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[1]);
         idx++;
         break;

      default:
         unreachable("invalid pool type");
      }

      if (!write_results)
         status = VK_NOT_READY;

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
         cpu_write_query_result(pData, flags, idx, available);

      pData += stride;
      if (pData >= data_end)
         break;
   }

   return status;
}
static void
emit_srm32(struct anv_batch *batch, struct anv_address addr, uint32_t reg)
{
   anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
      srm.MemoryAddress    = addr;
      srm.RegisterAddress  = reg;
   }
}

static void
emit_srm64(struct anv_batch *batch, struct anv_address addr, uint32_t reg)
{
   emit_srm32(batch, anv_address_add(addr, 0), reg + 0);
   emit_srm32(batch, anv_address_add(addr, 4), reg + 4);
}
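
/* Note added for clarity (not part of the original source):
 * MI_STORE_REGISTER_MEM stores a single 32-bit register, so emit_srm64()
 * writes a 64-bit counter as two dwords, the low half at addr + 0 and the
 * high half at addr + 4.
 */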
static void
emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer,
                    struct anv_address addr)
{
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType  = DAT_PPGTT;
      pc.PostSyncOperation       = WritePSDepthCount;
      pc.DepthStallEnable        = true;
      pc.Address                 = addr;

      if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
         pc.CommandStreamerStallEnable = true;
   }
}
static void
emit_query_availability(struct anv_cmd_buffer *cmd_buffer,
                        struct anv_address addr)
{
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType  = DAT_PPGTT;
      pc.PostSyncOperation       = WriteImmediateData;
      pc.Address                 = addr;
      pc.ImmediateData           = 1;
   }
}
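
/* Note added for clarity (not part of the original source): availability is
 * signalled with a PIPE_CONTROL post-sync write rather than a plain command
 * streamer store, so the 1 is ordered against earlier work in the 3D pipe.
 * This is what the CPU-side polling in wait_for_available() and
 * GetQueryPoolResults() relies on.
 */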
/**
 * Goes through a series of consecutive query indices in the given pool
 * setting all element values to 0 and emitting them as available.
 */
static void
emit_zero_queries(struct anv_cmd_buffer *cmd_buffer,
                  struct anv_query_pool *pool,
                  uint32_t first_index, uint32_t num_queries)
{
   for (uint32_t i = 0; i < num_queries; i++) {
      struct anv_address slot_addr =
         anv_query_address(pool, first_index + i);
      genX(cmd_buffer_mi_memset)(cmd_buffer, anv_address_add(slot_addr, 8),
                                 0, pool->stride - 8);
      emit_query_availability(cmd_buffer, slot_addr);
   }
}
void genX(CmdResetQueryPool)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   for (uint32_t i = 0; i < queryCount; i++) {
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdm) {
         sdm.Address = anv_query_address(pool, firstQuery + i);
         sdm.ImmediateData = 0;
      }
   }
}
static const uint32_t vk_pipeline_stat_to_reg[] = {
   GENX(IA_VERTICES_COUNT_num),
   GENX(IA_PRIMITIVES_COUNT_num),
   GENX(VS_INVOCATION_COUNT_num),
   GENX(GS_INVOCATION_COUNT_num),
   GENX(GS_PRIMITIVES_COUNT_num),
   GENX(CL_INVOCATION_COUNT_num),
   GENX(CL_PRIMITIVES_COUNT_num),
   GENX(PS_INVOCATION_COUNT_num),
   GENX(HS_INVOCATION_COUNT_num),
   GENX(DS_INVOCATION_COUNT_num),
   GENX(CS_INVOCATION_COUNT_num),
};
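
/* Note added for clarity (not part of the original source): the table is
 * indexed by the bit position of the corresponding
 * VkQueryPipelineStatisticFlagBits flag, e.g. bit 0 (INPUT_ASSEMBLY_VERTICES)
 * maps to IA_VERTICES_COUNT and bit 7 (FRAGMENT_SHADER_INVOCATIONS) maps to
 * PS_INVOCATION_COUNT.  The STATIC_ASSERT in emit_pipeline_stat() below
 * checks that the table covers exactly the statistics bits the driver
 * advertises.
 */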
static void
emit_pipeline_stat(struct anv_cmd_buffer *cmd_buffer, uint32_t stat,
                   struct anv_address addr)
{
   STATIC_ASSERT(ANV_PIPELINE_STATISTICS_MASK ==
                 (1 << ARRAY_SIZE(vk_pipeline_stat_to_reg)) - 1);

   assert(stat < ARRAY_SIZE(vk_pipeline_stat_to_reg));
   emit_srm64(&cmd_buffer->batch, addr, vk_pipeline_stat_to_reg[stat]);
}
static void
emit_xfb_query(struct anv_cmd_buffer *cmd_buffer, uint32_t stream,
               struct anv_address addr)
{
   assert(stream < MAX_XFB_STREAMS);

   anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), lrm) {
      lrm.RegisterAddress  = GENX(SO_NUM_PRIMS_WRITTEN0_num) + 0 + stream * 8;
      lrm.MemoryAddress    = anv_address_add(addr, 0);
   }
   anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), lrm) {
      lrm.RegisterAddress  = GENX(SO_NUM_PRIMS_WRITTEN0_num) + 4 + stream * 8;
      lrm.MemoryAddress    = anv_address_add(addr, 4);
   }

   anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), lrm) {
      lrm.RegisterAddress  = GENX(SO_PRIM_STORAGE_NEEDED0_num) + 0 + stream * 8;
      lrm.MemoryAddress    = anv_address_add(addr, 16);
   }
   anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), lrm) {
      lrm.RegisterAddress  = GENX(SO_PRIM_STORAGE_NEEDED0_num) + 4 + stream * 8;
      lrm.MemoryAddress    = anv_address_add(addr, 20);
   }
}
void genX(CmdBeginQuery)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query,
    VkQueryControlFlags                         flags)
{
   genX(CmdBeginQueryIndexedEXT)(commandBuffer, queryPool, query, flags, 0);
}
void genX(CmdBeginQueryIndexedEXT)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query,
    VkQueryControlFlags                         flags,
    uint32_t                                    index)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   struct anv_address query_addr = anv_query_address(pool, query);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 8));
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      /* TODO: This might only be necessary for certain stats */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      uint32_t statistics = pool->pipeline_statistics;
      uint32_t offset = 8;
      while (statistics) {
         uint32_t stat = u_bit_scan(&statistics);
         emit_pipeline_stat(cmd_buffer, stat,
                            anv_address_add(query_addr, offset));
         offset += 16;
      }
      break;
   }

   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
      emit_xfb_query(cmd_buffer, index, anv_address_add(query_addr, 8));
      break;

   default:
      unreachable("invalid query type");
   }
}
void genX(CmdEndQuery)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query)
{
   genX(CmdEndQueryIndexedEXT)(commandBuffer, queryPool, query, 0);
}
void genX(CmdEndQueryIndexedEXT)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query,
    uint32_t                                    index)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   struct anv_address query_addr = anv_query_address(pool, query);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 16));
      emit_query_availability(cmd_buffer, query_addr);
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      /* TODO: This might only be necessary for certain stats */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      uint32_t statistics = pool->pipeline_statistics;
      uint32_t offset = 16;
      while (statistics) {
         uint32_t stat = u_bit_scan(&statistics);
         emit_pipeline_stat(cmd_buffer, stat,
                            anv_address_add(query_addr, offset));
         offset += 16;
      }

      emit_query_availability(cmd_buffer, query_addr);
      break;
   }

   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      emit_xfb_query(cmd_buffer, index, anv_address_add(query_addr, 16));
      emit_query_availability(cmd_buffer, query_addr);
      break;

   default:
      unreachable("invalid query type");
   }
   /* When multiview is active the spec requires that N consecutive query
    * indices are used, where N is the number of active views in the subpass.
    * The spec allows that we only write the results to one of the queries
    * but we still need to manage result availability for all the query indices.
    * Since we only emit a single query for all active views in the
    * first index, mark the other query indices as being already available
    * with result 0.
    */
   if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
      const uint32_t num_queries =
         util_bitcount(cmd_buffer->state.subpass->view_mask);
      if (num_queries > 1)
         emit_zero_queries(cmd_buffer, pool, query + 1, num_queries - 1);
   }
}
#define TIMESTAMP 0x2358
void genX(CmdWriteTimestamp)(
    VkCommandBuffer                             commandBuffer,
    VkPipelineStageFlagBits                     pipelineStage,
    VkQueryPool                                 queryPool,
    uint32_t                                    query)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   struct anv_address query_addr = anv_query_address(pool, query);

   assert(pool->type == VK_QUERY_TYPE_TIMESTAMP);
   switch (pipelineStage) {
   case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
      emit_srm64(&cmd_buffer->batch, anv_address_add(query_addr, 8), TIMESTAMP);
      break;

   default:
      /* Everything else is bottom-of-pipe */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.DestinationAddressType  = DAT_PPGTT;
         pc.PostSyncOperation       = WriteTimestamp;
         pc.Address                 = anv_address_add(query_addr, 8);

         if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
            pc.CommandStreamerStallEnable = true;
      }
      break;
   }

   emit_query_availability(cmd_buffer, query_addr);
   /* When multiview is active the spec requires that N consecutive query
    * indices are used, where N is the number of active views in the subpass.
    * The spec allows that we only write the results to one of the queries
    * but we still need to manage result availability for all the query indices.
    * Since we only emit a single query for all active views in the
    * first index, mark the other query indices as being already available
    * with result 0.
    */
   if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
      const uint32_t num_queries =
         util_bitcount(cmd_buffer->state.subpass->view_mask);
      if (num_queries > 1)
         emit_zero_queries(cmd_buffer, pool, query + 1, num_queries - 1);
   }
}
#if GEN_GEN > 7 || GEN_IS_HASWELL
static uint32_t
mi_alu(uint32_t opcode, uint32_t operand1, uint32_t operand2)
{
   struct GENX(MI_MATH_ALU_INSTRUCTION) instr = {
      .ALUOpcode = opcode,
      .Operand1 = operand1,
      .Operand2 = operand2,
   };

   uint32_t dw;
   GENX(MI_MATH_ALU_INSTRUCTION_pack)(NULL, &dw, &instr);

   return dw;
}
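
/* Note added for clarity (not part of the original source): an MI_MATH
 * command is a header dword followed by a small program of packed ALU
 * instructions.  The helpers below that call anv_batch_emitn() therefore
 * only fill in dw[1..N]; dw[0] is the command header.  A typical program
 * loads SRCA/SRCB from GPRs, performs one operation, and stores the
 * accumulator back into a GPR.
 */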
#define CS_GPR(n) (0x2600 + (n) * 8)
static void
emit_load_alu_reg_u64(struct anv_batch *batch, uint32_t reg,
                      struct anv_address addr)
{
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
      lrm.RegisterAddress  = reg;
      lrm.MemoryAddress    = anv_address_add(addr, 0);
   }
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
      lrm.RegisterAddress  = reg + 4;
      lrm.MemoryAddress    = anv_address_add(addr, 4);
   }
}
static void
emit_load_alu_reg_imm32(struct anv_batch *batch, uint32_t reg, uint32_t imm)
{
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
      lri.RegisterOffset   = reg;
      lri.DataDWord        = imm;
   }
}
static void
emit_load_alu_reg_imm64(struct anv_batch *batch, uint32_t reg, uint64_t imm)
{
   emit_load_alu_reg_imm32(batch, reg, (uint32_t)imm);
   emit_load_alu_reg_imm32(batch, reg + 4, (uint32_t)(imm >> 32));
}
static void
emit_load_alu_reg_reg32(struct anv_batch *batch, uint32_t src, uint32_t dst)
{
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_REG), lrr) {
      lrr.SourceRegisterAddress      = src;
      lrr.DestinationRegisterAddress = dst;
   }
}
/*
 * GPR0 = GPR0 & ((1ull << n) - 1);
 */
static void
keep_gpr0_lower_n_bits(struct anv_batch *batch, uint32_t n)
{
   emit_load_alu_reg_imm64(batch, CS_GPR(1), (1ull << n) - 1);

   uint32_t *dw = anv_batch_emitn(batch, 5, GENX(MI_MATH));
   if (!dw) {
      anv_batch_set_error(batch, VK_ERROR_OUT_OF_HOST_MEMORY);
      return;
   }

   dw[1] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCA, MI_ALU_REG0);
   dw[2] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCB, MI_ALU_REG1);
   dw[3] = mi_alu(MI_ALU_AND, 0, 0);
   dw[4] = mi_alu(MI_ALU_STORE, MI_ALU_REG0, MI_ALU_ACCU);
}
/*
 * GPR0 = GPR0 << 30;
 */
static void
shl_gpr0_by_30_bits(struct anv_batch *batch)
{
   /* First we mask 34 bits of GPR0 to prevent overflow */
   keep_gpr0_lower_n_bits(batch, 34);

   const uint32_t outer_count = 5;
   const uint32_t inner_count = 6;
   STATIC_ASSERT(outer_count * inner_count == 30);
   const uint32_t cmd_len = 1 + inner_count * 4;

   /* We'll emit 5 commands, each shifting GPR0 left by 6 bits, for a total of
    * 30 left shifts.
    */
   for (int o = 0; o < outer_count; o++) {
      /* Submit one MI_MATH to shift left by 6 bits */
      uint32_t *dw = anv_batch_emitn(batch, cmd_len, GENX(MI_MATH));
      if (!dw) {
         anv_batch_set_error(batch, VK_ERROR_OUT_OF_HOST_MEMORY);
         return;
      }

      dw++;
      for (int i = 0; i < inner_count; i++, dw += 4) {
         dw[0] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCA, MI_ALU_REG0);
         dw[1] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCB, MI_ALU_REG0);
         dw[2] = mi_alu(MI_ALU_ADD, 0, 0);
         dw[3] = mi_alu(MI_ALU_STORE, MI_ALU_REG0, MI_ALU_ACCU);
      }
   }
}
/*
 * GPR0 = GPR0 >> 2;
 *
 * Note that the upper 30 bits of GPR are lost!
 */
static void
shr_gpr0_by_2_bits(struct anv_batch *batch)
{
   shl_gpr0_by_30_bits(batch);
   emit_load_alu_reg_reg32(batch, CS_GPR(0) + 4, CS_GPR(0));
   emit_load_alu_reg_imm32(batch, CS_GPR(0) + 4, 0);
}
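
/* Note added for clarity (not part of the original source): there is no
 * shift operation available to these MI_MATH programs, so the right shift is
 * synthesized.  Shifting the (34-bit-masked) value left by 30 places bits
 * [33:2] into the upper dword of GPR0; copying that upper dword into the
 * lower dword and then zeroing the upper dword leaves GPR0 equal to the
 * original value >> 2.
 */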
static void
gpu_write_query_result(struct anv_batch *batch,
                       struct anv_address dst_addr,
                       VkQueryResultFlags flags,
                       uint32_t value_index, uint32_t reg)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      emit_srm64(batch, anv_address_add(dst_addr, value_index * 8), reg);
   } else {
      emit_srm32(batch, anv_address_add(dst_addr, value_index * 4), reg);
   }
}
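
/* Note added for clarity (not part of the original source): value_index
 * addresses the n-th result value within the destination entry, so it is
 * scaled by 8 for 64-bit results and by 4 for 32-bit results, mirroring how
 * cpu_write_query_result() indexes dst64[]/dst32[] on the CPU path.
 */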
static void
compute_query_result(struct anv_batch *batch, uint32_t dst_reg,
                     struct anv_address addr)
{
   emit_load_alu_reg_u64(batch, CS_GPR(0), anv_address_add(addr, 0));
   emit_load_alu_reg_u64(batch, CS_GPR(1), anv_address_add(addr, 8));

   /* FIXME: We need to clamp the result for 32 bit. */
   uint32_t *dw = anv_batch_emitn(batch, 5, GENX(MI_MATH));
   if (!dw) {
      anv_batch_set_error(batch, VK_ERROR_OUT_OF_HOST_MEMORY);
      return;
   }

   dw[1] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCA, MI_ALU_REG1);
   dw[2] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCB, MI_ALU_REG0);
   dw[3] = mi_alu(MI_ALU_SUB, 0, 0);
   dw[4] = mi_alu(MI_ALU_STORE, dst_reg, MI_ALU_ACCU);
}
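
/* Note added for clarity (not part of the original source): this MI_MATH
 * program computes "end minus begin" entirely on the GPU.  GPR0 is loaded
 * with the begin value at addr + 0, GPR1 with the end value at addr + 8, and
 * the SUB result (end - begin) is stored into dst_reg, ready to be written
 * out with gpu_write_query_result().
 */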
void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    VkBuffer                                    destBuffer,
    VkDeviceSize                                destOffset,
    VkDeviceSize                                destStride,
    VkQueryResultFlags                          flags)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);

   /* If render target writes are ongoing, request a render target cache flush
    * to ensure proper ordering of the commands from the 3d pipe and the
    * command streamer.
    */
   if (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_RENDER_TARGET_BUFFER_WRITES) {
      cmd_buffer->state.pending_pipe_bits |=
         ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
   }

   if ((flags & VK_QUERY_RESULT_WAIT_BIT) ||
       (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_FLUSH_BITS)) {
      cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT;
      genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
   }

   struct anv_address dest_addr = anv_address_add(buffer->address, destOffset);
   for (uint32_t i = 0; i < queryCount; i++) {
      struct anv_address query_addr = anv_query_address(pool, firstQuery + i);
      uint32_t idx = 0;
      switch (pool->type) {
      case VK_QUERY_TYPE_OCCLUSION:
         compute_query_result(&cmd_buffer->batch, MI_ALU_REG2,
                              anv_address_add(query_addr, 8));
         gpu_write_query_result(&cmd_buffer->batch, dest_addr,
                                flags, idx++, CS_GPR(2));
         break;

      case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
         uint32_t statistics = pool->pipeline_statistics;
         while (statistics) {
            uint32_t stat = u_bit_scan(&statistics);

            compute_query_result(&cmd_buffer->batch, MI_ALU_REG0,
                                 anv_address_add(query_addr, idx * 16 + 8));

            /* WaDividePSInvocationCountBy4:HSW,BDW */
            if ((cmd_buffer->device->info.gen == 8 ||
                 cmd_buffer->device->info.is_haswell) &&
                (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) {
               shr_gpr0_by_2_bits(&cmd_buffer->batch);
            }

            gpu_write_query_result(&cmd_buffer->batch, dest_addr,
                                   flags, idx++, CS_GPR(0));
         }
         assert(idx == util_bitcount(pool->pipeline_statistics));
         break;
      }

      case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
         compute_query_result(&cmd_buffer->batch, MI_ALU_REG2,
                              anv_address_add(query_addr, 8));
         gpu_write_query_result(&cmd_buffer->batch, dest_addr,
                                flags, idx++, CS_GPR(2));
         compute_query_result(&cmd_buffer->batch, MI_ALU_REG2,
                              anv_address_add(query_addr, 24));
         gpu_write_query_result(&cmd_buffer->batch, dest_addr,
                                flags, idx++, CS_GPR(2));
         break;

      case VK_QUERY_TYPE_TIMESTAMP:
         emit_load_alu_reg_u64(&cmd_buffer->batch,
                               CS_GPR(2), anv_address_add(query_addr, 8));
         /* Write the timestamp value and advance idx so that an availability
          * write, if requested, lands after it rather than on top of it.
          */
         gpu_write_query_result(&cmd_buffer->batch, dest_addr,
                                flags, idx++, CS_GPR(2));
         break;

      default:
         unreachable("unhandled query type");
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         emit_load_alu_reg_u64(&cmd_buffer->batch, CS_GPR(0), query_addr);
         gpu_write_query_result(&cmd_buffer->batch, dest_addr,
                                flags, idx, CS_GPR(0));
      }

      dest_addr = anv_address_add(dest_addr, destStride);
   }
}
#else

void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    VkBuffer                                    destBuffer,
    VkDeviceSize                                destOffset,
    VkDeviceSize                                destStride,
    VkQueryResultFlags                          flags)
{
   anv_finishme("Queries not yet supported on Ivy Bridge");
}

#endif