[mesa.git] / src / intel / vulkan / genX_query.c
1 /*
2 * Copyright © 2015 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include <assert.h>
25 #include <stdbool.h>
26 #include <string.h>
27 #include <unistd.h>
28 #include <fcntl.h>
29
30 #include "anv_private.h"
31
32 #include "genxml/gen_macros.h"
33 #include "genxml/genX_pack.h"
34
35 /* We reserve GPR 14 and 15 for conditional rendering */
36 #define GEN_MI_BUILDER_NUM_ALLOC_GPRS 14
37 #define __gen_get_batch_dwords anv_batch_emit_dwords
38 #define __gen_address_offset anv_address_add
39 #include "common/gen_mi_builder.h"
40 #include "perf/gen_perf.h"
41 #include "perf/gen_perf_mdapi.h"
42
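/* Size of an OA report (256 bytes) expressed in 64-bit values. */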
43 #define OA_REPORT_N_UINT64 (256 / sizeof(uint64_t))
44
45 VkResult genX(CreateQueryPool)(
46 VkDevice _device,
47 const VkQueryPoolCreateInfo* pCreateInfo,
48 const VkAllocationCallbacks* pAllocator,
49 VkQueryPool* pQueryPool)
50 {
51 ANV_FROM_HANDLE(anv_device, device, _device);
52 const struct anv_physical_device *pdevice = device->physical;
53 struct anv_query_pool *pool;
54 VkResult result;
55
56 assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
57
58 /* Query pool slots are made up of some number of 64-bit values packed
 59     * tightly together. For most query types, the first 64-bit value is
60 * the "available" bit which is 0 when the query is unavailable and 1 when
61 * it is available. The 64-bit values that follow are determined by the
62 * type of query.
63 *
 64     * For performance queries, OA reports need to be aligned to 64 bytes, so
 65     * they are placed at a 64-byte-aligned offset within the slot, with the
 66     * "available" bit and a few other counters in front (see layout below).
67 */
68 uint32_t uint64s_per_slot = 1;
69
70 VkQueryPipelineStatisticFlags pipeline_statistics = 0;
71 switch (pCreateInfo->queryType) {
72 case VK_QUERY_TYPE_OCCLUSION:
73 /* Occlusion queries have two values: begin and end. */
74 uint64s_per_slot += 2;
75 break;
76 case VK_QUERY_TYPE_TIMESTAMP:
77 /* Timestamps just have the one timestamp value */
78 uint64s_per_slot += 1;
79 break;
80 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
81 pipeline_statistics = pCreateInfo->pipelineStatistics;
82 /* We're going to trust this field implicitly so we need to ensure that
83 * no unhandled extension bits leak in.
84 */
85 pipeline_statistics &= ANV_PIPELINE_STATISTICS_MASK;
86
87 /* Statistics queries have a min and max for every statistic */
88 uint64s_per_slot += 2 * util_bitcount(pipeline_statistics);
89 break;
90 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
 91       /* Transform feedback queries are 4 values, begin/end pairs for
 92        * primitives written and primitives needed.
93 */
94 uint64s_per_slot += 4;
95 break;
96 case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
97 uint64s_per_slot = 72; /* 576 bytes, see layout below */
98 break;
99 }
100 default:
101 assert(!"Invalid query type");
102 }
103
104 pool = vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
105 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
106 if (pool == NULL)
107 return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
108
109 pool->type = pCreateInfo->queryType;
110 pool->pipeline_statistics = pipeline_statistics;
111 pool->stride = uint64s_per_slot * sizeof(uint64_t);
112 pool->slots = pCreateInfo->queryCount;
113
114 uint32_t bo_flags = 0;
115 if (pdevice->supports_48bit_addresses)
116 bo_flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
117
118 if (pdevice->use_softpin)
119 bo_flags |= EXEC_OBJECT_PINNED;
120
121 if (pdevice->has_exec_async)
122 bo_flags |= EXEC_OBJECT_ASYNC;
123
124 uint64_t size = pool->slots * pool->stride;
125 result = anv_device_alloc_bo(device, size,
126 ANV_BO_ALLOC_MAPPED |
127 ANV_BO_ALLOC_SNOOPED,
128 0 /* explicit_address */,
129 &pool->bo);
130 if (result != VK_SUCCESS)
131 goto fail;
132
133 *pQueryPool = anv_query_pool_to_handle(pool);
134
135 return VK_SUCCESS;
136
137 fail:
138 vk_free2(&device->alloc, pAllocator, pool);
139
140 return result;
141 }
142
143 void genX(DestroyQueryPool)(
144 VkDevice _device,
145 VkQueryPool _pool,
146 const VkAllocationCallbacks* pAllocator)
147 {
148 ANV_FROM_HANDLE(anv_device, device, _device);
149 ANV_FROM_HANDLE(anv_query_pool, pool, _pool);
150
151 if (!pool)
152 return;
153
154 anv_device_release_bo(device, pool->bo);
155 vk_free2(&device->alloc, pAllocator, pool);
156 }
157
158 static struct anv_address
159 anv_query_address(struct anv_query_pool *pool, uint32_t query)
160 {
161 return (struct anv_address) {
162 .bo = pool->bo,
163 .offset = query * pool->stride,
164 };
165 }
166
167 /**
168  * VK_INTEL_performance_query layout (576 bytes):
169 *
170 * ------------------------------
171 * | availability (8b) |
172 * |----------------------------|
173 * | marker (8b) |
174 * |----------------------------|
175 * | begin RPSTAT register (4b) |
176 * |----------------------------|
177 * | end RPSTAT register (4b) |
178 * |----------------------------|
179 * | begin perfcntr 1 & 2 (16b) |
180 * |----------------------------|
181 * | end perfcntr 1 & 2 (16b) |
182 * |----------------------------|
183 * | Unused (8b) |
184 * |----------------------------|
185 * | begin MI_RPC (256b) |
186 * |----------------------------|
187 * | end MI_RPC (256b) |
188 * ------------------------------
189 */
190
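/* Byte offsets into the per-query slot layout described above. */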
191 static uint32_t
192 intel_perf_marker_offset(void)
193 {
194 return 8;
195 }
196
197 static uint32_t
198 intel_perf_rpstart_offset(bool end)
199 {
200 return 16 + (end ? sizeof(uint32_t) : 0);
201 }
202
203 static uint32_t
204 intel_perf_counter(bool end)
205 {
206 return 24 + (end ? (2 * sizeof(uint64_t)) : 0);
207 }
208
209 static uint32_t
210 intel_perf_mi_rpc_offset(bool end)
211 {
212 return 64 + (end ? 256 : 0);
213 }
214
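/* Writes a single result value into the destination buffer, as a 32-bit or
 * 64-bit value depending on VK_QUERY_RESULT_64_BIT.
 */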
215 static void
216 cpu_write_query_result(void *dst_slot, VkQueryResultFlags flags,
217 uint32_t value_index, uint64_t result)
218 {
219 if (flags & VK_QUERY_RESULT_64_BIT) {
220 uint64_t *dst64 = dst_slot;
221 dst64[value_index] = result;
222 } else {
223 uint32_t *dst32 = dst_slot;
224 dst32[value_index] = result;
225 }
226 }
227
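/* Returns a CPU pointer to the given query slot in the pool's mapped BO. */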
228 static void *
229 query_slot(struct anv_query_pool *pool, uint32_t query)
230 {
231 return pool->bo->map + query * pool->stride;
232 }
233
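/* Reads the 64-bit availability value at the start of the slot. The read is
 * volatile because the GPU may update the value concurrently.
 */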
234 static bool
235 query_is_available(struct anv_query_pool *pool, uint32_t query)
236 {
237 return *(volatile uint64_t *)query_slot(pool, query);
238 }
239
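/* Busy-waits up to 5 seconds for the query to become available, returning
 * early if the device is lost.
 */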
240 static VkResult
241 wait_for_available(struct anv_device *device,
242 struct anv_query_pool *pool, uint32_t query)
243 {
244 uint64_t abs_timeout = anv_get_absolute_timeout(5 * NSEC_PER_SEC);
245
246 while (anv_gettime_ns() < abs_timeout) {
247 if (query_is_available(pool, query))
248 return VK_SUCCESS;
249 VkResult status = anv_device_query_status(device);
250 if (status != VK_SUCCESS)
251 return status;
252 }
253
254 return anv_device_set_lost(device, "query timeout");
255 }
256
257 VkResult genX(GetQueryPoolResults)(
258 VkDevice _device,
259 VkQueryPool queryPool,
260 uint32_t firstQuery,
261 uint32_t queryCount,
262 size_t dataSize,
263 void* pData,
264 VkDeviceSize stride,
265 VkQueryResultFlags flags)
266 {
267 ANV_FROM_HANDLE(anv_device, device, _device);
268 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
269
270 assert(pool->type == VK_QUERY_TYPE_OCCLUSION ||
271 pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
272 pool->type == VK_QUERY_TYPE_TIMESTAMP ||
273 pool->type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT ||
274 pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL);
275
276 if (anv_device_is_lost(device))
277 return VK_ERROR_DEVICE_LOST;
278
279 if (pData == NULL)
280 return VK_SUCCESS;
281
282 void *data_end = pData + dataSize;
283
284 VkResult status = VK_SUCCESS;
285 for (uint32_t i = 0; i < queryCount; i++) {
286 bool available = query_is_available(pool, firstQuery + i);
287
288 if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) {
289 status = wait_for_available(device, pool, firstQuery + i);
290 if (status != VK_SUCCESS)
291 return status;
292
293 available = true;
294 }
295
296 /* From the Vulkan 1.0.42 spec:
297 *
298 * "If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
299 * both not set then no result values are written to pData for
300 * queries that are in the unavailable state at the time of the call,
301 * and vkGetQueryPoolResults returns VK_NOT_READY. However,
302 * availability state is still written to pData for those queries if
303 * VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set."
304 */
305 bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT);
306
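      /* idx counts how many result values have been written for this query so
       * that the availability value, if requested, lands right after them.
       */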
307 uint32_t idx = 0;
308 switch (pool->type) {
309 case VK_QUERY_TYPE_OCCLUSION: {
310 uint64_t *slot = query_slot(pool, firstQuery + i);
311 if (write_results) {
312 /* From the Vulkan 1.2.132 spec:
313 *
314 * "If VK_QUERY_RESULT_PARTIAL_BIT is set,
315 * VK_QUERY_RESULT_WAIT_BIT is not set, and the query’s status
316 * is unavailable, an intermediate result value between zero and
317 * the final result value is written to pData for that query."
318 */
319 uint64_t result = available ? slot[2] - slot[1] : 0;
320 cpu_write_query_result(pData, flags, idx, result);
321 }
322 idx++;
323 break;
324 }
325
326 case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
327 uint64_t *slot = query_slot(pool, firstQuery + i);
328 uint32_t statistics = pool->pipeline_statistics;
329 while (statistics) {
330 uint32_t stat = u_bit_scan(&statistics);
331 if (write_results) {
332 uint64_t result = slot[idx * 2 + 2] - slot[idx * 2 + 1];
333
334 /* WaDividePSInvocationCountBy4:HSW,BDW */
335 if ((device->info.gen == 8 || device->info.is_haswell) &&
336 (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT)
337 result >>= 2;
338
339 cpu_write_query_result(pData, flags, idx, result);
340 }
341 idx++;
342 }
343 assert(idx == util_bitcount(pool->pipeline_statistics));
344 break;
345 }
346
347 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
348 uint64_t *slot = query_slot(pool, firstQuery + i);
349 if (write_results)
350 cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]);
351 idx++;
352 if (write_results)
353 cpu_write_query_result(pData, flags, idx, slot[4] - slot[3]);
354 idx++;
355 break;
356 }
357
358 case VK_QUERY_TYPE_TIMESTAMP: {
359 uint64_t *slot = query_slot(pool, firstQuery + i);
360 if (write_results)
361 cpu_write_query_result(pData, flags, idx, slot[1]);
362 idx++;
363 break;
364 }
365
366 case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
367 if (!write_results)
368 break;
369 const void *query_data = query_slot(pool, firstQuery + i);
370 const uint32_t *oa_begin = query_data + intel_perf_mi_rpc_offset(false);
371 const uint32_t *oa_end = query_data + intel_perf_mi_rpc_offset(true);
372 const uint32_t *rpstat_begin = query_data + intel_perf_rpstart_offset(false);
373       const uint32_t *rpstat_end = query_data + intel_perf_rpstart_offset(true);
374 struct gen_perf_query_result result;
375 struct gen_perf_query_info metric = {
376 .oa_format = (GEN_GEN >= 8 ?
377 I915_OA_FORMAT_A32u40_A4u32_B8_C8 :
378 I915_OA_FORMAT_A45_B8_C8),
379 };
380 uint32_t core_freq[2];
381 #if GEN_GEN < 9
382 core_freq[0] = ((*rpstat_begin >> 7) & 0x7f) * 1000000ULL;
383 core_freq[1] = ((*rpstat_end >> 7) & 0x7f) * 1000000ULL;
384 #else
385 core_freq[0] = ((*rpstat_begin >> 23) & 0x1ff) * 1000000ULL;
386 core_freq[1] = ((*rpstat_end >> 23) & 0x1ff) * 1000000ULL;
387 #endif
388 gen_perf_query_result_clear(&result);
389 gen_perf_query_result_accumulate(&result, &metric,
390 oa_begin, oa_end);
391 gen_perf_query_result_read_frequencies(&result, &device->info,
392 oa_begin, oa_end);
393 gen_perf_query_result_write_mdapi(pData, stride,
394 &device->info,
395 &result,
396 core_freq[0], core_freq[1]);
397 gen_perf_query_mdapi_write_perfcntr(pData, stride, &device->info,
398 query_data + intel_perf_counter(false),
399 query_data + intel_perf_counter(true));
400 const uint64_t *marker = query_data + intel_perf_marker_offset();
401 gen_perf_query_mdapi_write_marker(pData, stride, &device->info, *marker);
402 break;
403 }
404
405 default:
406 unreachable("invalid pool type");
407 }
408
409 if (!write_results)
410 status = VK_NOT_READY;
411
412 if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
413 cpu_write_query_result(pData, flags, idx, available);
414
415 pData += stride;
416 if (pData >= data_end)
417 break;
418 }
419
420 return status;
421 }
422
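/* Emits a PIPE_CONTROL that writes the current PS depth count (the counter
 * backing occlusion queries) to addr, with a depth stall so the write lands
 * after all prior depth/stencil work.
 */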
423 static void
424 emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer,
425 struct anv_address addr)
426 {
427 cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
428 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
429
430 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
431 pc.DestinationAddressType = DAT_PPGTT;
432 pc.PostSyncOperation = WritePSDepthCount;
433 pc.DepthStallEnable = true;
434 pc.Address = addr;
435
436 if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
437 pc.CommandStreamerStallEnable = true;
438 }
439 }
440
441 static void
442 emit_query_mi_availability(struct gen_mi_builder *b,
443 struct anv_address addr,
444 bool available)
445 {
446 gen_mi_store(b, gen_mi_mem64(addr), gen_mi_imm(available));
447 }
448
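/* Marks a query slot as (un)available by writing an immediate value with a
 * PIPE_CONTROL post-sync operation, ordering the write against prior 3D work.
 */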
449 static void
450 emit_query_pc_availability(struct anv_cmd_buffer *cmd_buffer,
451 struct anv_address addr,
452 bool available)
453 {
454 cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
455 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
456
457 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
458 pc.DestinationAddressType = DAT_PPGTT;
459 pc.PostSyncOperation = WriteImmediateData;
460 pc.Address = addr;
461 pc.ImmediateData = available;
462 }
463 }
464
465 /**
466  * Goes through a series of consecutive query indices in the given pool,
467  * setting all element values to 0 and marking them as available.
468 */
469 static void
470 emit_zero_queries(struct anv_cmd_buffer *cmd_buffer,
471 struct gen_mi_builder *b, struct anv_query_pool *pool,
472 uint32_t first_index, uint32_t num_queries)
473 {
474 switch (pool->type) {
475 case VK_QUERY_TYPE_OCCLUSION:
476 case VK_QUERY_TYPE_TIMESTAMP:
477       /* These queries are written with a PIPE_CONTROL, so clear them with a
478        * PIPE_CONTROL as well so that we don't have to synchronize between two
479        * types of operations.
480 */
481 assert((pool->stride % 8) == 0);
482 for (uint32_t i = 0; i < num_queries; i++) {
483 struct anv_address slot_addr =
484 anv_query_address(pool, first_index + i);
485
486 for (uint32_t qword = 1; qword < (pool->stride / 8); qword++) {
487 emit_query_pc_availability(cmd_buffer,
488 anv_address_add(slot_addr, qword * 8),
489 false);
490 }
491 emit_query_pc_availability(cmd_buffer, slot_addr, true);
492 }
493 break;
494
495 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
496 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
497 for (uint32_t i = 0; i < num_queries; i++) {
498 struct anv_address slot_addr =
499 anv_query_address(pool, first_index + i);
500 gen_mi_memset(b, anv_address_add(slot_addr, 8), 0, pool->stride - 8);
501 emit_query_mi_availability(b, slot_addr, true);
502 }
503 break;
504
505 case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL:
506 for (uint32_t i = 0; i < num_queries; i++) {
507 struct anv_address slot_addr =
508 anv_query_address(pool, first_index + i);
509 gen_mi_memset(b, anv_address_add(slot_addr, 8), 0, pool->stride - 8);
510 emit_query_mi_availability(b, slot_addr, true);
511 }
512 break;
513
514 default:
515 unreachable("Unsupported query type");
516 }
517 }
518
519 void genX(CmdResetQueryPool)(
520 VkCommandBuffer commandBuffer,
521 VkQueryPool queryPool,
522 uint32_t firstQuery,
523 uint32_t queryCount)
524 {
525 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
526 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
527
528 switch (pool->type) {
529 case VK_QUERY_TYPE_OCCLUSION:
530 case VK_QUERY_TYPE_TIMESTAMP:
531 for (uint32_t i = 0; i < queryCount; i++) {
532 emit_query_pc_availability(cmd_buffer,
533 anv_query_address(pool, firstQuery + i),
534 false);
535 }
536 break;
537
538 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
539 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
540 struct gen_mi_builder b;
541 gen_mi_builder_init(&b, &cmd_buffer->batch);
542
543 for (uint32_t i = 0; i < queryCount; i++)
544 emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), false);
545 break;
546 }
547
548 case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
549 struct gen_mi_builder b;
550 gen_mi_builder_init(&b, &cmd_buffer->batch);
551
552 for (uint32_t i = 0; i < queryCount; i++)
553 emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), false);
554 break;
555 }
556
557 default:
558 unreachable("Unsupported query type");
559 }
560 }
561
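/* Host-side query reset: clear the availability value at the start of each
 * slot directly through the pool's CPU mapping.
 */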
562 void genX(ResetQueryPool)(
563 VkDevice _device,
564 VkQueryPool queryPool,
565 uint32_t firstQuery,
566 uint32_t queryCount)
567 {
568 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
569
570 for (uint32_t i = 0; i < queryCount; i++) {
571 uint64_t *slot = query_slot(pool, firstQuery + i);
572 *slot = 0;
573 }
574 }
575
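/* Maps the bit position of a VkQueryPipelineStatisticFlagBits value to the
 * MMIO counter register for that statistic; emit_pipeline_stat() statically
 * asserts that the table covers ANV_PIPELINE_STATISTICS_MASK.
 */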
576 static const uint32_t vk_pipeline_stat_to_reg[] = {
577 GENX(IA_VERTICES_COUNT_num),
578 GENX(IA_PRIMITIVES_COUNT_num),
579 GENX(VS_INVOCATION_COUNT_num),
580 GENX(GS_INVOCATION_COUNT_num),
581 GENX(GS_PRIMITIVES_COUNT_num),
582 GENX(CL_INVOCATION_COUNT_num),
583 GENX(CL_PRIMITIVES_COUNT_num),
584 GENX(PS_INVOCATION_COUNT_num),
585 GENX(HS_INVOCATION_COUNT_num),
586 GENX(DS_INVOCATION_COUNT_num),
587 GENX(CS_INVOCATION_COUNT_num),
588 };
589
590 static void
591 emit_pipeline_stat(struct gen_mi_builder *b, uint32_t stat,
592 struct anv_address addr)
593 {
594 STATIC_ASSERT(ANV_PIPELINE_STATISTICS_MASK ==
595 (1 << ARRAY_SIZE(vk_pipeline_stat_to_reg)) - 1);
596
597 assert(stat < ARRAY_SIZE(vk_pipeline_stat_to_reg));
598 gen_mi_store(b, gen_mi_mem64(addr),
599 gen_mi_reg64(vk_pipeline_stat_to_reg[stat]));
600 }
601
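/* Snapshots the transform feedback counters for the given stream:
 * SO_NUM_PRIMS_WRITTEN at addr + 0 and SO_PRIM_STORAGE_NEEDED at addr + 16.
 * The 8-byte gap after each value is filled by the matching end-of-query
 * snapshot, forming the begin/end pairs read back in GetQueryPoolResults.
 */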
602 static void
603 emit_xfb_query(struct gen_mi_builder *b, uint32_t stream,
604 struct anv_address addr)
605 {
606 assert(stream < MAX_XFB_STREAMS);
607
608 gen_mi_store(b, gen_mi_mem64(anv_address_add(addr, 0)),
609 gen_mi_reg64(GENX(SO_NUM_PRIMS_WRITTEN0_num) + stream * 8));
610 gen_mi_store(b, gen_mi_mem64(anv_address_add(addr, 16)),
611 gen_mi_reg64(GENX(SO_PRIM_STORAGE_NEEDED0_num) + stream * 8));
612 }
613
614 void genX(CmdBeginQuery)(
615 VkCommandBuffer commandBuffer,
616 VkQueryPool queryPool,
617 uint32_t query,
618 VkQueryControlFlags flags)
619 {
620 genX(CmdBeginQueryIndexedEXT)(commandBuffer, queryPool, query, flags, 0);
621 }
622
623 void genX(CmdBeginQueryIndexedEXT)(
624 VkCommandBuffer commandBuffer,
625 VkQueryPool queryPool,
626 uint32_t query,
627 VkQueryControlFlags flags,
628 uint32_t index)
629 {
630 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
631 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
632 struct anv_address query_addr = anv_query_address(pool, query);
633
634 struct gen_mi_builder b;
635 gen_mi_builder_init(&b, &cmd_buffer->batch);
636
637 switch (pool->type) {
638 case VK_QUERY_TYPE_OCCLUSION:
639 emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 8));
640 break;
641
642 case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
643 /* TODO: This might only be necessary for certain stats */
644 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
645 pc.CommandStreamerStallEnable = true;
646 pc.StallAtPixelScoreboard = true;
647 }
648
649 uint32_t statistics = pool->pipeline_statistics;
650 uint32_t offset = 8;
651 while (statistics) {
652 uint32_t stat = u_bit_scan(&statistics);
653 emit_pipeline_stat(&b, stat, anv_address_add(query_addr, offset));
654 offset += 16;
655 }
656 break;
657 }
658
659 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
660 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
661 pc.CommandStreamerStallEnable = true;
662 pc.StallAtPixelScoreboard = true;
663 }
664 emit_xfb_query(&b, index, anv_address_add(query_addr, 8));
665 break;
666
667 case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
668 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
669 pc.CommandStreamerStallEnable = true;
670 pc.StallAtPixelScoreboard = true;
671 }
672 anv_batch_emit(&cmd_buffer->batch, GENX(MI_REPORT_PERF_COUNT), rpc) {
673 rpc.MemoryAddress =
674 anv_address_add(query_addr, intel_perf_mi_rpc_offset(false));
675 }
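      /* Snapshot the frequency (RPSTAT) register at the start of the query;
       * the register and the frequency bitfield differ between pre-Gen9 and
       * Gen9+ (see the decode in GetQueryPoolResults).
       */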
676 #if GEN_GEN < 9
677 gen_mi_store(&b,
678 gen_mi_mem32(anv_address_add(query_addr,
679 intel_perf_rpstart_offset(false))),
680 gen_mi_reg32(GENX(RPSTAT1_num)));
681 #else
682 gen_mi_store(&b,
683 gen_mi_mem32(anv_address_add(query_addr,
684 intel_perf_rpstart_offset(false))),
685 gen_mi_reg32(GENX(RPSTAT0_num)));
686 #endif
687 #if GEN_GEN >= 8 && GEN_GEN <= 11
688 gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr,
689 intel_perf_counter(false))),
690 gen_mi_reg64(GENX(PERFCNT1_num)));
691 gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr,
692 intel_perf_counter(false) + 8)),
693 gen_mi_reg64(GENX(PERFCNT2_num)));
694 #endif
695 break;
696 }
697
698 default:
699 unreachable("");
700 }
701 }
702
703 void genX(CmdEndQuery)(
704 VkCommandBuffer commandBuffer,
705 VkQueryPool queryPool,
706 uint32_t query)
707 {
708 genX(CmdEndQueryIndexedEXT)(commandBuffer, queryPool, query, 0);
709 }
710
711 void genX(CmdEndQueryIndexedEXT)(
712 VkCommandBuffer commandBuffer,
713 VkQueryPool queryPool,
714 uint32_t query,
715 uint32_t index)
716 {
717 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
718 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
719 struct anv_address query_addr = anv_query_address(pool, query);
720
721 struct gen_mi_builder b;
722 gen_mi_builder_init(&b, &cmd_buffer->batch);
723
724 switch (pool->type) {
725 case VK_QUERY_TYPE_OCCLUSION:
726 emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 16));
727 emit_query_pc_availability(cmd_buffer, query_addr, true);
728 break;
729
730 case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
731 /* TODO: This might only be necessary for certain stats */
732 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
733 pc.CommandStreamerStallEnable = true;
734 pc.StallAtPixelScoreboard = true;
735 }
736
737 uint32_t statistics = pool->pipeline_statistics;
738 uint32_t offset = 16;
739 while (statistics) {
740 uint32_t stat = u_bit_scan(&statistics);
741 emit_pipeline_stat(&b, stat, anv_address_add(query_addr, offset));
742 offset += 16;
743 }
744
745 emit_query_mi_availability(&b, query_addr, true);
746 break;
747 }
748
749 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
750 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
751 pc.CommandStreamerStallEnable = true;
752 pc.StallAtPixelScoreboard = true;
753 }
754
755 emit_xfb_query(&b, index, anv_address_add(query_addr, 16));
756 emit_query_mi_availability(&b, query_addr, true);
757 break;
758
759 case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
760 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
761 pc.CommandStreamerStallEnable = true;
762 pc.StallAtPixelScoreboard = true;
763 }
764 uint32_t marker_offset = intel_perf_marker_offset();
765 gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, marker_offset)),
766 gen_mi_imm(cmd_buffer->intel_perf_marker));
767 #if GEN_GEN >= 8 && GEN_GEN <= 11
768 gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, intel_perf_counter(true))),
769 gen_mi_reg64(GENX(PERFCNT1_num)));
770 gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, intel_perf_counter(true) + 8)),
771 gen_mi_reg64(GENX(PERFCNT2_num)));
772 #endif
773 #if GEN_GEN < 9
774 gen_mi_store(&b,
775 gen_mi_mem32(anv_address_add(query_addr,
776 intel_perf_rpstart_offset(true))),
777 gen_mi_reg32(GENX(RPSTAT1_num)));
778 #else
779 gen_mi_store(&b,
780 gen_mi_mem32(anv_address_add(query_addr,
781 intel_perf_rpstart_offset(true))),
782 gen_mi_reg32(GENX(RPSTAT0_num)));
783 #endif
784 /* Position the last OA snapshot at the beginning of the query so that
785 * we can tell whether it's ready.
786 */
787 anv_batch_emit(&cmd_buffer->batch, GENX(MI_REPORT_PERF_COUNT), rpc) {
788 rpc.MemoryAddress = anv_address_add(query_addr,
789 intel_perf_mi_rpc_offset(true));
790 rpc.ReportID = 0xdeadbeef; /* This goes in the first dword */
791 }
792 emit_query_mi_availability(&b, query_addr, true);
793 break;
794 }
795
796 default:
797 unreachable("");
798 }
799
800 /* When multiview is active the spec requires that N consecutive query
801 * indices are used, where N is the number of active views in the subpass.
802     * The spec allows us to write the results to only one of the queries,
803     * but we still need to manage result availability for all the query indices.
804 * Since we only emit a single query for all active views in the
805 * first index, mark the other query indices as being already available
806 * with result 0.
807 */
808 if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
809 const uint32_t num_queries =
810 util_bitcount(cmd_buffer->state.subpass->view_mask);
811 if (num_queries > 1)
812 emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1);
813 }
814 }
815
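/* MMIO offset of the command streamer TIMESTAMP register read below for
 * top-of-pipe timestamps.
 */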
816 #define TIMESTAMP 0x2358
817
818 void genX(CmdWriteTimestamp)(
819 VkCommandBuffer commandBuffer,
820 VkPipelineStageFlagBits pipelineStage,
821 VkQueryPool queryPool,
822 uint32_t query)
823 {
824 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
825 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
826 struct anv_address query_addr = anv_query_address(pool, query);
827
828 assert(pool->type == VK_QUERY_TYPE_TIMESTAMP);
829
830 struct gen_mi_builder b;
831 gen_mi_builder_init(&b, &cmd_buffer->batch);
832
833 switch (pipelineStage) {
834 case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
835 gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, 8)),
836 gen_mi_reg64(TIMESTAMP));
837 break;
838
839 default:
840 /* Everything else is bottom-of-pipe */
841 cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
842 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
843
844 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
845 pc.DestinationAddressType = DAT_PPGTT;
846 pc.PostSyncOperation = WriteTimestamp;
847 pc.Address = anv_address_add(query_addr, 8);
848
849 if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
850 pc.CommandStreamerStallEnable = true;
851 }
852 break;
853 }
854
855 emit_query_pc_availability(cmd_buffer, query_addr, true);
856
857 /* When multiview is active the spec requires that N consecutive query
858 * indices are used, where N is the number of active views in the subpass.
859     * The spec allows us to write the results to only one of the queries,
860     * but we still need to manage result availability for all the query indices.
861 * Since we only emit a single query for all active views in the
862 * first index, mark the other query indices as being already available
863 * with result 0.
864 */
865 if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
866 const uint32_t num_queries =
867 util_bitcount(cmd_buffer->state.subpass->view_mask);
868 if (num_queries > 1)
869 emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1);
870 }
871 }
872
873 #if GEN_GEN > 7 || GEN_IS_HASWELL
874
875 #if GEN_GEN >= 8 || GEN_IS_HASWELL
876
877 #define MI_PREDICATE_SRC0 0x2400
878 #define MI_PREDICATE_SRC1 0x2408
879 #define MI_PREDICATE_RESULT 0x2418
880
881 /**
882  * Writes the results of a query to dst_addr if the value at poll_addr is
883  * equal to the reference value.
884 */
885 static void
886 gpu_write_query_result_cond(struct anv_cmd_buffer *cmd_buffer,
887 struct gen_mi_builder *b,
888 struct anv_address poll_addr,
889 struct anv_address dst_addr,
890 uint64_t ref_value,
891 VkQueryResultFlags flags,
892 uint32_t value_index,
893 struct gen_mi_value query_result)
894 {
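   /* Load the value at poll_addr into MI_PREDICATE_SRC0 and the reference
    * value into MI_PREDICATE_SRC1, then set the predicate when they compare
    * equal so that the conditional stores below only execute for the desired
    * availability state.
    */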
895 gen_mi_store(b, gen_mi_reg64(MI_PREDICATE_SRC0), gen_mi_mem64(poll_addr));
896 gen_mi_store(b, gen_mi_reg64(MI_PREDICATE_SRC1), gen_mi_imm(ref_value));
897 anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
898 mip.LoadOperation = LOAD_LOAD;
899 mip.CombineOperation = COMBINE_SET;
900 mip.CompareOperation = COMPARE_SRCS_EQUAL;
901 }
902
903 if (flags & VK_QUERY_RESULT_64_BIT) {
904 struct anv_address res_addr = anv_address_add(dst_addr, value_index * 8);
905 gen_mi_store_if(b, gen_mi_mem64(res_addr), query_result);
906 } else {
907 struct anv_address res_addr = anv_address_add(dst_addr, value_index * 4);
908 gen_mi_store_if(b, gen_mi_mem32(res_addr), query_result);
909 }
910 }
911
912 #endif /* GEN_GEN >= 8 || GEN_IS_HASWELL */
913
914 static void
915 gpu_write_query_result(struct gen_mi_builder *b,
916 struct anv_address dst_addr,
917 VkQueryResultFlags flags,
918 uint32_t value_index,
919 struct gen_mi_value query_result)
920 {
921 if (flags & VK_QUERY_RESULT_64_BIT) {
922 struct anv_address res_addr = anv_address_add(dst_addr, value_index * 8);
923 gen_mi_store(b, gen_mi_mem64(res_addr), query_result);
924 } else {
925 struct anv_address res_addr = anv_address_add(dst_addr, value_index * 4);
926 gen_mi_store(b, gen_mi_mem32(res_addr), query_result);
927 }
928 }
929
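/* Returns 'end - begin' for a pair of adjacent 64-bit values, with 'begin' at
 * addr and 'end' at addr + 8.
 */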
930 static struct gen_mi_value
931 compute_query_result(struct gen_mi_builder *b, struct anv_address addr)
932 {
933 return gen_mi_isub(b, gen_mi_mem64(anv_address_add(addr, 8)),
934 gen_mi_mem64(anv_address_add(addr, 0)));
935 }
936
937 void genX(CmdCopyQueryPoolResults)(
938 VkCommandBuffer commandBuffer,
939 VkQueryPool queryPool,
940 uint32_t firstQuery,
941 uint32_t queryCount,
942 VkBuffer destBuffer,
943 VkDeviceSize destOffset,
944 VkDeviceSize destStride,
945 VkQueryResultFlags flags)
946 {
947 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
948 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
949 ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);
950
951 struct gen_mi_builder b;
952 gen_mi_builder_init(&b, &cmd_buffer->batch);
953 struct gen_mi_value result;
954
955 /* If render target writes are ongoing, request a render target cache flush
956 * to ensure proper ordering of the commands from the 3d pipe and the
957 * command streamer.
958 */
959 if (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_RENDER_TARGET_BUFFER_WRITES) {
960 cmd_buffer->state.pending_pipe_bits |=
961 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
962 }
963
964 if ((flags & VK_QUERY_RESULT_WAIT_BIT) ||
965 (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_FLUSH_BITS) ||
966       /* Occlusion & timestamp queries are written using a PIPE_CONTROL and,
967        * because we're about to copy their values with MI commands, we need to
968 * stall the command streamer to make sure the PIPE_CONTROL values have
969 * landed, otherwise we could see inconsistent values & availability.
970 *
971 * From the vulkan spec:
972 *
973 * "vkCmdCopyQueryPoolResults is guaranteed to see the effect of
974 * previous uses of vkCmdResetQueryPool in the same queue, without
975 * any additional synchronization."
976 */
977 pool->type == VK_QUERY_TYPE_OCCLUSION ||
978 pool->type == VK_QUERY_TYPE_TIMESTAMP) {
979 cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT;
980 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
981 }
982
983 struct anv_address dest_addr = anv_address_add(buffer->address, destOffset);
984 for (uint32_t i = 0; i < queryCount; i++) {
985 struct anv_address query_addr = anv_query_address(pool, firstQuery + i);
986 uint32_t idx = 0;
987 switch (pool->type) {
988 case VK_QUERY_TYPE_OCCLUSION:
989 result = compute_query_result(&b, anv_address_add(query_addr, 8));
990 #if GEN_GEN >= 8 || GEN_IS_HASWELL
991 /* Like in the case of vkGetQueryPoolResults, if the query is
992 * unavailable and the VK_QUERY_RESULT_PARTIAL_BIT flag is set,
993 * conservatively write 0 as the query result. If the
994 * VK_QUERY_RESULT_PARTIAL_BIT isn't set, don't write any value.
995 */
996 gpu_write_query_result_cond(cmd_buffer, &b, query_addr, dest_addr,
997 1 /* available */, flags, idx, result);
998 if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
999 gpu_write_query_result_cond(cmd_buffer, &b, query_addr, dest_addr,
1000 0 /* unavailable */, flags, idx, gen_mi_imm(0));
1001 }
1002 idx++;
1003 #else /* GEN_GEN < 8 && !GEN_IS_HASWELL */
1004 gpu_write_query_result(&b, dest_addr, flags, idx++, result);
1005 #endif
1006 break;
1007
1008 case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
1009 uint32_t statistics = pool->pipeline_statistics;
1010 while (statistics) {
1011 uint32_t stat = u_bit_scan(&statistics);
1012
1013 result = compute_query_result(&b, anv_address_add(query_addr,
1014 idx * 16 + 8));
1015
1016 /* WaDividePSInvocationCountBy4:HSW,BDW */
1017 if ((cmd_buffer->device->info.gen == 8 ||
1018 cmd_buffer->device->info.is_haswell) &&
1019 (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) {
1020 result = gen_mi_ushr32_imm(&b, result, 2);
1021 }
1022
1023 gpu_write_query_result(&b, dest_addr, flags, idx++, result);
1024 }
1025 assert(idx == util_bitcount(pool->pipeline_statistics));
1026 break;
1027 }
1028
1029 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1030 result = compute_query_result(&b, anv_address_add(query_addr, 8));
1031 gpu_write_query_result(&b, dest_addr, flags, idx++, result);
1032 result = compute_query_result(&b, anv_address_add(query_addr, 24));
1033 gpu_write_query_result(&b, dest_addr, flags, idx++, result);
1034 break;
1035
1036 case VK_QUERY_TYPE_TIMESTAMP:
1037 result = gen_mi_mem64(anv_address_add(query_addr, 8));
1038 gpu_write_query_result(&b, dest_addr, flags, 0, result);
1039 break;
1040
1041 default:
1042 unreachable("unhandled query type");
1043 }
1044
1045 if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
1046 gpu_write_query_result(&b, dest_addr, flags, idx,
1047 gen_mi_mem64(query_addr));
1048 }
1049
1050 dest_addr = anv_address_add(dest_addr, destStride);
1051 }
1052 }
1053
1054 #else
1055 void genX(CmdCopyQueryPoolResults)(
1056 VkCommandBuffer commandBuffer,
1057 VkQueryPool queryPool,
1058 uint32_t firstQuery,
1059 uint32_t queryCount,
1060 VkBuffer destBuffer,
1061 VkDeviceSize destOffset,
1062 VkDeviceSize destStride,
1063 VkQueryResultFlags flags)
1064 {
1065 anv_finishme("Queries not yet supported on Ivy Bridge");
1066 }
1067 #endif