anv: Enable Vulkan 1.2 support
mesa.git: src/intel/vulkan/genX_query.c
/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <assert.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>

#include "anv_private.h"

#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"

/* We reserve GPR 14 and 15 for conditional rendering */
#define GEN_MI_BUILDER_NUM_ALLOC_GPRS 14
#define __gen_get_batch_dwords anv_batch_emit_dwords
#define __gen_address_offset anv_address_add
#include "common/gen_mi_builder.h"
#include "perf/gen_perf.h"
#include "perf/gen_perf_mdapi.h"

#define OA_REPORT_N_UINT64 (256 / sizeof(uint64_t))

VkResult genX(CreateQueryPool)(
    VkDevice                                    _device,
    const VkQueryPoolCreateInfo*                pCreateInfo,
    const VkAllocationCallbacks*                pAllocator,
    VkQueryPool*                                pQueryPool)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   const struct anv_physical_device *pdevice = &device->instance->physicalDevice;
   struct anv_query_pool *pool;
   VkResult result;

   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);

   /* Query pool slots are made up of some number of 64-bit values packed
    * tightly together. For most query types, the first 64-bit value is the
    * "available" bit which is 0 when the query is unavailable and 1 when it
    * is available. The 64-bit values that follow are determined by the type
    * of query.
    *
    * For performance queries, we have a requirement to align OA reports to
    * 64 bytes, so those live at a 64-byte-aligned offset within the slot
    * while the "available" bit sits at the start of the slot together with
    * some other counters.
    */
68 uint32_t uint64s_per_slot = 1;
69
70 VkQueryPipelineStatisticFlags pipeline_statistics = 0;
71 switch (pCreateInfo->queryType) {
72 case VK_QUERY_TYPE_OCCLUSION:
73 /* Occlusion queries have two values: begin and end. */
74 uint64s_per_slot += 2;
75 break;
76 case VK_QUERY_TYPE_TIMESTAMP:
77 /* Timestamps just have the one timestamp value */
78 uint64s_per_slot += 1;
79 break;
80 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
81 pipeline_statistics = pCreateInfo->pipelineStatistics;
82 /* We're going to trust this field implicitly so we need to ensure that
83 * no unhandled extension bits leak in.
84 */
85 pipeline_statistics &= ANV_PIPELINE_STATISTICS_MASK;
86
87 /* Statistics queries have a min and max for every statistic */
88 uint64s_per_slot += 2 * util_bitcount(pipeline_statistics);
89 break;
90 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
91 /* Transform feedback queries are 4 values, begin/end for
92 * written/available.
93 */
94 uint64s_per_slot += 4;
95 break;
96 case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
97 uint64s_per_slot = 72; /* 576 bytes, see layout below */
98 break;
99 }
100 default:
101 assert(!"Invalid query type");
102 }
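
   /* For reference (derived from the Begin/End/GetQueryPoolResults code
    * below), the resulting slot layouts in 64-bit words are:
    *
    *    OCCLUSION:           availability, begin depth count, end depth count
    *    TIMESTAMP:           availability, timestamp
    *    PIPELINE_STATISTICS: availability, then a begin/end pair for each
    *                         enabled statistic
    *    TRANSFORM_FEEDBACK:  availability, begin/end primitives written,
    *                         begin/end primitives needed
    */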

   pool = vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
                    VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (pool == NULL)
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

   pool->type = pCreateInfo->queryType;
   pool->pipeline_statistics = pipeline_statistics;
   pool->stride = uint64s_per_slot * sizeof(uint64_t);
   pool->slots = pCreateInfo->queryCount;

   uint32_t bo_flags = 0;
   if (pdevice->supports_48bit_addresses)
      bo_flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS;

   if (pdevice->use_softpin)
      bo_flags |= EXEC_OBJECT_PINNED;

   if (pdevice->has_exec_async)
      bo_flags |= EXEC_OBJECT_ASYNC;

   uint64_t size = pool->slots * pool->stride;
   result = anv_device_alloc_bo(device, size,
                                ANV_BO_ALLOC_MAPPED |
                                ANV_BO_ALLOC_SNOOPED,
                                0 /* explicit_address */,
                                &pool->bo);
   if (result != VK_SUCCESS)
      goto fail;

   *pQueryPool = anv_query_pool_to_handle(pool);

   return VK_SUCCESS;

 fail:
   vk_free2(&device->alloc, pAllocator, pool);

   return result;
}

void genX(DestroyQueryPool)(
    VkDevice                                    _device,
    VkQueryPool                                 _pool,
    const VkAllocationCallbacks*                pAllocator)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_query_pool, pool, _pool);

   if (!pool)
      return;

   anv_device_release_bo(device, pool->bo);
   vk_free2(&device->alloc, pAllocator, pool);
}

static struct anv_address
anv_query_address(struct anv_query_pool *pool, uint32_t query)
{
   return (struct anv_address) {
      .bo = pool->bo,
      .offset = query * pool->stride,
   };
}

/**
 * VK_INTEL_performance_query layout (576 bytes):
 *
 * ------------------------------
 * | availability (8b)          |
 * |----------------------------|
 * | marker (8b)                |
 * |----------------------------|
 * | begin RPSTAT register (4b) |
 * |----------------------------|
 * | end RPSTAT register (4b)   |
 * |----------------------------|
 * | begin perfcntr 1 & 2 (16b) |
 * |----------------------------|
 * | end perfcntr 1 & 2 (16b)   |
 * |----------------------------|
 * | Unused (8b)                |
 * |----------------------------|
 * | begin MI_RPC (256b)        |
 * |----------------------------|
 * | end MI_RPC (256b)          |
 * ------------------------------
 */
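
/* The helpers below return byte offsets into the layout above: availability
 * at 0, marker at 8, RPSTAT begin/end at 16/20, perf counters begin/end at
 * 24/40, and the MI_RPC reports begin/end at 64/320 (the begin report sits
 * at a 64-byte-aligned offset, hence the 8 unused bytes at offset 56).  The
 * pieces add up to 8 + 8 + 4 + 4 + 16 + 16 + 8 + 256 + 256 = 576 bytes,
 * matching the 72 uint64s reserved per slot in genX(CreateQueryPool).
 */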

static uint32_t
intel_perf_marker_offset(void)
{
   return 8;
}

static uint32_t
intel_perf_rpstart_offset(bool end)
{
   return 16 + (end ? sizeof(uint32_t) : 0);
}

static uint32_t
intel_perf_counter(bool end)
{
   return 24 + (end ? (2 * sizeof(uint64_t)) : 0);
}

static uint32_t
intel_perf_mi_rpc_offset(bool end)
{
   return 64 + (end ? 256 : 0);
}

static void
cpu_write_query_result(void *dst_slot, VkQueryResultFlags flags,
                       uint32_t value_index, uint64_t result)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      uint64_t *dst64 = dst_slot;
      dst64[value_index] = result;
   } else {
      uint32_t *dst32 = dst_slot;
      dst32[value_index] = result;
   }
}

static void *
query_slot(struct anv_query_pool *pool, uint32_t query)
{
   return pool->bo->map + query * pool->stride;
}

static bool
query_is_available(struct anv_query_pool *pool, uint32_t query)
{
   return *(volatile uint64_t *)query_slot(pool, query);
}

static VkResult
wait_for_available(struct anv_device *device,
                   struct anv_query_pool *pool, uint32_t query)
{
   uint64_t abs_timeout = anv_get_absolute_timeout(5 * NSEC_PER_SEC);

   while (anv_gettime_ns() < abs_timeout) {
      if (query_is_available(pool, query))
         return VK_SUCCESS;
      VkResult status = anv_device_query_status(device);
      if (status != VK_SUCCESS)
         return status;
   }

   return anv_device_set_lost(device, "query timeout");
}

VkResult genX(GetQueryPoolResults)(
    VkDevice                                    _device,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    size_t                                      dataSize,
    void*                                       pData,
    VkDeviceSize                                stride,
    VkQueryResultFlags                          flags)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   assert(pool->type == VK_QUERY_TYPE_OCCLUSION ||
          pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
          pool->type == VK_QUERY_TYPE_TIMESTAMP ||
          pool->type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT ||
          pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL);

   if (anv_device_is_lost(device))
      return VK_ERROR_DEVICE_LOST;

   if (pData == NULL)
      return VK_SUCCESS;

   void *data_end = pData + dataSize;

   VkResult status = VK_SUCCESS;
   for (uint32_t i = 0; i < queryCount; i++) {
      bool available = query_is_available(pool, firstQuery + i);

      if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) {
         status = wait_for_available(device, pool, firstQuery + i);
         if (status != VK_SUCCESS)
            return status;

         available = true;
      }

      /* From the Vulkan 1.0.42 spec:
       *
       *    "If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
       *    both not set then no result values are written to pData for
       *    queries that are in the unavailable state at the time of the call,
       *    and vkGetQueryPoolResults returns VK_NOT_READY. However,
       *    availability state is still written to pData for those queries if
       *    VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set."
       */
      bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT);

      uint32_t idx = 0;
      switch (pool->type) {
      case VK_QUERY_TYPE_OCCLUSION: {
         uint64_t *slot = query_slot(pool, firstQuery + i);
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]);
         idx++;
         break;
      }

      case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
         uint64_t *slot = query_slot(pool, firstQuery + i);
         uint32_t statistics = pool->pipeline_statistics;
         while (statistics) {
            uint32_t stat = u_bit_scan(&statistics);
            if (write_results) {
               uint64_t result = slot[idx * 2 + 2] - slot[idx * 2 + 1];

               /* WaDividePSInvocationCountBy4:HSW,BDW */
               if ((device->info.gen == 8 || device->info.is_haswell) &&
                   (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT)
                  result >>= 2;

               cpu_write_query_result(pData, flags, idx, result);
            }
            idx++;
         }
         assert(idx == util_bitcount(pool->pipeline_statistics));
         break;
      }

      case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
         uint64_t *slot = query_slot(pool, firstQuery + i);
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]);
         idx++;
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[4] - slot[3]);
         idx++;
         break;
      }

      case VK_QUERY_TYPE_TIMESTAMP: {
         uint64_t *slot = query_slot(pool, firstQuery + i);
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[1]);
         idx++;
         break;
      }

      case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
         if (!write_results)
            break;
         const void *query_data = query_slot(pool, firstQuery + i);
         const uint32_t *oa_begin = query_data + intel_perf_mi_rpc_offset(false);
         const uint32_t *oa_end = query_data + intel_perf_mi_rpc_offset(true);
         const uint32_t *rpstat_begin = query_data + intel_perf_rpstart_offset(false);
         const uint32_t *rpstat_end = query_data + intel_perf_rpstart_offset(true);
         struct gen_perf_query_result result;
         struct gen_perf_query_info metric = {
            .oa_format = (GEN_GEN >= 8 ?
                          I915_OA_FORMAT_A32u40_A4u32_B8_C8 :
                          I915_OA_FORMAT_A45_B8_C8),
         };
         uint32_t core_freq[2];
#if GEN_GEN < 9
         core_freq[0] = ((*rpstat_begin >> 7) & 0x7f) * 1000000ULL;
         core_freq[1] = ((*rpstat_end >> 7) & 0x7f) * 1000000ULL;
#else
         core_freq[0] = ((*rpstat_begin >> 23) & 0x1ff) * 1000000ULL;
         core_freq[1] = ((*rpstat_end >> 23) & 0x1ff) * 1000000ULL;
#endif
         gen_perf_query_result_clear(&result);
         gen_perf_query_result_accumulate(&result, &metric,
                                          oa_begin, oa_end);
         gen_perf_query_result_read_frequencies(&result, &device->info,
                                                oa_begin, oa_end);
         gen_perf_query_result_write_mdapi(pData, stride,
                                           &device->info,
                                           &result,
                                           core_freq[0], core_freq[1]);
         gen_perf_query_mdapi_write_perfcntr(pData, stride, &device->info,
                                             query_data + intel_perf_counter(false),
                                             query_data + intel_perf_counter(true));
         const uint64_t *marker = query_data + intel_perf_marker_offset();
         gen_perf_query_mdapi_write_marker(pData, stride, &device->info, *marker);
         break;
      }

      default:
         unreachable("invalid pool type");
      }

      if (!write_results)
         status = VK_NOT_READY;

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
         cpu_write_query_result(pData, flags, idx, available);

      pData += stride;
      if (pData >= data_end)
         break;
   }

   return status;
}

static void
emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer,
                    struct anv_address addr)
{
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType = DAT_PPGTT;
      pc.PostSyncOperation = WritePSDepthCount;
      pc.DepthStallEnable = true;
      pc.Address = addr;

      if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
         pc.CommandStreamerStallEnable = true;
   }
}

static void
emit_query_mi_availability(struct gen_mi_builder *b,
                           struct anv_address addr,
                           bool available)
{
   gen_mi_store(b, gen_mi_mem64(addr), gen_mi_imm(available));
}

static void
emit_query_pc_availability(struct anv_cmd_buffer *cmd_buffer,
                           struct anv_address addr,
                           bool available)
{
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType = DAT_PPGTT;
      pc.PostSyncOperation = WriteImmediateData;
      pc.Address = addr;
      pc.ImmediateData = available;
   }
}

/**
 * Goes through a series of consecutive query indices in the given pool
 * setting all element values to 0 and emitting them as available.
 */
static void
emit_zero_queries(struct anv_cmd_buffer *cmd_buffer,
                  struct gen_mi_builder *b, struct anv_query_pool *pool,
                  uint32_t first_index, uint32_t num_queries)
{
   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
      /* These queries are written with a PIPE_CONTROL so clear them using the
       * PIPE_CONTROL as well so we don't have to synchronize between 2 types
       * of operations.
       */
      assert((pool->stride % 8) == 0);
      for (uint32_t i = 0; i < num_queries; i++) {
         struct anv_address slot_addr =
            anv_query_address(pool, first_index + i);

         for (uint32_t qword = 1; qword < (pool->stride / 8); qword++) {
            emit_query_pc_availability(cmd_buffer,
                                       anv_address_add(slot_addr, qword * 8),
                                       false);
         }
         emit_query_pc_availability(cmd_buffer, slot_addr, true);
      }
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      for (uint32_t i = 0; i < num_queries; i++) {
         struct anv_address slot_addr =
            anv_query_address(pool, first_index + i);
         gen_mi_memset(b, anv_address_add(slot_addr, 8), 0, pool->stride - 8);
         emit_query_mi_availability(b, slot_addr, true);
      }
      break;

   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL:
      for (uint32_t i = 0; i < num_queries; i++) {
         struct anv_address slot_addr =
            anv_query_address(pool, first_index + i);
         gen_mi_memset(b, anv_address_add(slot_addr, 8), 0, pool->stride - 8);
         emit_query_mi_availability(b, slot_addr, true);
      }
      break;

   default:
      unreachable("Unsupported query type");
   }
}

void genX(CmdResetQueryPool)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
      for (uint32_t i = 0; i < queryCount; i++) {
         emit_query_pc_availability(cmd_buffer,
                                    anv_query_address(pool, firstQuery + i),
                                    false);
      }
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
      struct gen_mi_builder b;
      gen_mi_builder_init(&b, &cmd_buffer->batch);

      for (uint32_t i = 0; i < queryCount; i++)
         emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), false);
      break;
   }

   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
      struct gen_mi_builder b;
      gen_mi_builder_init(&b, &cmd_buffer->batch);

      for (uint32_t i = 0; i < queryCount; i++)
         emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), false);
      break;
   }

   default:
      unreachable("Unsupported query type");
   }
}

void genX(ResetQueryPool)(
    VkDevice                                    _device,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount)
{
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   for (uint32_t i = 0; i < queryCount; i++) {
      uint64_t *slot = query_slot(pool, firstQuery + i);
      *slot = 0;
   }
}
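
/* Indexed by the bit position of the corresponding
 * VkQueryPipelineStatisticFlagBits flag; emit_pipeline_stat() below indexes
 * this table with the bit number returned by u_bit_scan(), and the
 * STATIC_ASSERT there checks the table covers exactly
 * ANV_PIPELINE_STATISTICS_MASK.
 */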
static const uint32_t vk_pipeline_stat_to_reg[] = {
   GENX(IA_VERTICES_COUNT_num),
   GENX(IA_PRIMITIVES_COUNT_num),
   GENX(VS_INVOCATION_COUNT_num),
   GENX(GS_INVOCATION_COUNT_num),
   GENX(GS_PRIMITIVES_COUNT_num),
   GENX(CL_INVOCATION_COUNT_num),
   GENX(CL_PRIMITIVES_COUNT_num),
   GENX(PS_INVOCATION_COUNT_num),
   GENX(HS_INVOCATION_COUNT_num),
   GENX(DS_INVOCATION_COUNT_num),
   GENX(CS_INVOCATION_COUNT_num),
};

static void
emit_pipeline_stat(struct gen_mi_builder *b, uint32_t stat,
                   struct anv_address addr)
{
   STATIC_ASSERT(ANV_PIPELINE_STATISTICS_MASK ==
                 (1 << ARRAY_SIZE(vk_pipeline_stat_to_reg)) - 1);

   assert(stat < ARRAY_SIZE(vk_pipeline_stat_to_reg));
   gen_mi_store(b, gen_mi_mem64(addr),
                gen_mi_reg64(vk_pipeline_stat_to_reg[stat]));
}
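
/* Writes the two transform feedback counters for the given stream at addr:
 * SO_NUM_PRIMS_WRITTEN at offset 0 and SO_PRIM_STORAGE_NEEDED at offset 16.
 * Begin/End pass query_addr + 8 and query_addr + 16 respectively, so the
 * counters end up interleaved as begin/end pairs within the query slot.
 */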
static void
emit_xfb_query(struct gen_mi_builder *b, uint32_t stream,
               struct anv_address addr)
{
   assert(stream < MAX_XFB_STREAMS);

   gen_mi_store(b, gen_mi_mem64(anv_address_add(addr, 0)),
                gen_mi_reg64(GENX(SO_NUM_PRIMS_WRITTEN0_num) + stream * 8));
   gen_mi_store(b, gen_mi_mem64(anv_address_add(addr, 16)),
                gen_mi_reg64(GENX(SO_PRIM_STORAGE_NEEDED0_num) + stream * 8));
}

void genX(CmdBeginQuery)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query,
    VkQueryControlFlags                         flags)
{
   genX(CmdBeginQueryIndexedEXT)(commandBuffer, queryPool, query, flags, 0);
}

void genX(CmdBeginQueryIndexedEXT)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query,
    VkQueryControlFlags                         flags,
    uint32_t                                    index)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   struct anv_address query_addr = anv_query_address(pool, query);

   struct gen_mi_builder b;
   gen_mi_builder_init(&b, &cmd_buffer->batch);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 8));
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      /* TODO: This might only be necessary for certain stats */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      uint32_t statistics = pool->pipeline_statistics;
      uint32_t offset = 8;
      while (statistics) {
         uint32_t stat = u_bit_scan(&statistics);
         emit_pipeline_stat(&b, stat, anv_address_add(query_addr, offset));
         offset += 16;
      }
      break;
   }

   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
      emit_xfb_query(&b, index, anv_address_add(query_addr, 8));
      break;

   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_REPORT_PERF_COUNT), rpc) {
         rpc.MemoryAddress =
            anv_address_add(query_addr, intel_perf_mi_rpc_offset(false));
      }
#if GEN_GEN < 9
      gen_mi_store(&b,
                   gen_mi_mem32(anv_address_add(query_addr,
                                                intel_perf_rpstart_offset(false))),
                   gen_mi_reg32(GENX(RPSTAT1_num)));
#else
      gen_mi_store(&b,
                   gen_mi_mem32(anv_address_add(query_addr,
                                                intel_perf_rpstart_offset(false))),
                   gen_mi_reg32(GENX(RPSTAT0_num)));
#endif
#if GEN_GEN >= 8 && GEN_GEN <= 11
      gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr,
                                                    intel_perf_counter(false))),
                   gen_mi_reg64(GENX(PERFCNT1_num)));
      gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr,
                                                    intel_perf_counter(false) + 8)),
                   gen_mi_reg64(GENX(PERFCNT2_num)));
#endif
      break;
   }

   default:
      unreachable("");
   }
}

void genX(CmdEndQuery)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query)
{
   genX(CmdEndQueryIndexedEXT)(commandBuffer, queryPool, query, 0);
}

void genX(CmdEndQueryIndexedEXT)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query,
    uint32_t                                    index)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   struct anv_address query_addr = anv_query_address(pool, query);

   struct gen_mi_builder b;
   gen_mi_builder_init(&b, &cmd_buffer->batch);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 16));
      emit_query_pc_availability(cmd_buffer, query_addr, true);
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      /* TODO: This might only be necessary for certain stats */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      uint32_t statistics = pool->pipeline_statistics;
      uint32_t offset = 16;
      while (statistics) {
         uint32_t stat = u_bit_scan(&statistics);
         emit_pipeline_stat(&b, stat, anv_address_add(query_addr, offset));
         offset += 16;
      }

      emit_query_mi_availability(&b, query_addr, true);
      break;
   }

   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      emit_xfb_query(&b, index, anv_address_add(query_addr, 16));
      emit_query_mi_availability(&b, query_addr, true);
      break;

   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
      uint32_t marker_offset = intel_perf_marker_offset();
      gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, marker_offset)),
                   gen_mi_imm(cmd_buffer->intel_perf_marker));
#if GEN_GEN >= 8 && GEN_GEN <= 11
      gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, intel_perf_counter(true))),
                   gen_mi_reg64(GENX(PERFCNT1_num)));
      gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, intel_perf_counter(true) + 8)),
                   gen_mi_reg64(GENX(PERFCNT2_num)));
#endif
#if GEN_GEN < 9
      gen_mi_store(&b,
                   gen_mi_mem32(anv_address_add(query_addr,
                                                intel_perf_rpstart_offset(true))),
                   gen_mi_reg32(GENX(RPSTAT1_num)));
#else
      gen_mi_store(&b,
                   gen_mi_mem32(anv_address_add(query_addr,
                                                intel_perf_rpstart_offset(true))),
                   gen_mi_reg32(GENX(RPSTAT0_num)));
#endif
      /* Position the last OA snapshot at the beginning of the query so that
       * we can tell whether it's ready.
       */
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_REPORT_PERF_COUNT), rpc) {
         rpc.MemoryAddress = anv_address_add(query_addr,
                                             intel_perf_mi_rpc_offset(true));
         rpc.ReportID = 0xdeadbeef; /* This goes in the first dword */
      }
      emit_query_mi_availability(&b, query_addr, true);
      break;
   }

   default:
      unreachable("");
   }

   /* When multiview is active the spec requires that N consecutive query
    * indices are used, where N is the number of active views in the subpass.
    * The spec allows us to write the results to only one of the queries, but
    * we still need to manage result availability for all the query indices.
    * Since we only emit a single query for all active views in the first
    * index, mark the other query indices as being already available with
    * result 0.
    */
   if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
      const uint32_t num_queries =
         util_bitcount(cmd_buffer->state.subpass->view_mask);
      if (num_queries > 1)
         emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1);
   }
}

#define TIMESTAMP 0x2358 /* MMIO offset of the render command streamer TIMESTAMP register */

void genX(CmdWriteTimestamp)(
    VkCommandBuffer                             commandBuffer,
    VkPipelineStageFlagBits                     pipelineStage,
    VkQueryPool                                 queryPool,
    uint32_t                                    query)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   struct anv_address query_addr = anv_query_address(pool, query);

   assert(pool->type == VK_QUERY_TYPE_TIMESTAMP);

   struct gen_mi_builder b;
   gen_mi_builder_init(&b, &cmd_buffer->batch);

   switch (pipelineStage) {
   case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
      gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, 8)),
                   gen_mi_reg64(TIMESTAMP));
      break;

   default:
      /* Everything else is bottom-of-pipe */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.DestinationAddressType = DAT_PPGTT;
         pc.PostSyncOperation = WriteTimestamp;
         pc.Address = anv_address_add(query_addr, 8);

         if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
            pc.CommandStreamerStallEnable = true;
      }
      break;
   }

   emit_query_pc_availability(cmd_buffer, query_addr, true);

   /* When multiview is active the spec requires that N consecutive query
    * indices are used, where N is the number of active views in the subpass.
    * The spec allows us to write the results to only one of the queries, but
    * we still need to manage result availability for all the query indices.
    * Since we only emit a single query for all active views in the first
    * index, mark the other query indices as being already available with
    * result 0.
    */
   if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
      const uint32_t num_queries =
         util_bitcount(cmd_buffer->state.subpass->view_mask);
      if (num_queries > 1)
         emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1);
   }
}
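
/* CmdCopyQueryPoolResults computes results on the GPU using the
 * gen_mi_builder math helpers (e.g. gen_mi_isub in compute_query_result
 * below), which rely on MI_MATH.  MI_MATH is only available on Haswell and
 * later, so Ivy Bridge falls back to the stub at the bottom of the file.
 */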
#if GEN_GEN > 7 || GEN_IS_HASWELL

static void
gpu_write_query_result(struct gen_mi_builder *b,
                       struct anv_address dst_addr,
                       VkQueryResultFlags flags,
                       uint32_t value_index,
                       struct gen_mi_value query_result)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      struct anv_address res_addr = anv_address_add(dst_addr, value_index * 8);
      gen_mi_store(b, gen_mi_mem64(res_addr), query_result);
   } else {
      struct anv_address res_addr = anv_address_add(dst_addr, value_index * 4);
      gen_mi_store(b, gen_mi_mem32(res_addr), query_result);
   }
}

static struct gen_mi_value
compute_query_result(struct gen_mi_builder *b, struct anv_address addr)
{
   return gen_mi_isub(b, gen_mi_mem64(anv_address_add(addr, 8)),
                      gen_mi_mem64(anv_address_add(addr, 0)));
}

void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    VkBuffer                                    destBuffer,
    VkDeviceSize                                destOffset,
    VkDeviceSize                                destStride,
    VkQueryResultFlags                          flags)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);

   struct gen_mi_builder b;
   gen_mi_builder_init(&b, &cmd_buffer->batch);
   struct gen_mi_value result;

   /* If render target writes are ongoing, request a render target cache flush
    * to ensure proper ordering of the commands from the 3d pipe and the
    * command streamer.
    */
   if (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_RENDER_TARGET_BUFFER_WRITES) {
      cmd_buffer->state.pending_pipe_bits |=
         ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
   }

   if ((flags & VK_QUERY_RESULT_WAIT_BIT) ||
       (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_FLUSH_BITS) ||
       /* Occlusion & timestamp queries are written using a PIPE_CONTROL and
        * because we're about to copy values from MI commands, we need to
        * stall the command streamer to make sure the PIPE_CONTROL values have
        * landed, otherwise we could see inconsistent values & availability.
        *
        * From the vulkan spec:
        *
        *    "vkCmdCopyQueryPoolResults is guaranteed to see the effect of
        *    previous uses of vkCmdResetQueryPool in the same queue, without
        *    any additional synchronization."
        */
       pool->type == VK_QUERY_TYPE_OCCLUSION ||
       pool->type == VK_QUERY_TYPE_TIMESTAMP) {
      cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT;
      genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
   }

   struct anv_address dest_addr = anv_address_add(buffer->address, destOffset);
   for (uint32_t i = 0; i < queryCount; i++) {
      struct anv_address query_addr = anv_query_address(pool, firstQuery + i);
      uint32_t idx = 0;
      switch (pool->type) {
      case VK_QUERY_TYPE_OCCLUSION:
         result = compute_query_result(&b, anv_address_add(query_addr, 8));
         gpu_write_query_result(&b, dest_addr, flags, idx++, result);
         break;

      case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
         uint32_t statistics = pool->pipeline_statistics;
         while (statistics) {
            uint32_t stat = u_bit_scan(&statistics);

            result = compute_query_result(&b, anv_address_add(query_addr,
                                                              idx * 16 + 8));

            /* WaDividePSInvocationCountBy4:HSW,BDW */
            if ((cmd_buffer->device->info.gen == 8 ||
                 cmd_buffer->device->info.is_haswell) &&
                (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) {
               result = gen_mi_ushr32_imm(&b, result, 2);
            }

            gpu_write_query_result(&b, dest_addr, flags, idx++, result);
         }
         assert(idx == util_bitcount(pool->pipeline_statistics));
         break;
      }

      case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
         result = compute_query_result(&b, anv_address_add(query_addr, 8));
         gpu_write_query_result(&b, dest_addr, flags, idx++, result);
         result = compute_query_result(&b, anv_address_add(query_addr, 24));
         gpu_write_query_result(&b, dest_addr, flags, idx++, result);
         break;

      case VK_QUERY_TYPE_TIMESTAMP:
         result = gen_mi_mem64(anv_address_add(query_addr, 8));
         gpu_write_query_result(&b, dest_addr, flags, 0, result);
         break;

      default:
         unreachable("unhandled query type");
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         gpu_write_query_result(&b, dest_addr, flags, idx,
                                gen_mi_mem64(query_addr));
      }

      dest_addr = anv_address_add(dest_addr, destStride);
   }
}

#else
void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    VkBuffer                                    destBuffer,
    VkDeviceSize                                destOffset,
    VkDeviceSize                                destStride,
    VkQueryResultFlags                          flags)
{
   anv_finishme("Queries not yet supported on Ivy Bridge");
}
#endif