src/intel/vulkan/genX_query.c (mesa.git)
/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <assert.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>

#include "anv_private.h"

#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"

/* We reserve GPR 14 and 15 for conditional rendering */
#define GEN_MI_BUILDER_NUM_ALLOC_GPRS 14
#define __gen_get_batch_dwords anv_batch_emit_dwords
#define __gen_address_offset anv_address_add
#include "common/gen_mi_builder.h"
#include "perf/gen_perf.h"
#include "perf/gen_perf_mdapi.h"

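/* Size of a single MI_REPORT_PERF_COUNT (OA) report, expressed in 64-bit
 * words: an OA report is 256 bytes.
 */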
#define OA_REPORT_N_UINT64 (256 / sizeof(uint64_t))

VkResult genX(CreateQueryPool)(
    VkDevice                                    _device,
    const VkQueryPoolCreateInfo*                pCreateInfo,
    const VkAllocationCallbacks*                pAllocator,
    VkQueryPool*                                pQueryPool)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   const struct anv_physical_device *pdevice = &device->instance->physicalDevice;
   struct anv_query_pool *pool;
   VkResult result;

   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);

   /* Query pool slots are made up of some number of 64-bit values packed
    * tightly together. For most query types, the first 64-bit value is the
    * "available" bit, which is 0 when the query is unavailable and 1 when
    * it is available. The 64-bit values that follow are determined by the
    * type of query.
    *
    * For performance queries, OA reports need to be aligned to 64 bytes, so
    * we put those first and keep the "available" bit behind, together with
    * some other counters.
    */
   uint32_t uint64s_per_slot = 1;

   VkQueryPipelineStatisticFlags pipeline_statistics = 0;
   switch (pCreateInfo->queryType) {
   case VK_QUERY_TYPE_OCCLUSION:
      /* Occlusion queries have two values: begin and end. */
      uint64s_per_slot += 2;
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
      /* Timestamps just have the one timestamp value */
      uint64s_per_slot += 1;
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      pipeline_statistics = pCreateInfo->pipelineStatistics;
      /* We're going to trust this field implicitly so we need to ensure that
       * no unhandled extension bits leak in.
       */
      pipeline_statistics &= ANV_PIPELINE_STATISTICS_MASK;

      /* Statistics queries have a min and max for every statistic */
      uint64s_per_slot += 2 * util_bitcount(pipeline_statistics);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      /* Transform feedback queries are 4 values, begin/end for
       * written/available.
       */
      uint64s_per_slot += 4;
      break;
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
      uint64s_per_slot = 72; /* 576 bytes, see layout below */
      break;
   }
   default:
      assert(!"Invalid query type");
   }

   pool = vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
                    VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (pool == NULL)
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

   pool->type = pCreateInfo->queryType;
   pool->pipeline_statistics = pipeline_statistics;
   pool->stride = uint64s_per_slot * sizeof(uint64_t);
   pool->slots = pCreateInfo->queryCount;

   uint32_t bo_flags = 0;
   if (pdevice->supports_48bit_addresses)
      bo_flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS;

   if (pdevice->use_softpin)
      bo_flags |= EXEC_OBJECT_PINNED;

   if (pdevice->has_exec_async)
      bo_flags |= EXEC_OBJECT_ASYNC;

   uint64_t size = pool->slots * pool->stride;
   result = anv_device_alloc_bo(device, size,
                                ANV_BO_ALLOC_MAPPED |
                                ANV_BO_ALLOC_SNOOPED,
                                &pool->bo);
   if (result != VK_SUCCESS)
      goto fail;

   *pQueryPool = anv_query_pool_to_handle(pool);

   return VK_SUCCESS;

 fail:
   vk_free2(&device->alloc, pAllocator, pool);

   return result;
}

void genX(DestroyQueryPool)(
    VkDevice                                    _device,
    VkQueryPool                                 _pool,
    const VkAllocationCallbacks*                pAllocator)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_query_pool, pool, _pool);

   if (!pool)
      return;

   anv_device_release_bo(device, pool->bo);
   vk_free2(&device->alloc, pAllocator, pool);
}

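/* Returns the GPU address of the given query slot within the pool's
 * backing BO.
 */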
static struct anv_address
anv_query_address(struct anv_query_pool *pool, uint32_t query)
{
   return (struct anv_address) {
      .bo = pool->bo,
      .offset = query * pool->stride,
   };
}

/**
 * VK_INTEL_performance_query layout (576 bytes) :
 *
 * ------------------------------
 * |       availability (8b)    |
 * |----------------------------|
 * |         marker (8b)        |
 * |----------------------------|
 * | begin RPSTAT register (4b) |
 * |----------------------------|
 * |  end RPSTAT register (4b)  |
 * |----------------------------|
 * | begin perfcntr 1 & 2 (16b) |
 * |----------------------------|
 * |  end perfcntr 1 & 2 (16b)  |
 * |----------------------------|
 * |          Unused (8b)       |
 * |----------------------------|
 * |     begin MI_RPC (256b)    |
 * |----------------------------|
 * |       end MI_RPC (256b)    |
 * ------------------------------
 */

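/* The helpers below return byte offsets into the layout above. */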
static uint32_t
intel_perf_marker_offset(void)
{
   return 8;
}

static uint32_t
intel_perf_rpstart_offset(bool end)
{
   return 16 + (end ? sizeof(uint32_t) : 0);
}

static uint32_t
intel_perf_counter(bool end)
{
   return 24 + (end ? (2 * sizeof(uint64_t)) : 0);
}

static uint32_t
intel_perf_mi_rpc_offset(bool end)
{
   return 64 + (end ? 256 : 0);
}

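/* Writes a single result value into the client-provided buffer, honoring
 * the 32/64-bit result flag.
 */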
static void
cpu_write_query_result(void *dst_slot, VkQueryResultFlags flags,
                       uint32_t value_index, uint64_t result)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      uint64_t *dst64 = dst_slot;
      dst64[value_index] = result;
   } else {
      uint32_t *dst32 = dst_slot;
      dst32[value_index] = result;
   }
}

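/* Returns the CPU-mapped pointer to the given query slot. */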
static void *
query_slot(struct anv_query_pool *pool, uint32_t query)
{
   return pool->bo->map + query * pool->stride;
}

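/* Reads the slot's availability value through the coherent (snooped) CPU
 * mapping of the pool BO.
 */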
static bool
query_is_available(struct anv_query_pool *pool, uint32_t query)
{
   return *(volatile uint64_t *)query_slot(pool, query);
}

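/* Busy-waits for up to 5 seconds for the query to become available,
 * checking for device loss along the way.
 */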
static VkResult
wait_for_available(struct anv_device *device,
                   struct anv_query_pool *pool, uint32_t query)
{
   uint64_t abs_timeout = anv_get_absolute_timeout(5 * NSEC_PER_SEC);

   while (anv_gettime_ns() < abs_timeout) {
      if (query_is_available(pool, query))
         return VK_SUCCESS;
      VkResult status = anv_device_query_status(device);
      if (status != VK_SUCCESS)
         return status;
   }

   return anv_device_set_lost(device, "query timeout");
}

VkResult genX(GetQueryPoolResults)(
    VkDevice                                    _device,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    size_t                                      dataSize,
    void*                                       pData,
    VkDeviceSize                                stride,
    VkQueryResultFlags                          flags)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   assert(pool->type == VK_QUERY_TYPE_OCCLUSION ||
          pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
          pool->type == VK_QUERY_TYPE_TIMESTAMP ||
          pool->type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT ||
          pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL);

   if (anv_device_is_lost(device))
      return VK_ERROR_DEVICE_LOST;

   if (pData == NULL)
      return VK_SUCCESS;

   void *data_end = pData + dataSize;

   VkResult status = VK_SUCCESS;
   for (uint32_t i = 0; i < queryCount; i++) {
      bool available = query_is_available(pool, firstQuery + i);

      if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) {
         status = wait_for_available(device, pool, firstQuery + i);
         if (status != VK_SUCCESS)
            return status;

         available = true;
      }

      /* From the Vulkan 1.0.42 spec:
       *
       *    "If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
       *    both not set then no result values are written to pData for
       *    queries that are in the unavailable state at the time of the call,
       *    and vkGetQueryPoolResults returns VK_NOT_READY. However,
       *    availability state is still written to pData for those queries if
       *    VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set."
       */
      bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT);

      uint32_t idx = 0;
      switch (pool->type) {
      case VK_QUERY_TYPE_OCCLUSION: {
         uint64_t *slot = query_slot(pool, firstQuery + i);
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]);
         idx++;
         break;
      }

      case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
         uint64_t *slot = query_slot(pool, firstQuery + i);
         uint32_t statistics = pool->pipeline_statistics;
         while (statistics) {
            uint32_t stat = u_bit_scan(&statistics);
            if (write_results) {
               uint64_t result = slot[idx * 2 + 2] - slot[idx * 2 + 1];

               /* WaDividePSInvocationCountBy4:HSW,BDW */
               if ((device->info.gen == 8 || device->info.is_haswell) &&
                   (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT)
                  result >>= 2;

               cpu_write_query_result(pData, flags, idx, result);
            }
            idx++;
         }
         assert(idx == util_bitcount(pool->pipeline_statistics));
         break;
      }

      case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
         uint64_t *slot = query_slot(pool, firstQuery + i);
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]);
         idx++;
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[4] - slot[3]);
         idx++;
         break;
      }

      case VK_QUERY_TYPE_TIMESTAMP: {
         uint64_t *slot = query_slot(pool, firstQuery + i);
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[1]);
         idx++;
         break;
      }

      case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
         if (!write_results)
            break;
         const void *query_data = query_slot(pool, firstQuery + i);
         const uint32_t *oa_begin = query_data + intel_perf_mi_rpc_offset(false);
         const uint32_t *oa_end = query_data + intel_perf_mi_rpc_offset(true);
         const uint32_t *rpstat_begin = query_data + intel_perf_rpstart_offset(false);
         const uint32_t *rpstat_end = query_data + intel_perf_rpstart_offset(true);
         struct gen_perf_query_result result;
         struct gen_perf_query_info metric = {
            .oa_format = (GEN_GEN >= 8 ?
                          I915_OA_FORMAT_A32u40_A4u32_B8_C8 :
                          I915_OA_FORMAT_A45_B8_C8),
         };
         uint32_t core_freq[2];
#if GEN_GEN < 9
         core_freq[0] = ((*rpstat_begin >> 7) & 0x7f) * 1000000ULL;
         core_freq[1] = ((*rpstat_end >> 7) & 0x7f) * 1000000ULL;
#else
         core_freq[0] = ((*rpstat_begin >> 23) & 0x1ff) * 1000000ULL;
         core_freq[1] = ((*rpstat_end >> 23) & 0x1ff) * 1000000ULL;
#endif
         gen_perf_query_result_clear(&result);
         gen_perf_query_result_accumulate(&result, &metric,
                                          oa_begin, oa_end);
         gen_perf_query_result_read_frequencies(&result, &device->info,
                                                oa_begin, oa_end);
         gen_perf_query_result_write_mdapi(pData, stride,
                                           &device->info,
                                           &result,
                                           core_freq[0], core_freq[1]);
         gen_perf_query_mdapi_write_perfcntr(pData, stride, &device->info,
                                             query_data + intel_perf_counter(false),
                                             query_data + intel_perf_counter(true));
         const uint64_t *marker = query_data + intel_perf_marker_offset();
         gen_perf_query_mdapi_write_marker(pData, stride, &device->info, *marker);
         break;
      }

      default:
         unreachable("invalid pool type");
      }

      if (!write_results)
         status = VK_NOT_READY;

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
         cpu_write_query_result(pData, flags, idx, available);

      pData += stride;
      if (pData >= data_end)
         break;
   }

   return status;
}

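/* Writes the current pixel-shader depth count (occlusion counter) to addr
 * using a depth-stalling PIPE_CONTROL.
 */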
static void
emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer,
                    struct anv_address addr)
{
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType = DAT_PPGTT;
      pc.PostSyncOperation = WritePSDepthCount;
      pc.DepthStallEnable = true;
      pc.Address = addr;

      if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
         pc.CommandStreamerStallEnable = true;
   }
}

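/* Marks a query (un)available with an MI immediate store from the command
 * streamer.
 */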
static void
emit_query_mi_availability(struct gen_mi_builder *b,
                           struct anv_address addr,
                           bool available)
{
   gen_mi_store(b, gen_mi_mem64(addr), gen_mi_imm(available));
}

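/* Marks a query (un)available with a post-sync PIPE_CONTROL write, so the
 * write is ordered against 3D pipeline work.
 */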
static void
emit_query_pc_availability(struct anv_cmd_buffer *cmd_buffer,
                           struct anv_address addr,
                           bool available)
{
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType = DAT_PPGTT;
      pc.PostSyncOperation = WriteImmediateData;
      pc.Address = addr;
      pc.ImmediateData = available;
   }
}

/**
 * Goes through a series of consecutive query indices in the given pool
 * setting all element values to 0 and emitting them as available.
 */
static void
emit_zero_queries(struct anv_cmd_buffer *cmd_buffer,
                  struct gen_mi_builder *b, struct anv_query_pool *pool,
                  uint32_t first_index, uint32_t num_queries)
{
   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
      /* These queries are written with a PIPE_CONTROL, so clear them using
       * PIPE_CONTROL as well so that we don't have to synchronize between
       * two types of operations.
       */
      assert((pool->stride % 8) == 0);
      for (uint32_t i = 0; i < num_queries; i++) {
         struct anv_address slot_addr =
            anv_query_address(pool, first_index + i);

         for (uint32_t qword = 1; qword < (pool->stride / 8); qword++) {
            emit_query_pc_availability(cmd_buffer,
                                       anv_address_add(slot_addr, qword * 8),
                                       false);
         }
         emit_query_pc_availability(cmd_buffer, slot_addr, true);
      }
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      for (uint32_t i = 0; i < num_queries; i++) {
         struct anv_address slot_addr =
            anv_query_address(pool, first_index + i);
         gen_mi_memset(b, anv_address_add(slot_addr, 8), 0, pool->stride - 8);
         emit_query_mi_availability(b, slot_addr, true);
      }
      break;

   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL:
      for (uint32_t i = 0; i < num_queries; i++) {
         struct anv_address slot_addr =
            anv_query_address(pool, first_index + i);
         gen_mi_memset(b, slot_addr, 0, pool->stride - 8);
         emit_query_mi_availability(b, anv_address_add(slot_addr,
                                                       pool->stride - 8), true);
      }
      break;

   default:
      unreachable("Unsupported query type");
   }
}

void genX(CmdResetQueryPool)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
      for (uint32_t i = 0; i < queryCount; i++) {
         emit_query_pc_availability(cmd_buffer,
                                    anv_query_address(pool, firstQuery + i),
                                    false);
      }
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
      struct gen_mi_builder b;
      gen_mi_builder_init(&b, &cmd_buffer->batch);

      for (uint32_t i = 0; i < queryCount; i++)
         emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), false);
      break;
   }

   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
      struct gen_mi_builder b;
      gen_mi_builder_init(&b, &cmd_buffer->batch);

      for (uint32_t i = 0; i < queryCount; i++) {
         emit_query_mi_availability(
            &b,
            anv_address_add(
               anv_query_address(pool, firstQuery + i),
               pool->stride - 8),
            false);
      }
      break;
   }

   default:
      unreachable("Unsupported query type");
   }
}

void genX(ResetQueryPoolEXT)(
    VkDevice                                    _device,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount)
{
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   for (uint32_t i = 0; i < queryCount; i++) {
      uint64_t *slot = query_slot(pool, firstQuery + i);
      *slot = 0;
   }
}

static const uint32_t vk_pipeline_stat_to_reg[] = {
   GENX(IA_VERTICES_COUNT_num),
   GENX(IA_PRIMITIVES_COUNT_num),
   GENX(VS_INVOCATION_COUNT_num),
   GENX(GS_INVOCATION_COUNT_num),
   GENX(GS_PRIMITIVES_COUNT_num),
   GENX(CL_INVOCATION_COUNT_num),
   GENX(CL_PRIMITIVES_COUNT_num),
   GENX(PS_INVOCATION_COUNT_num),
   GENX(HS_INVOCATION_COUNT_num),
   GENX(DS_INVOCATION_COUNT_num),
   GENX(CS_INVOCATION_COUNT_num),
};

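/* Stores the pipeline-statistics counter register corresponding to the
 * given statistic bit into addr.
 */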
static void
emit_pipeline_stat(struct gen_mi_builder *b, uint32_t stat,
                   struct anv_address addr)
{
   STATIC_ASSERT(ANV_PIPELINE_STATISTICS_MASK ==
                 (1 << ARRAY_SIZE(vk_pipeline_stat_to_reg)) - 1);

   assert(stat < ARRAY_SIZE(vk_pipeline_stat_to_reg));
   gen_mi_store(b, gen_mi_mem64(addr),
                gen_mi_reg64(vk_pipeline_stat_to_reg[stat]));
}

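/* Captures the primitives-written and primitives-needed counters for the
 * given transform feedback stream.
 */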
static void
emit_xfb_query(struct gen_mi_builder *b, uint32_t stream,
               struct anv_address addr)
{
   assert(stream < MAX_XFB_STREAMS);

   gen_mi_store(b, gen_mi_mem64(anv_address_add(addr, 0)),
                gen_mi_reg64(GENX(SO_NUM_PRIMS_WRITTEN0_num) + stream * 8));
   gen_mi_store(b, gen_mi_mem64(anv_address_add(addr, 16)),
                gen_mi_reg64(GENX(SO_PRIM_STORAGE_NEEDED0_num) + stream * 8));
}

void genX(CmdBeginQuery)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query,
    VkQueryControlFlags                         flags)
{
   genX(CmdBeginQueryIndexedEXT)(commandBuffer, queryPool, query, flags, 0);
}

void genX(CmdBeginQueryIndexedEXT)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query,
    VkQueryControlFlags                         flags,
    uint32_t                                    index)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   struct anv_address query_addr = anv_query_address(pool, query);

   struct gen_mi_builder b;
   gen_mi_builder_init(&b, &cmd_buffer->batch);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 8));
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      /* TODO: This might only be necessary for certain stats */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      uint32_t statistics = pool->pipeline_statistics;
      uint32_t offset = 8;
      while (statistics) {
         uint32_t stat = u_bit_scan(&statistics);
         emit_pipeline_stat(&b, stat, anv_address_add(query_addr, offset));
         offset += 16;
      }
      break;
   }

   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
      emit_xfb_query(&b, index, anv_address_add(query_addr, 8));
      break;

   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_REPORT_PERF_COUNT), rpc) {
         rpc.MemoryAddress =
            anv_address_add(query_addr, intel_perf_mi_rpc_offset(false));
      }
#if GEN_GEN < 9
      gen_mi_store(&b,
                   gen_mi_mem32(anv_address_add(query_addr,
                                                intel_perf_rpstart_offset(false))),
                   gen_mi_reg32(GENX(RPSTAT1_num)));
#else
      gen_mi_store(&b,
                   gen_mi_mem32(anv_address_add(query_addr,
                                                intel_perf_rpstart_offset(false))),
                   gen_mi_reg32(GENX(RPSTAT0_num)));
#endif
#if GEN_GEN >= 8 && GEN_GEN <= 11
      gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr,
                                                    intel_perf_counter(false))),
                   gen_mi_reg64(GENX(PERFCNT1_num)));
      gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr,
                                                    intel_perf_counter(false) + 8)),
                   gen_mi_reg64(GENX(PERFCNT2_num)));
#endif
      break;
   }

   default:
      unreachable("");
   }
}

void genX(CmdEndQuery)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query)
{
   genX(CmdEndQueryIndexedEXT)(commandBuffer, queryPool, query, 0);
}

void genX(CmdEndQueryIndexedEXT)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query,
    uint32_t                                    index)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   struct anv_address query_addr = anv_query_address(pool, query);

   struct gen_mi_builder b;
   gen_mi_builder_init(&b, &cmd_buffer->batch);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 16));
      emit_query_pc_availability(cmd_buffer, query_addr, true);
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      /* TODO: This might only be necessary for certain stats */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      uint32_t statistics = pool->pipeline_statistics;
      uint32_t offset = 16;
      while (statistics) {
         uint32_t stat = u_bit_scan(&statistics);
         emit_pipeline_stat(&b, stat, anv_address_add(query_addr, offset));
         offset += 16;
      }

      emit_query_mi_availability(&b, query_addr, true);
      break;
   }

   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      emit_xfb_query(&b, index, anv_address_add(query_addr, 16));
      emit_query_mi_availability(&b, query_addr, true);
      break;

   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
      uint32_t marker_offset = intel_perf_marker_offset();
      gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, marker_offset)),
                   gen_mi_imm(cmd_buffer->intel_perf_marker));
#if GEN_GEN >= 8 && GEN_GEN <= 11
      gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, intel_perf_counter(true))),
                   gen_mi_reg64(GENX(PERFCNT1_num)));
      gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, intel_perf_counter(true) + 8)),
                   gen_mi_reg64(GENX(PERFCNT2_num)));
#endif
#if GEN_GEN < 9
      gen_mi_store(&b,
                   gen_mi_mem32(anv_address_add(query_addr,
                                                intel_perf_rpstart_offset(true))),
                   gen_mi_reg32(GENX(RPSTAT1_num)));
#else
      gen_mi_store(&b,
                   gen_mi_mem32(anv_address_add(query_addr,
                                                intel_perf_rpstart_offset(true))),
                   gen_mi_reg32(GENX(RPSTAT0_num)));
#endif
      /* Position the last OA snapshot at the beginning of the query so that
       * we can tell whether it's ready.
       */
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_REPORT_PERF_COUNT), rpc) {
         rpc.MemoryAddress = anv_address_add(query_addr,
                                             intel_perf_mi_rpc_offset(true));
         rpc.ReportID = 0xdeadbeef; /* This goes in the first dword */
      }
      emit_query_mi_availability(&b,
                                 anv_address_add(query_addr, pool->stride - 8),
                                 true);
      break;
   }

   default:
      unreachable("");
   }

   /* When multiview is active the spec requires that N consecutive query
    * indices are used, where N is the number of active views in the subpass.
    * The spec allows us to write the results to only one of the queries, but
    * we still need to manage result availability for all the query indices.
    * Since we only emit a single query for all active views in the first
    * index, mark the other query indices as being already available with
    * result 0.
    */
   if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
      const uint32_t num_queries =
         util_bitcount(cmd_buffer->state.subpass->view_mask);
      if (num_queries > 1)
         emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1);
   }
}

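/* MMIO offset of the render command streamer TIMESTAMP register. */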
#define TIMESTAMP 0x2358

void genX(CmdWriteTimestamp)(
    VkCommandBuffer                             commandBuffer,
    VkPipelineStageFlagBits                     pipelineStage,
    VkQueryPool                                 queryPool,
    uint32_t                                    query)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   struct anv_address query_addr = anv_query_address(pool, query);

   assert(pool->type == VK_QUERY_TYPE_TIMESTAMP);

   struct gen_mi_builder b;
   gen_mi_builder_init(&b, &cmd_buffer->batch);

   switch (pipelineStage) {
   case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
      gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, 8)),
                   gen_mi_reg64(TIMESTAMP));
      break;

   default:
      /* Everything else is bottom-of-pipe */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.DestinationAddressType = DAT_PPGTT;
         pc.PostSyncOperation = WriteTimestamp;
         pc.Address = anv_address_add(query_addr, 8);

         if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
            pc.CommandStreamerStallEnable = true;
      }
      break;
   }

   emit_query_pc_availability(cmd_buffer, query_addr, true);

   /* When multiview is active the spec requires that N consecutive query
    * indices are used, where N is the number of active views in the subpass.
    * The spec allows us to write the results to only one of the queries, but
    * we still need to manage result availability for all the query indices.
    * Since we only emit a single query for all active views in the first
    * index, mark the other query indices as being already available with
    * result 0.
    */
   if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
      const uint32_t num_queries =
         util_bitcount(cmd_buffer->state.subpass->view_mask);
      if (num_queries > 1)
         emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1);
   }
}

#if GEN_GEN > 7 || GEN_IS_HASWELL

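/* Stores an MI-computed result value into the destination buffer, honoring
 * the 32/64-bit result flag.
 */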
static void
gpu_write_query_result(struct gen_mi_builder *b,
                       struct anv_address dst_addr,
                       VkQueryResultFlags flags,
                       uint32_t value_index,
                       struct gen_mi_value query_result)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      struct anv_address res_addr = anv_address_add(dst_addr, value_index * 8);
      gen_mi_store(b, gen_mi_mem64(res_addr), query_result);
   } else {
      struct anv_address res_addr = anv_address_add(dst_addr, value_index * 4);
      gen_mi_store(b, gen_mi_mem32(res_addr), query_result);
   }
}

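/* Computes end - begin for a pair of consecutive 64-bit snapshots starting
 * at addr.
 */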
static struct gen_mi_value
compute_query_result(struct gen_mi_builder *b, struct anv_address addr)
{
   return gen_mi_isub(b, gen_mi_mem64(anv_address_add(addr, 8)),
                      gen_mi_mem64(anv_address_add(addr, 0)));
}

void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    VkBuffer                                    destBuffer,
    VkDeviceSize                                destOffset,
    VkDeviceSize                                destStride,
    VkQueryResultFlags                          flags)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);

   struct gen_mi_builder b;
   gen_mi_builder_init(&b, &cmd_buffer->batch);
   struct gen_mi_value result;

   /* If render target writes are ongoing, request a render target cache flush
    * to ensure proper ordering of the commands from the 3d pipe and the
    * command streamer.
    */
   if (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_RENDER_TARGET_BUFFER_WRITES) {
      cmd_buffer->state.pending_pipe_bits |=
         ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
   }

   if ((flags & VK_QUERY_RESULT_WAIT_BIT) ||
       (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_FLUSH_BITS) ||
       /* Occlusion & timestamp queries are written using a PIPE_CONTROL and
        * because we're about to copy values from MI commands, we need to
        * stall the command streamer to make sure the PIPE_CONTROL values have
        * landed, otherwise we could see inconsistent values & availability.
        *
        * From the vulkan spec:
        *
        *     "vkCmdCopyQueryPoolResults is guaranteed to see the effect of
        *     previous uses of vkCmdResetQueryPool in the same queue, without
        *     any additional synchronization."
        */
       pool->type == VK_QUERY_TYPE_OCCLUSION ||
       pool->type == VK_QUERY_TYPE_TIMESTAMP) {
      cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT;
      genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
   }

   struct anv_address dest_addr = anv_address_add(buffer->address, destOffset);
   for (uint32_t i = 0; i < queryCount; i++) {
      struct anv_address query_addr = anv_query_address(pool, firstQuery + i);
      uint32_t idx = 0;
      switch (pool->type) {
      case VK_QUERY_TYPE_OCCLUSION:
         result = compute_query_result(&b, anv_address_add(query_addr, 8));
         gpu_write_query_result(&b, dest_addr, flags, idx++, result);
         break;

      case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
         uint32_t statistics = pool->pipeline_statistics;
         while (statistics) {
            uint32_t stat = u_bit_scan(&statistics);

            result = compute_query_result(&b, anv_address_add(query_addr,
                                                               idx * 16 + 8));

            /* WaDividePSInvocationCountBy4:HSW,BDW */
            if ((cmd_buffer->device->info.gen == 8 ||
                 cmd_buffer->device->info.is_haswell) &&
                (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) {
               result = gen_mi_ushr32_imm(&b, result, 2);
            }

            gpu_write_query_result(&b, dest_addr, flags, idx++, result);
         }
         assert(idx == util_bitcount(pool->pipeline_statistics));
         break;
      }

      case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
         result = compute_query_result(&b, anv_address_add(query_addr, 8));
         gpu_write_query_result(&b, dest_addr, flags, idx++, result);
         result = compute_query_result(&b, anv_address_add(query_addr, 24));
         gpu_write_query_result(&b, dest_addr, flags, idx++, result);
         break;

      case VK_QUERY_TYPE_TIMESTAMP:
         result = gen_mi_mem64(anv_address_add(query_addr, 8));
         gpu_write_query_result(&b, dest_addr, flags, 0, result);
         break;

      default:
         unreachable("unhandled query type");
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         gpu_write_query_result(&b, dest_addr, flags, idx,
                                gen_mi_mem64(query_addr));
      }

      dest_addr = anv_address_add(dest_addr, destStride);
   }
}

#else
void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    VkBuffer                                    destBuffer,
    VkDeviceSize                                destOffset,
    VkDeviceSize                                destStride,
    VkQueryResultFlags                          flags)
{
   anv_finishme("Queries not yet supported on Ivy Bridge");
}
#endif