anv: Add an explicit_address parameter to anv_device_alloc_bo
src/intel/vulkan/genX_query.c (mesa.git)
1 /*
2 * Copyright © 2015 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include <assert.h>
25 #include <stdbool.h>
26 #include <string.h>
27 #include <unistd.h>
28 #include <fcntl.h>
29
30 #include "anv_private.h"
31
32 #include "genxml/gen_macros.h"
33 #include "genxml/genX_pack.h"
34
35 /* We reserve GPR 14 and 15 for conditional rendering */
36 #define GEN_MI_BUILDER_NUM_ALLOC_GPRS 14
37 #define __gen_get_batch_dwords anv_batch_emit_dwords
38 #define __gen_address_offset anv_address_add
39 #include "common/gen_mi_builder.h"
40 #include "perf/gen_perf.h"
41 #include "perf/gen_perf_mdapi.h"
42
43 #define OA_REPORT_N_UINT64 (256 / sizeof(uint64_t))
44
45 VkResult genX(CreateQueryPool)(
46 VkDevice _device,
47 const VkQueryPoolCreateInfo* pCreateInfo,
48 const VkAllocationCallbacks* pAllocator,
49 VkQueryPool* pQueryPool)
50 {
51 ANV_FROM_HANDLE(anv_device, device, _device);
52 const struct anv_physical_device *pdevice = &device->instance->physicalDevice;
53 struct anv_query_pool *pool;
54 VkResult result;
55
56 assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
57
58 /* Query pool slots are made up of some number of 64-bit values packed
59 * tightly together. For most query types, the first 64-bit value is
60 * the "available" bit, which is 0 when the query is unavailable and 1 when
61 * it is available. The 64-bit values that follow are determined by the
62 * type of query.
63 *
64 * For performance queries, OA reports must be aligned to 64 bytes, so we
65 * put those first and keep the "available" bit at the end, together with
66 * some other counters.
67 */
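/* As a concrete example, an occlusion query slot is three tightly packed
* uint64_t values (24 bytes): availability at offset 0, the begin
* PS_DEPTH_COUNT snapshot at offset 8 and the end snapshot at offset 16.
*/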
68 uint32_t uint64s_per_slot = 1;
69
70 VkQueryPipelineStatisticFlags pipeline_statistics = 0;
71 switch (pCreateInfo->queryType) {
72 case VK_QUERY_TYPE_OCCLUSION:
73 /* Occlusion queries have two values: begin and end. */
74 uint64s_per_slot += 2;
75 break;
76 case VK_QUERY_TYPE_TIMESTAMP:
77 /* Timestamps just have the one timestamp value */
78 uint64s_per_slot += 1;
79 break;
80 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
81 pipeline_statistics = pCreateInfo->pipelineStatistics;
82 /* We're going to trust this field implicitly so we need to ensure that
83 * no unhandled extension bits leak in.
84 */
85 pipeline_statistics &= ANV_PIPELINE_STATISTICS_MASK;
86
87 /* Statistics queries have a min and max for every statistic */
88 uint64s_per_slot += 2 * util_bitcount(pipeline_statistics);
89 break;
90 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
91 /* Transform feedback queries are 4 values, begin/end for
92 * primitives written and primitives needed.
93 */
94 uint64s_per_slot += 4;
95 break;
96 case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
97 uint64s_per_slot = 72; /* 576 bytes, see layout below */
98 break;
99 }
100 default:
101 assert(!"Invalid query type");
102 }
103
104 pool = vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
105 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
106 if (pool == NULL)
107 return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
108
109 pool->type = pCreateInfo->queryType;
110 pool->pipeline_statistics = pipeline_statistics;
111 pool->stride = uint64s_per_slot * sizeof(uint64_t);
112 pool->slots = pCreateInfo->queryCount;
113
114 uint32_t bo_flags = 0;
115 if (pdevice->supports_48bit_addresses)
116 bo_flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
117
118 if (pdevice->use_softpin)
119 bo_flags |= EXEC_OBJECT_PINNED;
120
121 if (pdevice->has_exec_async)
122 bo_flags |= EXEC_OBJECT_ASYNC;
123
124 uint64_t size = pool->slots * pool->stride;
125 result = anv_device_alloc_bo(device, size,
126 ANV_BO_ALLOC_MAPPED |
127 ANV_BO_ALLOC_SNOOPED,
128 0 /* explicit_address */,
129 &pool->bo);
130 if (result != VK_SUCCESS)
131 goto fail;
132
133 *pQueryPool = anv_query_pool_to_handle(pool);
134
135 return VK_SUCCESS;
136
137 fail:
138 vk_free2(&device->alloc, pAllocator, pool);
139
140 return result;
141 }
142
143 void genX(DestroyQueryPool)(
144 VkDevice _device,
145 VkQueryPool _pool,
146 const VkAllocationCallbacks* pAllocator)
147 {
148 ANV_FROM_HANDLE(anv_device, device, _device);
149 ANV_FROM_HANDLE(anv_query_pool, pool, _pool);
150
151 if (!pool)
152 return;
153
154 anv_device_release_bo(device, pool->bo);
155 vk_free2(&device->alloc, pAllocator, pool);
156 }
157
158 static struct anv_address
159 anv_query_address(struct anv_query_pool *pool, uint32_t query)
160 {
161 return (struct anv_address) {
162 .bo = pool->bo,
163 .offset = query * pool->stride,
164 };
165 }
166
167 /**
168 * VK_INTEL_performance_query layout (576 bytes) :
169 *
170 * ------------------------------
171 * | availability (8b) |
172 * |----------------------------|
173 * | marker (8b) |
174 * |----------------------------|
175 * | begin RPSTAT register (4b) |
176 * |----------------------------|
177 * | end RPSTAT register (4b) |
178 * |----------------------------|
179 * | begin perfcntr 1 & 2 (16b) |
180 * |----------------------------|
181 * | end perfcntr 1 & 2 (16b) |
182 * |----------------------------|
183 * | Unused (8b) |
184 * |----------------------------|
185 * | begin MI_RPC (256b) |
186 * |----------------------------|
187 * | end MI_RPC (256b) |
188 * ------------------------------
189 */
190
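/* The helpers below return byte offsets into the layout above; passing
* end = true selects the second element of a begin/end pair.
*/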
191 static uint32_t
192 intel_perf_marker_offset(void)
193 {
194 return 8;
195 }
196
197 static uint32_t
198 intel_perf_rpstart_offset(bool end)
199 {
200 return 16 + (end ? sizeof(uint32_t) : 0);
201 }
202
203 static uint32_t
204 intel_perf_counter(bool end)
205 {
206 return 24 + (end ? (2 * sizeof(uint64_t)) : 0);
207 }
208
209 static uint32_t
210 intel_perf_mi_rpc_offset(bool end)
211 {
212 return 64 + (end ? 256 : 0);
213 }
214
215 static void
216 cpu_write_query_result(void *dst_slot, VkQueryResultFlags flags,
217 uint32_t value_index, uint64_t result)
218 {
219 if (flags & VK_QUERY_RESULT_64_BIT) {
220 uint64_t *dst64 = dst_slot;
221 dst64[value_index] = result;
222 } else {
223 uint32_t *dst32 = dst_slot;
224 dst32[value_index] = result;
225 }
226 }
227
228 static void *
229 query_slot(struct anv_query_pool *pool, uint32_t query)
230 {
231 return pool->bo->map + query * pool->stride;
232 }
233
234 static bool
235 query_is_available(struct anv_query_pool *pool, uint32_t query)
236 {
237 return *(volatile uint64_t *)query_slot(pool, query);
238 }
239
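/* Busy-wait on a query's availability qword, checking for a lost device on
* each iteration and marking the device lost if nothing shows up within a
* 5 second timeout.
*/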
240 static VkResult
241 wait_for_available(struct anv_device *device,
242 struct anv_query_pool *pool, uint32_t query)
243 {
244 uint64_t abs_timeout = anv_get_absolute_timeout(5 * NSEC_PER_SEC);
245
246 while (anv_gettime_ns() < abs_timeout) {
247 if (query_is_available(pool, query))
248 return VK_SUCCESS;
249 VkResult status = anv_device_query_status(device);
250 if (status != VK_SUCCESS)
251 return status;
252 }
253
254 return anv_device_set_lost(device, "query timeout");
255 }
256
257 VkResult genX(GetQueryPoolResults)(
258 VkDevice _device,
259 VkQueryPool queryPool,
260 uint32_t firstQuery,
261 uint32_t queryCount,
262 size_t dataSize,
263 void* pData,
264 VkDeviceSize stride,
265 VkQueryResultFlags flags)
266 {
267 ANV_FROM_HANDLE(anv_device, device, _device);
268 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
269
270 assert(pool->type == VK_QUERY_TYPE_OCCLUSION ||
271 pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
272 pool->type == VK_QUERY_TYPE_TIMESTAMP ||
273 pool->type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT ||
274 pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL);
275
276 if (anv_device_is_lost(device))
277 return VK_ERROR_DEVICE_LOST;
278
279 if (pData == NULL)
280 return VK_SUCCESS;
281
282 void *data_end = pData + dataSize;
283
284 VkResult status = VK_SUCCESS;
285 for (uint32_t i = 0; i < queryCount; i++) {
286 bool available = query_is_available(pool, firstQuery + i);
287
288 if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) {
289 status = wait_for_available(device, pool, firstQuery + i);
290 if (status != VK_SUCCESS)
291 return status;
292
293 available = true;
294 }
295
296 /* From the Vulkan 1.0.42 spec:
297 *
298 * "If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
299 * both not set then no result values are written to pData for
300 * queries that are in the unavailable state at the time of the call,
301 * and vkGetQueryPoolResults returns VK_NOT_READY. However,
302 * availability state is still written to pData for those queries if
303 * VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set."
304 */
305 bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT);
306
307 uint32_t idx = 0;
308 switch (pool->type) {
309 case VK_QUERY_TYPE_OCCLUSION: {
310 uint64_t *slot = query_slot(pool, firstQuery + i);
311 if (write_results)
312 cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]);
313 idx++;
314 break;
315 }
316
317 case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
318 uint64_t *slot = query_slot(pool, firstQuery + i);
319 uint32_t statistics = pool->pipeline_statistics;
320 while (statistics) {
321 uint32_t stat = u_bit_scan(&statistics);
322 if (write_results) {
323 uint64_t result = slot[idx * 2 + 2] - slot[idx * 2 + 1];
324
325 /* WaDividePSInvocationCountBy4:HSW,BDW */
326 if ((device->info.gen == 8 || device->info.is_haswell) &&
327 (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT)
328 result >>= 2;
329
330 cpu_write_query_result(pData, flags, idx, result);
331 }
332 idx++;
333 }
334 assert(idx == util_bitcount(pool->pipeline_statistics));
335 break;
336 }
337
338 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
339 uint64_t *slot = query_slot(pool, firstQuery + i);
340 if (write_results)
341 cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]);
342 idx++;
343 if (write_results)
344 cpu_write_query_result(pData, flags, idx, slot[4] - slot[3]);
345 idx++;
346 break;
347 }
348
349 case VK_QUERY_TYPE_TIMESTAMP: {
350 uint64_t *slot = query_slot(pool, firstQuery + i);
351 if (write_results)
352 cpu_write_query_result(pData, flags, idx, slot[1]);
353 idx++;
354 break;
355 }
356
357 case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
358 if (!write_results)
359 break;
360 const void *query_data = query_slot(pool, firstQuery + i);
361 const uint32_t *oa_begin = query_data + intel_perf_mi_rpc_offset(false);
362 const uint32_t *oa_end = query_data + intel_perf_mi_rpc_offset(true);
363 const uint32_t *rpstat_begin = query_data + intel_perf_rpstart_offset(false);
364 const uint32_t *rpstat_end = query_data + intel_perf_rpstart_offset(true);
365 struct gen_perf_query_result result;
366 struct gen_perf_query_info metric = {
367 .oa_format = (GEN_GEN >= 8 ?
368 I915_OA_FORMAT_A32u40_A4u32_B8_C8 :
369 I915_OA_FORMAT_A45_B8_C8),
370 };
371 uint32_t core_freq[2];
372 #if GEN_GEN < 9
373 core_freq[0] = ((*rpstat_begin >> 7) & 0x7f) * 1000000ULL;
374 core_freq[1] = ((*rpstat_end >> 7) & 0x7f) * 1000000ULL;
375 #else
376 core_freq[0] = ((*rpstat_begin >> 23) & 0x1ff) * 1000000ULL;
377 core_freq[1] = ((*rpstat_end >> 23) & 0x1ff) * 1000000ULL;
378 #endif
379 gen_perf_query_result_clear(&result);
380 gen_perf_query_result_accumulate(&result, &metric,
381 oa_begin, oa_end);
382 gen_perf_query_result_read_frequencies(&result, &device->info,
383 oa_begin, oa_end);
384 gen_perf_query_result_write_mdapi(pData, stride,
385 &device->info,
386 &result,
387 core_freq[0], core_freq[1]);
388 gen_perf_query_mdapi_write_perfcntr(pData, stride, &device->info,
389 query_data + intel_perf_counter(false),
390 query_data + intel_perf_counter(true));
391 const uint64_t *marker = query_data + intel_perf_marker_offset();
392 gen_perf_query_mdapi_write_marker(pData, stride, &device->info, *marker);
393 break;
394 }
395
396 default:
397 unreachable("invalid pool type");
398 }
399
400 if (!write_results)
401 status = VK_NOT_READY;
402
403 if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
404 cpu_write_query_result(pData, flags, idx, available);
405
406 pData += stride;
407 if (pData >= data_end)
408 break;
409 }
410
411 return status;
412 }
413
414 static void
415 emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer,
416 struct anv_address addr)
417 {
418 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
419 pc.DestinationAddressType = DAT_PPGTT;
420 pc.PostSyncOperation = WritePSDepthCount;
421 pc.DepthStallEnable = true;
422 pc.Address = addr;
423
424 if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
425 pc.CommandStreamerStallEnable = true;
426 }
427 }
428
429 static void
430 emit_query_mi_availability(struct gen_mi_builder *b,
431 struct anv_address addr,
432 bool available)
433 {
434 gen_mi_store(b, gen_mi_mem64(addr), gen_mi_imm(available));
435 }
436
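/* Write the availability qword with a PIPE_CONTROL post-sync write.
* Occlusion and timestamp values are themselves written with PIPE_CONTROLs,
* so using the same mechanism for their availability keeps the writes
* ordered without extra synchronization.
*/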
437 static void
438 emit_query_pc_availability(struct anv_cmd_buffer *cmd_buffer,
439 struct anv_address addr,
440 bool available)
441 {
442 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
443 pc.DestinationAddressType = DAT_PPGTT;
444 pc.PostSyncOperation = WriteImmediateData;
445 pc.Address = addr;
446 pc.ImmediateData = available;
447 }
448 }
449
450 /**
451 * Goes through a series of consecutive query indices in the given pool,
452 * setting all element values to 0 and emitting them as available.
453 */
454 static void
455 emit_zero_queries(struct anv_cmd_buffer *cmd_buffer,
456 struct gen_mi_builder *b, struct anv_query_pool *pool,
457 uint32_t first_index, uint32_t num_queries)
458 {
459 switch (pool->type) {
460 case VK_QUERY_TYPE_OCCLUSION:
461 case VK_QUERY_TYPE_TIMESTAMP:
462 /* These queries are written with a PIPE_CONTROL, so clear them with a
463 * PIPE_CONTROL as well so that we don't have to synchronize between two
464 * types of operations.
465 */
466 assert((pool->stride % 8) == 0);
467 for (uint32_t i = 0; i < num_queries; i++) {
468 struct anv_address slot_addr =
469 anv_query_address(pool, first_index + i);
470
471 for (uint32_t qword = 1; qword < (pool->stride / 8); qword++) {
472 emit_query_pc_availability(cmd_buffer,
473 anv_address_add(slot_addr, qword * 8),
474 false);
475 }
476 emit_query_pc_availability(cmd_buffer, slot_addr, true);
477 }
478 break;
479
480 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
481 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
482 for (uint32_t i = 0; i < num_queries; i++) {
483 struct anv_address slot_addr =
484 anv_query_address(pool, first_index + i);
485 gen_mi_memset(b, anv_address_add(slot_addr, 8), 0, pool->stride - 8);
486 emit_query_mi_availability(b, slot_addr, true);
487 }
488 break;
489
490 case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL:
491 for (uint32_t i = 0; i < num_queries; i++) {
492 struct anv_address slot_addr =
493 anv_query_address(pool, first_index + i);
494 gen_mi_memset(b, slot_addr, 0, pool->stride - 8);
495 emit_query_mi_availability(b, anv_address_add(slot_addr,
496 pool->stride - 8), true);
497 }
498 break;
499
500 default:
501 unreachable("Unsupported query type");
502 }
503 }
504
505 void genX(CmdResetQueryPool)(
506 VkCommandBuffer commandBuffer,
507 VkQueryPool queryPool,
508 uint32_t firstQuery,
509 uint32_t queryCount)
510 {
511 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
512 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
513
514 switch (pool->type) {
515 case VK_QUERY_TYPE_OCCLUSION:
516 case VK_QUERY_TYPE_TIMESTAMP:
517 for (uint32_t i = 0; i < queryCount; i++) {
518 emit_query_pc_availability(cmd_buffer,
519 anv_query_address(pool, firstQuery + i),
520 false);
521 }
522 break;
523
524 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
525 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
526 struct gen_mi_builder b;
527 gen_mi_builder_init(&b, &cmd_buffer->batch);
528
529 for (uint32_t i = 0; i < queryCount; i++)
530 emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), false);
531 break;
532 }
533
534 case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
535 struct gen_mi_builder b;
536 gen_mi_builder_init(&b, &cmd_buffer->batch);
537
538 for (uint32_t i = 0; i < queryCount; i++) {
539 emit_query_mi_availability(
540 &b,
541 anv_address_add(
542 anv_query_address(pool, firstQuery + i),
543 pool->stride - 8),
544 false);
545 }
546 break;
547 }
548
549 default:
550 unreachable("Unsupported query type");
551 }
552 }
553
554 void genX(ResetQueryPoolEXT)(
555 VkDevice _device,
556 VkQueryPool queryPool,
557 uint32_t firstQuery,
558 uint32_t queryCount)
559 {
560 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
561
562 for (uint32_t i = 0; i < queryCount; i++) {
563 uint64_t *slot = query_slot(pool, firstQuery + i);
564 *slot = 0;
565 }
566 }
567
568 static const uint32_t vk_pipeline_stat_to_reg[] = {
569 GENX(IA_VERTICES_COUNT_num),
570 GENX(IA_PRIMITIVES_COUNT_num),
571 GENX(VS_INVOCATION_COUNT_num),
572 GENX(GS_INVOCATION_COUNT_num),
573 GENX(GS_PRIMITIVES_COUNT_num),
574 GENX(CL_INVOCATION_COUNT_num),
575 GENX(CL_PRIMITIVES_COUNT_num),
576 GENX(PS_INVOCATION_COUNT_num),
577 GENX(HS_INVOCATION_COUNT_num),
578 GENX(DS_INVOCATION_COUNT_num),
579 GENX(CS_INVOCATION_COUNT_num),
580 };
581
582 static void
583 emit_pipeline_stat(struct gen_mi_builder *b, uint32_t stat,
584 struct anv_address addr)
585 {
586 STATIC_ASSERT(ANV_PIPELINE_STATISTICS_MASK ==
587 (1 << ARRAY_SIZE(vk_pipeline_stat_to_reg)) - 1);
588
589 assert(stat < ARRAY_SIZE(vk_pipeline_stat_to_reg));
590 gen_mi_store(b, gen_mi_mem64(addr),
591 gen_mi_reg64(vk_pipeline_stat_to_reg[stat]));
592 }
593
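/* Snapshot the transform feedback counters for one vertex stream:
* primitives written at addr + 0 and primitives needed (storage needed) at
* addr + 16. Begin snapshots land at slot offset 8 and end snapshots at
* offset 16, so the 16-byte spacing leaves room for the end value of the
* first counter.
*/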
594 static void
595 emit_xfb_query(struct gen_mi_builder *b, uint32_t stream,
596 struct anv_address addr)
597 {
598 assert(stream < MAX_XFB_STREAMS);
599
600 gen_mi_store(b, gen_mi_mem64(anv_address_add(addr, 0)),
601 gen_mi_reg64(GENX(SO_NUM_PRIMS_WRITTEN0_num) + stream * 8));
602 gen_mi_store(b, gen_mi_mem64(anv_address_add(addr, 16)),
603 gen_mi_reg64(GENX(SO_PRIM_STORAGE_NEEDED0_num) + stream * 8));
604 }
605
606 void genX(CmdBeginQuery)(
607 VkCommandBuffer commandBuffer,
608 VkQueryPool queryPool,
609 uint32_t query,
610 VkQueryControlFlags flags)
611 {
612 genX(CmdBeginQueryIndexedEXT)(commandBuffer, queryPool, query, flags, 0);
613 }
614
615 void genX(CmdBeginQueryIndexedEXT)(
616 VkCommandBuffer commandBuffer,
617 VkQueryPool queryPool,
618 uint32_t query,
619 VkQueryControlFlags flags,
620 uint32_t index)
621 {
622 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
623 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
624 struct anv_address query_addr = anv_query_address(pool, query);
625
626 struct gen_mi_builder b;
627 gen_mi_builder_init(&b, &cmd_buffer->batch);
628
629 switch (pool->type) {
630 case VK_QUERY_TYPE_OCCLUSION:
631 emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 8));
632 break;
633
634 case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
635 /* TODO: This might only be necessary for certain stats */
636 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
637 pc.CommandStreamerStallEnable = true;
638 pc.StallAtPixelScoreboard = true;
639 }
640
641 uint32_t statistics = pool->pipeline_statistics;
642 uint32_t offset = 8;
643 while (statistics) {
644 uint32_t stat = u_bit_scan(&statistics);
645 emit_pipeline_stat(&b, stat, anv_address_add(query_addr, offset));
646 offset += 16;
647 }
648 break;
649 }
650
651 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
652 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
653 pc.CommandStreamerStallEnable = true;
654 pc.StallAtPixelScoreboard = true;
655 }
656 emit_xfb_query(&b, index, anv_address_add(query_addr, 8));
657 break;
658
659 case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
660 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
661 pc.CommandStreamerStallEnable = true;
662 pc.StallAtPixelScoreboard = true;
663 }
664 anv_batch_emit(&cmd_buffer->batch, GENX(MI_REPORT_PERF_COUNT), rpc) {
665 rpc.MemoryAddress =
666 anv_address_add(query_addr, intel_perf_mi_rpc_offset(false));
667 }
668 #if GEN_GEN < 9
669 gen_mi_store(&b,
670 gen_mi_mem32(anv_address_add(query_addr,
671 intel_perf_rpstart_offset(false))),
672 gen_mi_reg32(GENX(RPSTAT1_num)));
673 #else
674 gen_mi_store(&b,
675 gen_mi_mem32(anv_address_add(query_addr,
676 intel_perf_rpstart_offset(false))),
677 gen_mi_reg32(GENX(RPSTAT0_num)));
678 #endif
679 #if GEN_GEN >= 8 && GEN_GEN <= 11
680 gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr,
681 intel_perf_counter(false))),
682 gen_mi_reg64(GENX(PERFCNT1_num)));
683 gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr,
684 intel_perf_counter(false) + 8)),
685 gen_mi_reg64(GENX(PERFCNT2_num)));
686 #endif
687 break;
688 }
689
690 default:
691 unreachable("");
692 }
693 }
694
695 void genX(CmdEndQuery)(
696 VkCommandBuffer commandBuffer,
697 VkQueryPool queryPool,
698 uint32_t query)
699 {
700 genX(CmdEndQueryIndexedEXT)(commandBuffer, queryPool, query, 0);
701 }
702
703 void genX(CmdEndQueryIndexedEXT)(
704 VkCommandBuffer commandBuffer,
705 VkQueryPool queryPool,
706 uint32_t query,
707 uint32_t index)
708 {
709 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
710 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
711 struct anv_address query_addr = anv_query_address(pool, query);
712
713 struct gen_mi_builder b;
714 gen_mi_builder_init(&b, &cmd_buffer->batch);
715
716 switch (pool->type) {
717 case VK_QUERY_TYPE_OCCLUSION:
718 emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 16));
719 emit_query_pc_availability(cmd_buffer, query_addr, true);
720 break;
721
722 case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
723 /* TODO: This might only be necessary for certain stats */
724 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
725 pc.CommandStreamerStallEnable = true;
726 pc.StallAtPixelScoreboard = true;
727 }
728
729 uint32_t statistics = pool->pipeline_statistics;
730 uint32_t offset = 16;
731 while (statistics) {
732 uint32_t stat = u_bit_scan(&statistics);
733 emit_pipeline_stat(&b, stat, anv_address_add(query_addr, offset));
734 offset += 16;
735 }
736
737 emit_query_mi_availability(&b, query_addr, true);
738 break;
739 }
740
741 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
742 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
743 pc.CommandStreamerStallEnable = true;
744 pc.StallAtPixelScoreboard = true;
745 }
746
747 emit_xfb_query(&b, index, anv_address_add(query_addr, 16));
748 emit_query_mi_availability(&b, query_addr, true);
749 break;
750
751 case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
752 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
753 pc.CommandStreamerStallEnable = true;
754 pc.StallAtPixelScoreboard = true;
755 }
756 uint32_t marker_offset = intel_perf_marker_offset();
757 gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, marker_offset)),
758 gen_mi_imm(cmd_buffer->intel_perf_marker));
759 #if GEN_GEN >= 8 && GEN_GEN <= 11
760 gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, intel_perf_counter(true))),
761 gen_mi_reg64(GENX(PERFCNT1_num)));
762 gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, intel_perf_counter(true) + 8)),
763 gen_mi_reg64(GENX(PERFCNT2_num)));
764 #endif
765 #if GEN_GEN < 9
766 gen_mi_store(&b,
767 gen_mi_mem32(anv_address_add(query_addr,
768 intel_perf_rpstart_offset(true))),
769 gen_mi_reg32(GENX(RPSTAT1_num)));
770 #else
771 gen_mi_store(&b,
772 gen_mi_mem32(anv_address_add(query_addr,
773 intel_perf_rpstart_offset(true))),
774 gen_mi_reg32(GENX(RPSTAT0_num)));
775 #endif
776 /* Position the last OA snapshot at the beginning of the query so that
777 * we can tell whether it's ready.
778 */
779 anv_batch_emit(&cmd_buffer->batch, GENX(MI_REPORT_PERF_COUNT), rpc) {
780 rpc.MemoryAddress = anv_address_add(query_addr,
781 intel_perf_mi_rpc_offset(true));
782 rpc.ReportID = 0xdeadbeef; /* This goes in the first dword */
783 }
784 emit_query_mi_availability(&b,
785 anv_address_add(query_addr, pool->stride - 8),
786 true);
787 break;
788 }
789
790 default:
791 unreachable("");
792 }
793
794 /* When multiview is active, the spec requires that N consecutive query
795 * indices are used, where N is the number of active views in the subpass.
796 * The spec allows us to write the results to only one of the queries, but
797 * we still need to manage result availability for all the query indices.
798 * Since we only emit a single query for all active views in the first
799 * index, mark the other query indices as being already available with
800 * result 0.
801 */
802 if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
803 const uint32_t num_queries =
804 util_bitcount(cmd_buffer->state.subpass->view_mask);
805 if (num_queries > 1)
806 emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1);
807 }
808 }
809
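/* MMIO offset of the command streamer TIMESTAMP register, stored straight
* to the query slot below for top-of-pipe timestamp writes.
*/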
810 #define TIMESTAMP 0x2358
811
812 void genX(CmdWriteTimestamp)(
813 VkCommandBuffer commandBuffer,
814 VkPipelineStageFlagBits pipelineStage,
815 VkQueryPool queryPool,
816 uint32_t query)
817 {
818 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
819 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
820 struct anv_address query_addr = anv_query_address(pool, query);
821
822 assert(pool->type == VK_QUERY_TYPE_TIMESTAMP);
823
824 struct gen_mi_builder b;
825 gen_mi_builder_init(&b, &cmd_buffer->batch);
826
827 switch (pipelineStage) {
828 case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
829 gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, 8)),
830 gen_mi_reg64(TIMESTAMP));
831 break;
832
833 default:
834 /* Everything else is bottom-of-pipe */
835 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
836 pc.DestinationAddressType = DAT_PPGTT;
837 pc.PostSyncOperation = WriteTimestamp;
838 pc.Address = anv_address_add(query_addr, 8);
839
840 if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
841 pc.CommandStreamerStallEnable = true;
842 }
843 break;
844 }
845
846 emit_query_pc_availability(cmd_buffer, query_addr, true);
847
848 /* When multiview is active, the spec requires that N consecutive query
849 * indices are used, where N is the number of active views in the subpass.
850 * The spec allows us to write the results to only one of the queries, but
851 * we still need to manage result availability for all the query indices.
852 * Since we only emit a single query for all active views in the first
853 * index, mark the other query indices as being already available with
854 * result 0.
855 */
856 if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
857 const uint32_t num_queries =
858 util_bitcount(cmd_buffer->state.subpass->view_mask);
859 if (num_queries > 1)
860 emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1);
861 }
862 }
863
864 #if GEN_GEN > 7 || GEN_IS_HASWELL
865
866 static void
867 gpu_write_query_result(struct gen_mi_builder *b,
868 struct anv_address dst_addr,
869 VkQueryResultFlags flags,
870 uint32_t value_index,
871 struct gen_mi_value query_result)
872 {
873 if (flags & VK_QUERY_RESULT_64_BIT) {
874 struct anv_address res_addr = anv_address_add(dst_addr, value_index * 8);
875 gen_mi_store(b, gen_mi_mem64(res_addr), query_result);
876 } else {
877 struct anv_address res_addr = anv_address_add(dst_addr, value_index * 4);
878 gen_mi_store(b, gen_mi_mem32(res_addr), query_result);
879 }
880 }
881
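/* Compute end - begin for a pair of consecutive 64-bit snapshots on the
* GPU, via the MI builder's ALU helpers, so query results can be copied
* without a CPU round trip.
*/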
882 static struct gen_mi_value
883 compute_query_result(struct gen_mi_builder *b, struct anv_address addr)
884 {
885 return gen_mi_isub(b, gen_mi_mem64(anv_address_add(addr, 8)),
886 gen_mi_mem64(anv_address_add(addr, 0)));
887 }
888
889 void genX(CmdCopyQueryPoolResults)(
890 VkCommandBuffer commandBuffer,
891 VkQueryPool queryPool,
892 uint32_t firstQuery,
893 uint32_t queryCount,
894 VkBuffer destBuffer,
895 VkDeviceSize destOffset,
896 VkDeviceSize destStride,
897 VkQueryResultFlags flags)
898 {
899 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
900 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
901 ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);
902
903 struct gen_mi_builder b;
904 gen_mi_builder_init(&b, &cmd_buffer->batch);
905 struct gen_mi_value result;
906
907 /* If render target writes are ongoing, request a render target cache flush
908 * to ensure proper ordering of the commands from the 3d pipe and the
909 * command streamer.
910 */
911 if (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_RENDER_TARGET_BUFFER_WRITES) {
912 cmd_buffer->state.pending_pipe_bits |=
913 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
914 }
915
916 if ((flags & VK_QUERY_RESULT_WAIT_BIT) ||
917 (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_FLUSH_BITS) ||
918 /* Occlusion & timestamp queries are written using a PIPE_CONTROL, and
919 * because we're about to copy values using MI commands, we need to
920 * stall the command streamer to make sure the PIPE_CONTROL values have
921 * landed; otherwise we could see inconsistent values & availability.
922 *
923 * From the vulkan spec:
924 *
925 * "vkCmdCopyQueryPoolResults is guaranteed to see the effect of
926 * previous uses of vkCmdResetQueryPool in the same queue, without
927 * any additional synchronization."
928 */
929 pool->type == VK_QUERY_TYPE_OCCLUSION ||
930 pool->type == VK_QUERY_TYPE_TIMESTAMP) {
931 cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT;
932 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
933 }
934
935 struct anv_address dest_addr = anv_address_add(buffer->address, destOffset);
936 for (uint32_t i = 0; i < queryCount; i++) {
937 struct anv_address query_addr = anv_query_address(pool, firstQuery + i);
938 uint32_t idx = 0;
939 switch (pool->type) {
940 case VK_QUERY_TYPE_OCCLUSION:
941 result = compute_query_result(&b, anv_address_add(query_addr, 8));
942 gpu_write_query_result(&b, dest_addr, flags, idx++, result);
943 break;
944
945 case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
946 uint32_t statistics = pool->pipeline_statistics;
947 while (statistics) {
948 uint32_t stat = u_bit_scan(&statistics);
949
950 result = compute_query_result(&b, anv_address_add(query_addr,
951 idx * 16 + 8));
952
953 /* WaDividePSInvocationCountBy4:HSW,BDW */
954 if ((cmd_buffer->device->info.gen == 8 ||
955 cmd_buffer->device->info.is_haswell) &&
956 (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) {
957 result = gen_mi_ushr32_imm(&b, result, 2);
958 }
959
960 gpu_write_query_result(&b, dest_addr, flags, idx++, result);
961 }
962 assert(idx == util_bitcount(pool->pipeline_statistics));
963 break;
964 }
965
966 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
967 result = compute_query_result(&b, anv_address_add(query_addr, 8));
968 gpu_write_query_result(&b, dest_addr, flags, idx++, result);
969 result = compute_query_result(&b, anv_address_add(query_addr, 24));
970 gpu_write_query_result(&b, dest_addr, flags, idx++, result);
971 break;
972
973 case VK_QUERY_TYPE_TIMESTAMP:
974 result = gen_mi_mem64(anv_address_add(query_addr, 8));
975 gpu_write_query_result(&b, dest_addr, flags, 0, result);
976 break;
977
978 default:
979 unreachable("unhandled query type");
980 }
981
982 if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
983 gpu_write_query_result(&b, dest_addr, flags, idx,
984 gen_mi_mem64(query_addr));
985 }
986
987 dest_addr = anv_address_add(dest_addr, destStride);
988 }
989 }
990
991 #else
992 void genX(CmdCopyQueryPoolResults)(
993 VkCommandBuffer commandBuffer,
994 VkQueryPool queryPool,
995 uint32_t firstQuery,
996 uint32_t queryCount,
997 VkBuffer destBuffer,
998 VkDeviceSize destOffset,
999 VkDeviceSize destStride,
1000 VkQueryResultFlags flags)
1001 {
1002 anv_finishme("Queries not yet supported on Ivy Bridge");
1003 }
1004 #endif