anv: Refactor setting descriptors with immutable sampler
[mesa.git] / src / intel / vulkan / genX_query.c
1 /*
2 * Copyright © 2015 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include <assert.h>
25 #include <stdbool.h>
26 #include <string.h>
27 #include <unistd.h>
28 #include <fcntl.h>
29
30 #include "anv_private.h"
31
32 #include "genxml/gen_macros.h"
33 #include "genxml/genX_pack.h"
34
35 /* We reserve GPR 14 and 15 for conditional rendering */
36 #define GEN_MI_BUILDER_NUM_ALLOC_GPRS 14
37 #define __gen_get_batch_dwords anv_batch_emit_dwords
38 #define __gen_address_offset anv_address_add
39 #include "common/gen_mi_builder.h"
40 #include "perf/gen_perf.h"
41 #include "perf/gen_perf_mdapi.h"
42
43 #define OA_REPORT_N_UINT64 (256 / sizeof(uint64_t))
44
45 VkResult genX(CreateQueryPool)(
46 VkDevice _device,
47 const VkQueryPoolCreateInfo* pCreateInfo,
48 const VkAllocationCallbacks* pAllocator,
49 VkQueryPool* pQueryPool)
50 {
51 ANV_FROM_HANDLE(anv_device, device, _device);
52 const struct anv_physical_device *pdevice = device->physical;
53 struct anv_query_pool *pool;
54 VkResult result;
55
56 assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
57
58 /* Query pool slots are made up of some number of 64-bit values packed
59 * tightly together. For most query types, the first 64-bit value is the
60 * "available" bit, which is 0 when the query is unavailable and 1 when it
61 * is available. The 64-bit values that follow are determined by the type
62 * of query.
63 *
64 * For performance queries, the OA reports have to be aligned to 64 bytes,
65 * so they are placed at 64-byte-aligned offsets at the end of the slot,
66 * while the "available" bit stays at the front together with some other counters.
67 */
68 uint32_t uint64s_per_slot = 1;
69
70 VkQueryPipelineStatisticFlags pipeline_statistics = 0;
71 switch (pCreateInfo->queryType) {
72 case VK_QUERY_TYPE_OCCLUSION:
73 /* Occlusion queries have two values: begin and end. */
74 uint64s_per_slot += 2;
75 break;
76 case VK_QUERY_TYPE_TIMESTAMP:
77 /* Timestamps just have the one timestamp value */
78 uint64s_per_slot += 1;
79 break;
80 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
81 pipeline_statistics = pCreateInfo->pipelineStatistics;
82 /* We're going to trust this field implicitly so we need to ensure that
83 * no unhandled extension bits leak in.
84 */
85 pipeline_statistics &= ANV_PIPELINE_STATISTICS_MASK;
86
87 /* Statistics queries have a min and max for every statistic */
88 uint64s_per_slot += 2 * util_bitcount(pipeline_statistics);
89 break;
90 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
91 /* Transform feedback queries are 4 values, begin/end for
92 * written/available.
93 */
94 uint64s_per_slot += 4;
95 break;
96 case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
97 uint64s_per_slot = 72; /* 576 bytes, see layout below */
98 break;
99 }
100 default:
101 assert(!"Invalid query type");
102 }
103
104 pool = vk_alloc2(&device->vk.alloc, pAllocator, sizeof(*pool), 8,
105 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
106 if (pool == NULL)
107 return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
108
109 vk_object_base_init(&device->vk, &pool->base, VK_OBJECT_TYPE_QUERY_POOL);
110 pool->type = pCreateInfo->queryType;
111 pool->pipeline_statistics = pipeline_statistics;
112 pool->stride = uint64s_per_slot * sizeof(uint64_t);
113 pool->slots = pCreateInfo->queryCount;
114
115 uint32_t bo_flags = 0;
116 if (pdevice->supports_48bit_addresses)
117 bo_flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
118
119 if (pdevice->use_softpin)
120 bo_flags |= EXEC_OBJECT_PINNED;
121
122 if (pdevice->has_exec_async)
123 bo_flags |= EXEC_OBJECT_ASYNC;
124
125 uint64_t size = pool->slots * pool->stride;
126 result = anv_device_alloc_bo(device, size,
127 ANV_BO_ALLOC_MAPPED |
128 ANV_BO_ALLOC_SNOOPED,
129 0 /* explicit_address */,
130 &pool->bo);
131 if (result != VK_SUCCESS)
132 goto fail;
133
134 *pQueryPool = anv_query_pool_to_handle(pool);
135
136 return VK_SUCCESS;
137
138 fail:
139 vk_free2(&device->vk.alloc, pAllocator, pool);
140
141 return result;
142 }
143
144 void genX(DestroyQueryPool)(
145 VkDevice _device,
146 VkQueryPool _pool,
147 const VkAllocationCallbacks* pAllocator)
148 {
149 ANV_FROM_HANDLE(anv_device, device, _device);
150 ANV_FROM_HANDLE(anv_query_pool, pool, _pool);
151
152 if (!pool)
153 return;
154
155 anv_device_release_bo(device, pool->bo);
156 vk_object_base_finish(&pool->base);
157 vk_free2(&device->vk.alloc, pAllocator, pool);
158 }
159
160 static struct anv_address
161 anv_query_address(struct anv_query_pool *pool, uint32_t query)
162 {
163 return (struct anv_address) {
164 .bo = pool->bo,
165 .offset = query * pool->stride,
166 };
167 }
168
169 /**
170 * VK_INTEL_performance_query layout (576 bytes) :
171 *
172 * ------------------------------
173 * | availability (8b) |
174 * |----------------------------|
175 * | marker (8b) |
176 * |----------------------------|
177 * | begin RPSTAT register (4b) |
178 * |----------------------------|
179 * | end RPSTAT register (4b) |
180 * |----------------------------|
181 * | begin perfcntr 1 & 2 (16b) |
182 * |----------------------------|
183 * | end perfcntr 1 & 2 (16b) |
184 * |----------------------------|
185 * | Unused (8b) |
186 * |----------------------------|
187 * | begin MI_RPC (256b) |
188 * |----------------------------|
189 * | end MI_RPC (256b) |
190 * ------------------------------
191 */
192
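/* Byte offsets into the layout above: the marker lives at offset 8, the
 * begin/end RPSTAT registers at 16/20, the begin/end PERFCNT pairs at 24/40,
 * and the begin/end MI_RPC reports at 64/320.
 */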
193 static uint32_t
194 intel_perf_marker_offset(void)
195 {
196 return 8;
197 }
198
199 static uint32_t
200 intel_perf_rpstart_offset(bool end)
201 {
202 return 16 + (end ? sizeof(uint32_t) : 0);
203 }
204
205 #if GEN_GEN >= 8 && GEN_GEN <= 11
206 static uint32_t
207 intel_perf_counter(bool end)
208 {
209 return 24 + (end ? (2 * sizeof(uint64_t)) : 0);
210 }
211 #endif
212
213 static uint32_t
214 intel_perf_mi_rpc_offset(bool end)
215 {
216 return 64 + (end ? 256 : 0);
217 }
218
219 static void
220 cpu_write_query_result(void *dst_slot, VkQueryResultFlags flags,
221 uint32_t value_index, uint64_t result)
222 {
223 if (flags & VK_QUERY_RESULT_64_BIT) {
224 uint64_t *dst64 = dst_slot;
225 dst64[value_index] = result;
226 } else {
227 uint32_t *dst32 = dst_slot;
228 dst32[value_index] = result;
229 }
230 }
231
232 static void *
233 query_slot(struct anv_query_pool *pool, uint32_t query)
234 {
235 return pool->bo->map + query * pool->stride;
236 }
237
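/* The first uint64 of every slot is the availability value; it is written
 * asynchronously by the GPU, hence the volatile read.
 */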
238 static bool
239 query_is_available(struct anv_query_pool *pool, uint32_t query)
240 {
241 return *(volatile uint64_t *)query_slot(pool, query);
242 }
243
244 static VkResult
245 wait_for_available(struct anv_device *device,
246 struct anv_query_pool *pool, uint32_t query)
247 {
248 uint64_t abs_timeout = anv_get_absolute_timeout(5 * NSEC_PER_SEC);
249
250 while (anv_gettime_ns() < abs_timeout) {
251 if (query_is_available(pool, query))
252 return VK_SUCCESS;
253 VkResult status = anv_device_query_status(device);
254 if (status != VK_SUCCESS)
255 return status;
256 }
257
258 return anv_device_set_lost(device, "query timeout");
259 }
260
261 VkResult genX(GetQueryPoolResults)(
262 VkDevice _device,
263 VkQueryPool queryPool,
264 uint32_t firstQuery,
265 uint32_t queryCount,
266 size_t dataSize,
267 void* pData,
268 VkDeviceSize stride,
269 VkQueryResultFlags flags)
270 {
271 ANV_FROM_HANDLE(anv_device, device, _device);
272 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
273
274 assert(pool->type == VK_QUERY_TYPE_OCCLUSION ||
275 pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
276 pool->type == VK_QUERY_TYPE_TIMESTAMP ||
277 pool->type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT ||
278 pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL);
279
280 if (anv_device_is_lost(device))
281 return VK_ERROR_DEVICE_LOST;
282
283 if (pData == NULL)
284 return VK_SUCCESS;
285
286 void *data_end = pData + dataSize;
287
288 VkResult status = VK_SUCCESS;
289 for (uint32_t i = 0; i < queryCount; i++) {
290 bool available = query_is_available(pool, firstQuery + i);
291
292 if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) {
293 status = wait_for_available(device, pool, firstQuery + i);
294 if (status != VK_SUCCESS)
295 return status;
296
297 available = true;
298 }
299
300 /* From the Vulkan 1.0.42 spec:
301 *
302 * "If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
303 * both not set then no result values are written to pData for
304 * queries that are in the unavailable state at the time of the call,
305 * and vkGetQueryPoolResults returns VK_NOT_READY. However,
306 * availability state is still written to pData for those queries if
307 * VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set."
308 */
309 bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT);
310
311 uint32_t idx = 0;
312 switch (pool->type) {
313 case VK_QUERY_TYPE_OCCLUSION: {
314 uint64_t *slot = query_slot(pool, firstQuery + i);
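      /* slot[1] and slot[2] hold the begin/end PS depth counts written by
       * CmdBeginQuery/CmdEndQuery.
       */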
315 if (write_results) {
316 /* From the Vulkan 1.2.132 spec:
317 *
318 * "If VK_QUERY_RESULT_PARTIAL_BIT is set,
319 * VK_QUERY_RESULT_WAIT_BIT is not set, and the query’s status
320 * is unavailable, an intermediate result value between zero and
321 * the final result value is written to pData for that query."
322 */
323 uint64_t result = available ? slot[2] - slot[1] : 0;
324 cpu_write_query_result(pData, flags, idx, result);
325 }
326 idx++;
327 break;
328 }
329
330 case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
331 uint64_t *slot = query_slot(pool, firstQuery + i);
332 uint32_t statistics = pool->pipeline_statistics;
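      /* The idx-th enabled statistic occupies slot[idx * 2 + 1] (begin) and
       * slot[idx * 2 + 2] (end), matching the offsets used in
       * CmdBeginQuery/CmdEndQuery.
       */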
333 while (statistics) {
334 uint32_t stat = u_bit_scan(&statistics);
335 if (write_results) {
336 uint64_t result = slot[idx * 2 + 2] - slot[idx * 2 + 1];
337
338 /* WaDividePSInvocationCountBy4:HSW,BDW */
339 if ((device->info.gen == 8 || device->info.is_haswell) &&
340 (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT)
341 result >>= 2;
342
343 cpu_write_query_result(pData, flags, idx, result);
344 }
345 idx++;
346 }
347 assert(idx == util_bitcount(pool->pipeline_statistics));
348 break;
349 }
350
351 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
352 uint64_t *slot = query_slot(pool, firstQuery + i);
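      /* slot[1]/slot[2] are the begin/end SO_NUM_PRIMS_WRITTEN values and
       * slot[3]/slot[4] the begin/end SO_PRIM_STORAGE_NEEDED values (see
       * emit_xfb_query()).
       */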
353 if (write_results)
354 cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]);
355 idx++;
356 if (write_results)
357 cpu_write_query_result(pData, flags, idx, slot[4] - slot[3]);
358 idx++;
359 break;
360 }
361
362 case VK_QUERY_TYPE_TIMESTAMP: {
363 uint64_t *slot = query_slot(pool, firstQuery + i);
364 if (write_results)
365 cpu_write_query_result(pData, flags, idx, slot[1]);
366 idx++;
367 break;
368 }
369
370 case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
371 if (!write_results)
372 break;
373 const void *query_data = query_slot(pool, firstQuery + i);
374 const uint32_t *oa_begin = query_data + intel_perf_mi_rpc_offset(false);
375 const uint32_t *oa_end = query_data + intel_perf_mi_rpc_offset(true);
376 const uint32_t *rpstat_begin = query_data + intel_perf_rpstart_offset(false);
377 const uint32_t *rpstat_end = query_data + intel_perf_rpstart_offset(true);
378 struct gen_perf_query_result result;
379 struct gen_perf_query_info metric = {
380 .oa_format = (GEN_GEN >= 8 ?
381 I915_OA_FORMAT_A32u40_A4u32_B8_C8 :
382 I915_OA_FORMAT_A45_B8_C8),
383 };
384 uint32_t core_freq[2];
385 #if GEN_GEN < 9
386 core_freq[0] = ((*rpstat_begin >> 7) & 0x7f) * 1000000ULL;
387 core_freq[1] = ((*rpstat_end >> 7) & 0x7f) * 1000000ULL;
388 #else
389 core_freq[0] = ((*rpstat_begin >> 23) & 0x1ff) * 1000000ULL;
390 core_freq[1] = ((*rpstat_end >> 23) & 0x1ff) * 1000000ULL;
391 #endif
392 gen_perf_query_result_clear(&result);
393 gen_perf_query_result_accumulate(&result, &metric,
394 oa_begin, oa_end);
395 gen_perf_query_result_read_frequencies(&result, &device->info,
396 oa_begin, oa_end);
397 gen_perf_query_result_write_mdapi(pData, stride,
398 &device->info,
399 &result,
400 core_freq[0], core_freq[1]);
401 #if GEN_GEN >= 8 && GEN_GEN <= 11
402 gen_perf_query_mdapi_write_perfcntr(pData, stride, &device->info,
403 query_data + intel_perf_counter(false),
404 query_data + intel_perf_counter(true));
405 #endif
406 const uint64_t *marker = query_data + intel_perf_marker_offset();
407 gen_perf_query_mdapi_write_marker(pData, stride, &device->info, *marker);
408 break;
409 }
410
411 default:
412 unreachable("invalid pool type");
413 }
414
415 if (!write_results)
416 status = VK_NOT_READY;
417
418 if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
419 cpu_write_query_result(pData, flags, idx, available);
420
421 pData += stride;
422 if (pData >= data_end)
423 break;
424 }
425
426 return status;
427 }
428
429 static void
430 emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer,
431 struct anv_address addr)
432 {
433 cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
434 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
435
436 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
437 pc.DestinationAddressType = DAT_PPGTT;
438 pc.PostSyncOperation = WritePSDepthCount;
439 pc.DepthStallEnable = true;
440 pc.Address = addr;
441
442 if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
443 pc.CommandStreamerStallEnable = true;
444 }
445 }
446
447 static void
448 emit_query_mi_availability(struct gen_mi_builder *b,
449 struct anv_address addr,
450 bool available)
451 {
452 gen_mi_store(b, gen_mi_mem64(addr), gen_mi_imm(available));
453 }
454
455 static void
456 emit_query_pc_availability(struct anv_cmd_buffer *cmd_buffer,
457 struct anv_address addr,
458 bool available)
459 {
460 cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
461 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
462
463 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
464 pc.DestinationAddressType = DAT_PPGTT;
465 pc.PostSyncOperation = WriteImmediateData;
466 pc.Address = addr;
467 pc.ImmediateData = available;
468 }
469 }
470
471 /**
472 * Goes through a series of consecutive query indices in the given pool,
473 * zeroing all result values and marking each query as available.
474 */
475 static void
476 emit_zero_queries(struct anv_cmd_buffer *cmd_buffer,
477 struct gen_mi_builder *b, struct anv_query_pool *pool,
478 uint32_t first_index, uint32_t num_queries)
479 {
480 switch (pool->type) {
481 case VK_QUERY_TYPE_OCCLUSION:
482 case VK_QUERY_TYPE_TIMESTAMP:
483 /* These queries are written with a PIPE_CONTROL, so clear them using a
484 * PIPE_CONTROL as well to avoid having to synchronize between two types
485 * of operations.
486 */
487 assert((pool->stride % 8) == 0);
488 for (uint32_t i = 0; i < num_queries; i++) {
489 struct anv_address slot_addr =
490 anv_query_address(pool, first_index + i);
491
492 for (uint32_t qword = 1; qword < (pool->stride / 8); qword++) {
493 emit_query_pc_availability(cmd_buffer,
494 anv_address_add(slot_addr, qword * 8),
495 false);
496 }
497 emit_query_pc_availability(cmd_buffer, slot_addr, true);
498 }
499 break;
500
501 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
502 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
503 for (uint32_t i = 0; i < num_queries; i++) {
504 struct anv_address slot_addr =
505 anv_query_address(pool, first_index + i);
506 gen_mi_memset(b, anv_address_add(slot_addr, 8), 0, pool->stride - 8);
507 emit_query_mi_availability(b, slot_addr, true);
508 }
509 break;
510
511 case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL:
512 for (uint32_t i = 0; i < num_queries; i++) {
513 struct anv_address slot_addr =
514 anv_query_address(pool, first_index + i);
515 gen_mi_memset(b, anv_address_add(slot_addr, 8), 0, pool->stride - 8);
516 emit_query_mi_availability(b, slot_addr, true);
517 }
518 break;
519
520 default:
521 unreachable("Unsupported query type");
522 }
523 }
524
525 void genX(CmdResetQueryPool)(
526 VkCommandBuffer commandBuffer,
527 VkQueryPool queryPool,
528 uint32_t firstQuery,
529 uint32_t queryCount)
530 {
531 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
532 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
533
534 switch (pool->type) {
535 case VK_QUERY_TYPE_OCCLUSION:
536 case VK_QUERY_TYPE_TIMESTAMP:
537 for (uint32_t i = 0; i < queryCount; i++) {
538 emit_query_pc_availability(cmd_buffer,
539 anv_query_address(pool, firstQuery + i),
540 false);
541 }
542 break;
543
544 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
545 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
546 struct gen_mi_builder b;
547 gen_mi_builder_init(&b, &cmd_buffer->batch);
548
549 for (uint32_t i = 0; i < queryCount; i++)
550 emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), false);
551 break;
552 }
553
554 case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
555 struct gen_mi_builder b;
556 gen_mi_builder_init(&b, &cmd_buffer->batch);
557
558 for (uint32_t i = 0; i < queryCount; i++)
559 emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), false);
560 break;
561 }
562
563 default:
564 unreachable("Unsupported query type");
565 }
566 }
567
568 void genX(ResetQueryPool)(
569 VkDevice _device,
570 VkQueryPool queryPool,
571 uint32_t firstQuery,
572 uint32_t queryCount)
573 {
574 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
575
576 for (uint32_t i = 0; i < queryCount; i++) {
577 uint64_t *slot = query_slot(pool, firstQuery + i);
578 *slot = 0;
579 }
580 }
581
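/* Counter registers for each pipeline statistic, indexed by the bit position
 * of the corresponding VkQueryPipelineStatisticFlagBits flag.
 */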
582 static const uint32_t vk_pipeline_stat_to_reg[] = {
583 GENX(IA_VERTICES_COUNT_num),
584 GENX(IA_PRIMITIVES_COUNT_num),
585 GENX(VS_INVOCATION_COUNT_num),
586 GENX(GS_INVOCATION_COUNT_num),
587 GENX(GS_PRIMITIVES_COUNT_num),
588 GENX(CL_INVOCATION_COUNT_num),
589 GENX(CL_PRIMITIVES_COUNT_num),
590 GENX(PS_INVOCATION_COUNT_num),
591 GENX(HS_INVOCATION_COUNT_num),
592 GENX(DS_INVOCATION_COUNT_num),
593 GENX(CS_INVOCATION_COUNT_num),
594 };
595
596 static void
597 emit_pipeline_stat(struct gen_mi_builder *b, uint32_t stat,
598 struct anv_address addr)
599 {
600 STATIC_ASSERT(ANV_PIPELINE_STATISTICS_MASK ==
601 (1 << ARRAY_SIZE(vk_pipeline_stat_to_reg)) - 1);
602
603 assert(stat < ARRAY_SIZE(vk_pipeline_stat_to_reg));
604 gen_mi_store(b, gen_mi_mem64(addr),
605 gen_mi_reg64(vk_pipeline_stat_to_reg[stat]));
606 }
607
608 static void
609 emit_xfb_query(struct gen_mi_builder *b, uint32_t stream,
610 struct anv_address addr)
611 {
612 assert(stream < MAX_XFB_STREAMS);
613
614 gen_mi_store(b, gen_mi_mem64(anv_address_add(addr, 0)),
615 gen_mi_reg64(GENX(SO_NUM_PRIMS_WRITTEN0_num) + stream * 8));
616 gen_mi_store(b, gen_mi_mem64(anv_address_add(addr, 16)),
617 gen_mi_reg64(GENX(SO_PRIM_STORAGE_NEEDED0_num) + stream * 8));
618 }
619
620 void genX(CmdBeginQuery)(
621 VkCommandBuffer commandBuffer,
622 VkQueryPool queryPool,
623 uint32_t query,
624 VkQueryControlFlags flags)
625 {
626 genX(CmdBeginQueryIndexedEXT)(commandBuffer, queryPool, query, flags, 0);
627 }
628
629 void genX(CmdBeginQueryIndexedEXT)(
630 VkCommandBuffer commandBuffer,
631 VkQueryPool queryPool,
632 uint32_t query,
633 VkQueryControlFlags flags,
634 uint32_t index)
635 {
636 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
637 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
638 struct anv_address query_addr = anv_query_address(pool, query);
639
640 struct gen_mi_builder b;
641 gen_mi_builder_init(&b, &cmd_buffer->batch);
642
643 switch (pool->type) {
644 case VK_QUERY_TYPE_OCCLUSION:
645 emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 8));
646 break;
647
648 case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
649 /* TODO: This might only be necessary for certain stats */
650 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
651 pc.CommandStreamerStallEnable = true;
652 pc.StallAtPixelScoreboard = true;
653 }
654
655 uint32_t statistics = pool->pipeline_statistics;
656 uint32_t offset = 8;
657 while (statistics) {
658 uint32_t stat = u_bit_scan(&statistics);
659 emit_pipeline_stat(&b, stat, anv_address_add(query_addr, offset));
660 offset += 16;
661 }
662 break;
663 }
664
665 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
666 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
667 pc.CommandStreamerStallEnable = true;
668 pc.StallAtPixelScoreboard = true;
669 }
670 emit_xfb_query(&b, index, anv_address_add(query_addr, 8));
671 break;
672
673 case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
674 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
675 pc.CommandStreamerStallEnable = true;
676 pc.StallAtPixelScoreboard = true;
677 }
678 anv_batch_emit(&cmd_buffer->batch, GENX(MI_REPORT_PERF_COUNT), rpc) {
679 rpc.MemoryAddress =
680 anv_address_add(query_addr, intel_perf_mi_rpc_offset(false));
681 }
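/* Snapshot the RPSTAT register from which GetQueryPoolResults() derives the
 * GT core frequency: RPSTAT1 before gen9, RPSTAT0 on gen9+.
 */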
682 #if GEN_GEN < 9
683 gen_mi_store(&b,
684 gen_mi_mem32(anv_address_add(query_addr,
685 intel_perf_rpstart_offset(false))),
686 gen_mi_reg32(GENX(RPSTAT1_num)));
687 #else
688 gen_mi_store(&b,
689 gen_mi_mem32(anv_address_add(query_addr,
690 intel_perf_rpstart_offset(false))),
691 gen_mi_reg32(GENX(RPSTAT0_num)));
692 #endif
693 #if GEN_GEN >= 8 && GEN_GEN <= 11
694 gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr,
695 intel_perf_counter(false))),
696 gen_mi_reg64(GENX(PERFCNT1_num)));
697 gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr,
698 intel_perf_counter(false) + 8)),
699 gen_mi_reg64(GENX(PERFCNT2_num)));
700 #endif
701 break;
702 }
703
704 default:
705 unreachable("");
706 }
707 }
708
709 void genX(CmdEndQuery)(
710 VkCommandBuffer commandBuffer,
711 VkQueryPool queryPool,
712 uint32_t query)
713 {
714 genX(CmdEndQueryIndexedEXT)(commandBuffer, queryPool, query, 0);
715 }
716
717 void genX(CmdEndQueryIndexedEXT)(
718 VkCommandBuffer commandBuffer,
719 VkQueryPool queryPool,
720 uint32_t query,
721 uint32_t index)
722 {
723 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
724 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
725 struct anv_address query_addr = anv_query_address(pool, query);
726
727 struct gen_mi_builder b;
728 gen_mi_builder_init(&b, &cmd_buffer->batch);
729
730 switch (pool->type) {
731 case VK_QUERY_TYPE_OCCLUSION:
732 emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 16));
733 emit_query_pc_availability(cmd_buffer, query_addr, true);
734 break;
735
736 case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
737 /* TODO: This might only be necessary for certain stats */
738 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
739 pc.CommandStreamerStallEnable = true;
740 pc.StallAtPixelScoreboard = true;
741 }
742
743 uint32_t statistics = pool->pipeline_statistics;
744 uint32_t offset = 16;
745 while (statistics) {
746 uint32_t stat = u_bit_scan(&statistics);
747 emit_pipeline_stat(&b, stat, anv_address_add(query_addr, offset));
748 offset += 16;
749 }
750
751 emit_query_mi_availability(&b, query_addr, true);
752 break;
753 }
754
755 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
756 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
757 pc.CommandStreamerStallEnable = true;
758 pc.StallAtPixelScoreboard = true;
759 }
760
761 emit_xfb_query(&b, index, anv_address_add(query_addr, 16));
762 emit_query_mi_availability(&b, query_addr, true);
763 break;
764
765 case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
766 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
767 pc.CommandStreamerStallEnable = true;
768 pc.StallAtPixelScoreboard = true;
769 }
770 uint32_t marker_offset = intel_perf_marker_offset();
771 gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, marker_offset)),
772 gen_mi_imm(cmd_buffer->intel_perf_marker));
773 #if GEN_GEN >= 8 && GEN_GEN <= 11
774 gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, intel_perf_counter(true))),
775 gen_mi_reg64(GENX(PERFCNT1_num)));
776 gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, intel_perf_counter(true) + 8)),
777 gen_mi_reg64(GENX(PERFCNT2_num)));
778 #endif
779 #if GEN_GEN < 9
780 gen_mi_store(&b,
781 gen_mi_mem32(anv_address_add(query_addr,
782 intel_perf_rpstart_offset(true))),
783 gen_mi_reg32(GENX(RPSTAT1_num)));
784 #else
785 gen_mi_store(&b,
786 gen_mi_mem32(anv_address_add(query_addr,
787 intel_perf_rpstart_offset(true))),
788 gen_mi_reg32(GENX(RPSTAT0_num)));
789 #endif
790 /* Position the last OA snapshot at the beginning of the query so that
791 * we can tell whether it's ready.
792 */
793 anv_batch_emit(&cmd_buffer->batch, GENX(MI_REPORT_PERF_COUNT), rpc) {
794 rpc.MemoryAddress = anv_address_add(query_addr,
795 intel_perf_mi_rpc_offset(true));
796 rpc.ReportID = 0xdeadbeef; /* This goes in the first dword */
797 }
798 emit_query_mi_availability(&b, query_addr, true);
799 break;
800 }
801
802 default:
803 unreachable("");
804 }
805
806 /* When multiview is active, the spec requires that N consecutive query
807 * indices are used, where N is the number of active views in the subpass.
808 * The spec allows us to write results to only one of the queries, but we
809 * still need to manage result availability for all the query indices.
810 * Since we only emit a single query for all active views in the first
811 * index, mark the remaining query indices as already available with
812 * result 0.
813 */
814 if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
815 const uint32_t num_queries =
816 util_bitcount(cmd_buffer->state.subpass->view_mask);
817 if (num_queries > 1)
818 emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1);
819 }
820 }
821
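/* MMIO address of the command streamer TIMESTAMP register, read below for
 * top-of-pipe timestamp writes.
 */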
822 #define TIMESTAMP 0x2358
823
824 void genX(CmdWriteTimestamp)(
825 VkCommandBuffer commandBuffer,
826 VkPipelineStageFlagBits pipelineStage,
827 VkQueryPool queryPool,
828 uint32_t query)
829 {
830 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
831 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
832 struct anv_address query_addr = anv_query_address(pool, query);
833
834 assert(pool->type == VK_QUERY_TYPE_TIMESTAMP);
835
836 struct gen_mi_builder b;
837 gen_mi_builder_init(&b, &cmd_buffer->batch);
838
839 switch (pipelineStage) {
840 case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
841 gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, 8)),
842 gen_mi_reg64(TIMESTAMP));
843 break;
844
845 default:
846 /* Everything else is bottom-of-pipe */
847 cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
848 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
849
850 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
851 pc.DestinationAddressType = DAT_PPGTT;
852 pc.PostSyncOperation = WriteTimestamp;
853 pc.Address = anv_address_add(query_addr, 8);
854
855 if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
856 pc.CommandStreamerStallEnable = true;
857 }
858 break;
859 }
860
861 emit_query_pc_availability(cmd_buffer, query_addr, true);
862
863 /* When multiview is active, the spec requires that N consecutive query
864 * indices are used, where N is the number of active views in the subpass.
865 * The spec allows us to write results to only one of the queries, but we
866 * still need to manage result availability for all the query indices.
867 * Since we only emit a single query for all active views in the first
868 * index, mark the remaining query indices as already available with
869 * result 0.
870 */
871 if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
872 const uint32_t num_queries =
873 util_bitcount(cmd_buffer->state.subpass->view_mask);
874 if (num_queries > 1)
875 emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1);
876 }
877 }
878
879 #if GEN_GEN > 7 || GEN_IS_HASWELL
880
881 #if GEN_GEN >= 8 || GEN_IS_HASWELL
882
883 #define MI_PREDICATE_SRC0 0x2400
884 #define MI_PREDICATE_SRC1 0x2408
885 #define MI_PREDICATE_RESULT 0x2418
886
887 /**
888 * Writes the result of a query to dst_addr if the value at poll_addr is
889 * equal to the reference value.
890 */
891 static void
892 gpu_write_query_result_cond(struct anv_cmd_buffer *cmd_buffer,
893 struct gen_mi_builder *b,
894 struct anv_address poll_addr,
895 struct anv_address dst_addr,
896 uint64_t ref_value,
897 VkQueryResultFlags flags,
898 uint32_t value_index,
899 struct gen_mi_value query_result)
900 {
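   /* Load the 64-bit value at poll_addr into MI_PREDICATE_SRC0 and ref_value
    * into MI_PREDICATE_SRC1, then set the predicate if the two are equal.
    * The predicated stores below only execute when the comparison matched.
    */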
901 gen_mi_store(b, gen_mi_reg64(MI_PREDICATE_SRC0), gen_mi_mem64(poll_addr));
902 gen_mi_store(b, gen_mi_reg64(MI_PREDICATE_SRC1), gen_mi_imm(ref_value));
903 anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
904 mip.LoadOperation = LOAD_LOAD;
905 mip.CombineOperation = COMBINE_SET;
906 mip.CompareOperation = COMPARE_SRCS_EQUAL;
907 }
908
909 if (flags & VK_QUERY_RESULT_64_BIT) {
910 struct anv_address res_addr = anv_address_add(dst_addr, value_index * 8);
911 gen_mi_store_if(b, gen_mi_mem64(res_addr), query_result);
912 } else {
913 struct anv_address res_addr = anv_address_add(dst_addr, value_index * 4);
914 gen_mi_store_if(b, gen_mi_mem32(res_addr), query_result);
915 }
916 }
917
918 #endif /* GEN_GEN >= 8 || GEN_IS_HASWELL */
919
920 static void
921 gpu_write_query_result(struct gen_mi_builder *b,
922 struct anv_address dst_addr,
923 VkQueryResultFlags flags,
924 uint32_t value_index,
925 struct gen_mi_value query_result)
926 {
927 if (flags & VK_QUERY_RESULT_64_BIT) {
928 struct anv_address res_addr = anv_address_add(dst_addr, value_index * 8);
929 gen_mi_store(b, gen_mi_mem64(res_addr), query_result);
930 } else {
931 struct anv_address res_addr = anv_address_add(dst_addr, value_index * 4);
932 gen_mi_store(b, gen_mi_mem32(res_addr), query_result);
933 }
934 }
935
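/* Returns (end - begin), where begin is the 64-bit value at addr and end is
 * the 64-bit value at addr + 8, computed with MI commands.
 */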
936 static struct gen_mi_value
937 compute_query_result(struct gen_mi_builder *b, struct anv_address addr)
938 {
939 return gen_mi_isub(b, gen_mi_mem64(anv_address_add(addr, 8)),
940 gen_mi_mem64(anv_address_add(addr, 0)));
941 }
942
943 void genX(CmdCopyQueryPoolResults)(
944 VkCommandBuffer commandBuffer,
945 VkQueryPool queryPool,
946 uint32_t firstQuery,
947 uint32_t queryCount,
948 VkBuffer destBuffer,
949 VkDeviceSize destOffset,
950 VkDeviceSize destStride,
951 VkQueryResultFlags flags)
952 {
953 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
954 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
955 ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);
956
957 struct gen_mi_builder b;
958 gen_mi_builder_init(&b, &cmd_buffer->batch);
959 struct gen_mi_value result;
960
961 /* If render target writes are ongoing, request a render target cache flush
962 * to ensure proper ordering of the commands from the 3d pipe and the
963 * command streamer.
964 */
965 if (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_RENDER_TARGET_BUFFER_WRITES) {
966 cmd_buffer->state.pending_pipe_bits |=
967 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
968 }
969
970 if ((flags & VK_QUERY_RESULT_WAIT_BIT) ||
971 (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_FLUSH_BITS) ||
972 /* Occlusion & timestamp queries are written using a PIPE_CONTROL, and
973 * because we're about to copy their values using MI commands, we need to
974 * stall the command streamer to make sure the PIPE_CONTROL writes have
975 * landed; otherwise we could see inconsistent values & availability.
976 *
977 * From the vulkan spec:
978 *
979 * "vkCmdCopyQueryPoolResults is guaranteed to see the effect of
980 * previous uses of vkCmdResetQueryPool in the same queue, without
981 * any additional synchronization."
982 */
983 pool->type == VK_QUERY_TYPE_OCCLUSION ||
984 pool->type == VK_QUERY_TYPE_TIMESTAMP) {
985 cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT;
986 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
987 }
988
989 struct anv_address dest_addr = anv_address_add(buffer->address, destOffset);
990 for (uint32_t i = 0; i < queryCount; i++) {
991 struct anv_address query_addr = anv_query_address(pool, firstQuery + i);
992 uint32_t idx = 0;
993 switch (pool->type) {
994 case VK_QUERY_TYPE_OCCLUSION:
995 result = compute_query_result(&b, anv_address_add(query_addr, 8));
996 #if GEN_GEN >= 8 || GEN_IS_HASWELL
997 /* Like in the case of vkGetQueryPoolResults, if the query is
998 * unavailable and the VK_QUERY_RESULT_PARTIAL_BIT flag is set,
999 * conservatively write 0 as the query result. If the
1000 * VK_QUERY_RESULT_PARTIAL_BIT isn't set, don't write any value.
1001 */
1002 gpu_write_query_result_cond(cmd_buffer, &b, query_addr, dest_addr,
1003 1 /* available */, flags, idx, result);
1004 if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
1005 gpu_write_query_result_cond(cmd_buffer, &b, query_addr, dest_addr,
1006 0 /* unavailable */, flags, idx, gen_mi_imm(0));
1007 }
1008 idx++;
1009 #else /* GEN_GEN < 8 && !GEN_IS_HASWELL */
1010 gpu_write_query_result(&b, dest_addr, flags, idx++, result);
1011 #endif
1012 break;
1013
1014 case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
1015 uint32_t statistics = pool->pipeline_statistics;
1016 while (statistics) {
1017 uint32_t stat = u_bit_scan(&statistics);
1018
1019 result = compute_query_result(&b, anv_address_add(query_addr,
1020 idx * 16 + 8));
1021
1022 /* WaDividePSInvocationCountBy4:HSW,BDW */
1023 if ((cmd_buffer->device->info.gen == 8 ||
1024 cmd_buffer->device->info.is_haswell) &&
1025 (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) {
1026 result = gen_mi_ushr32_imm(&b, result, 2);
1027 }
1028
1029 gpu_write_query_result(&b, dest_addr, flags, idx++, result);
1030 }
1031 assert(idx == util_bitcount(pool->pipeline_statistics));
1032 break;
1033 }
1034
1035 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1036 result = compute_query_result(&b, anv_address_add(query_addr, 8));
1037 gpu_write_query_result(&b, dest_addr, flags, idx++, result);
1038 result = compute_query_result(&b, anv_address_add(query_addr, 24));
1039 gpu_write_query_result(&b, dest_addr, flags, idx++, result);
1040 break;
1041
1042 case VK_QUERY_TYPE_TIMESTAMP:
1043 result = gen_mi_mem64(anv_address_add(query_addr, 8));
1044 gpu_write_query_result(&b, dest_addr, flags, 0, result);
1045 break;
1046
1047 default:
1048 unreachable("unhandled query type");
1049 }
1050
1051 if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
1052 gpu_write_query_result(&b, dest_addr, flags, idx,
1053 gen_mi_mem64(query_addr));
1054 }
1055
1056 dest_addr = anv_address_add(dest_addr, destStride);
1057 }
1058 }
1059
1060 #else
1061 void genX(CmdCopyQueryPoolResults)(
1062 VkCommandBuffer commandBuffer,
1063 VkQueryPool queryPool,
1064 uint32_t firstQuery,
1065 uint32_t queryCount,
1066 VkBuffer destBuffer,
1067 VkDeviceSize destOffset,
1068 VkDeviceSize destStride,
1069 VkQueryResultFlags flags)
1070 {
1071 anv_finishme("Queries not yet supported on Ivy Bridge");
1072 }
1073 #endif