/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <assert.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>

#include "anv_private.h"

#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"

/* We reserve GPR 14 and 15 for conditional rendering */
#define GEN_MI_BUILDER_NUM_ALLOC_GPRS 14
#define __gen_get_batch_dwords anv_batch_emit_dwords
#define __gen_address_offset anv_address_add
#include "common/gen_mi_builder.h"

VkResult genX(CreateQueryPool)(
    VkDevice                                    _device,
    const VkQueryPoolCreateInfo*                pCreateInfo,
    const VkAllocationCallbacks*                pAllocator,
    VkQueryPool*                                pQueryPool)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   const struct anv_physical_device *pdevice = &device->instance->physicalDevice;
   struct anv_query_pool *pool;
   VkResult result;

   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);

   /* Query pool slots are made up of some number of 64-bit values packed
    * tightly together. The first 64-bit value is always the "available" bit
    * which is 0 when the query is unavailable and 1 when it is available.
    * The 64-bit values that follow are determined by the type of query.
    */
   uint32_t uint64s_per_slot = 1;

   VkQueryPipelineStatisticFlags pipeline_statistics = 0;
   switch (pCreateInfo->queryType) {
   case VK_QUERY_TYPE_OCCLUSION:
      /* Occlusion queries have two values: begin and end. */
      uint64s_per_slot += 2;
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
      /* Timestamps just have the one timestamp value */
      uint64s_per_slot += 1;
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      pipeline_statistics = pCreateInfo->pipelineStatistics;
      /* We're going to trust this field implicitly so we need to ensure that
       * no unhandled extension bits leak in.
       */
      pipeline_statistics &= ANV_PIPELINE_STATISTICS_MASK;

      /* Statistics queries have a begin and an end value for every statistic */
      uint64s_per_slot += 2 * util_bitcount(pipeline_statistics);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      /* Transform feedback queries are 4 values: begin/end pairs for
       * primitives written and primitives needed.
       */
      uint64s_per_slot += 4;
      break;
   default:
      assert(!"Invalid query type");
   }
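
   /* For reference, the layouts this produces (derived from the cases above
    * and from how the slots are read back in GetQueryPoolResults() below):
    * an occlusion slot is three uint64s, [0] = available, [1] = begin depth
    * count, [2] = end depth count, so its result is slot[2] - slot[1].  A
    * statistics slot is availability plus a begin/end pair per enabled
    * statistic, and a transform feedback slot is availability plus begin/end
    * pairs for primitives written and primitives needed.
    */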

   pool = vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
                    VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (pool == NULL)
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

   pool->type = pCreateInfo->queryType;
   pool->pipeline_statistics = pipeline_statistics;
   pool->stride = uint64s_per_slot * sizeof(uint64_t);
   pool->slots = pCreateInfo->queryCount;

   uint64_t size = pool->slots * pool->stride;
   result = anv_bo_init_new(&pool->bo, device, size);
   if (result != VK_SUCCESS)
      goto fail;

   if (pdevice->supports_48bit_addresses)
      pool->bo.flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS;

   if (pdevice->use_softpin)
      pool->bo.flags |= EXEC_OBJECT_PINNED;

   if (pdevice->has_exec_async)
      pool->bo.flags |= EXEC_OBJECT_ASYNC;

   anv_vma_alloc(device, &pool->bo);

   /* For query pools, we set the caching mode to I915_CACHING_CACHED. On LLC
    * platforms, this does nothing. On non-LLC platforms, this means snooping
    * which comes at a slight cost. However, the buffers aren't big, won't be
    * written frequently, and trying to handle the flushing manually without
    * doing too much flushing is extremely painful.
    */
   anv_gem_set_caching(device, pool->bo.gem_handle, I915_CACHING_CACHED);

   pool->bo.map = anv_gem_mmap(device, pool->bo.gem_handle, 0, size, 0);

   *pQueryPool = anv_query_pool_to_handle(pool);

   return VK_SUCCESS;

 fail:
   vk_free2(&device->alloc, pAllocator, pool);

   return result;
}

void genX(DestroyQueryPool)(
    VkDevice                                    _device,
    VkQueryPool                                 _pool,
    const VkAllocationCallbacks*                pAllocator)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_query_pool, pool, _pool);

   if (!pool)
      return;

   anv_gem_munmap(pool->bo.map, pool->bo.size);
   anv_vma_free(device, &pool->bo);
   anv_gem_close(device, pool->bo.gem_handle);
   vk_free2(&device->alloc, pAllocator, pool);
}

static struct anv_address
anv_query_address(struct anv_query_pool *pool, uint32_t query)
{
   return (struct anv_address) {
      .bo = &pool->bo,
      .offset = query * pool->stride,
   };
}

static void
cpu_write_query_result(void *dst_slot, VkQueryResultFlags flags,
                       uint32_t value_index, uint64_t result)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      uint64_t *dst64 = dst_slot;
      dst64[value_index] = result;
   } else {
      uint32_t *dst32 = dst_slot;
      dst32[value_index] = result;
   }
}

static bool
query_is_available(uint64_t *slot)
{
   return *(volatile uint64_t *)slot;
}

static VkResult
wait_for_available(struct anv_device *device,
                   struct anv_query_pool *pool, uint64_t *slot)
{
   while (true) {
      if (query_is_available(slot))
         return VK_SUCCESS;

      int ret = anv_gem_busy(device, pool->bo.gem_handle);
      if (ret == 1) {
         /* The BO is still busy, keep waiting. */
         continue;
      } else if (ret == -1) {
         /* We don't know the real error. */
         return anv_device_set_lost(device, "gem wait failed: %m");
      } else {
         assert(ret == 0);
         /* The BO is no longer busy. */
         if (query_is_available(slot)) {
            return VK_SUCCESS;
         } else {
            VkResult status = anv_device_query_status(device);
            if (status != VK_SUCCESS)
               return status;

            /* If we haven't seen availability yet, then we never will. This
             * can only happen if we have a client error where they call
             * GetQueryPoolResults on a query that they haven't submitted to
             * the GPU yet. The spec allows us to do anything in this case,
             * but returning VK_SUCCESS doesn't seem right and we shouldn't
             * just keep spinning.
             */
            return VK_NOT_READY;
         }
      }
   }
}

VkResult genX(GetQueryPoolResults)(
    VkDevice                                    _device,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    size_t                                      dataSize,
    void*                                       pData,
    VkDeviceSize                                stride,
    VkQueryResultFlags                          flags)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   assert(pool->type == VK_QUERY_TYPE_OCCLUSION ||
          pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
          pool->type == VK_QUERY_TYPE_TIMESTAMP ||
          pool->type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT);

   if (anv_device_is_lost(device))
      return VK_ERROR_DEVICE_LOST;

   if (pData == NULL)
      return VK_SUCCESS;

   void *data_end = pData + dataSize;

   VkResult status = VK_SUCCESS;
   for (uint32_t i = 0; i < queryCount; i++) {
      uint64_t *slot = pool->bo.map + (firstQuery + i) * pool->stride;

      /* Availability is always at the start of the slot */
      bool available = slot[0];

      if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) {
         status = wait_for_available(device, pool, slot);
         if (status != VK_SUCCESS)
            return status;

         available = true;
      }

      /* From the Vulkan 1.0.42 spec:
       *
       *    "If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
       *    both not set then no result values are written to pData for
       *    queries that are in the unavailable state at the time of the call,
       *    and vkGetQueryPoolResults returns VK_NOT_READY. However,
       *    availability state is still written to pData for those queries if
       *    VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set."
       */
      bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT);

      uint32_t idx = 0;
      switch (pool->type) {
      case VK_QUERY_TYPE_OCCLUSION:
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]);
         idx++;
         break;

      case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
         uint32_t statistics = pool->pipeline_statistics;
         while (statistics) {
            uint32_t stat = u_bit_scan(&statistics);
            if (write_results) {
               uint64_t result = slot[idx * 2 + 2] - slot[idx * 2 + 1];

               /* WaDividePSInvocationCountBy4:HSW,BDW */
               if ((device->info.gen == 8 || device->info.is_haswell) &&
                   (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT)
                  result >>= 2;

               cpu_write_query_result(pData, flags, idx, result);
            }
            idx++;
         }
         assert(idx == util_bitcount(pool->pipeline_statistics));
         break;
      }

      case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
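         /* The first result is primitives written, the second is primitives
          * needed (storage needed); each is computed as end minus begin.
          * See emit_xfb_query() below for the slot layout.
          */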
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]);
         idx++;
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[4] - slot[3]);
         idx++;
         break;

      case VK_QUERY_TYPE_TIMESTAMP:
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[1]);
         idx++;
         break;

      default:
         unreachable("invalid pool type");
      }

      if (!write_results)
         status = VK_NOT_READY;

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
         cpu_write_query_result(pData, flags, idx, available);

      pData += stride;
      if (pData >= data_end)
         break;
   }

   return status;
}

static void
emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer,
                    struct anv_address addr)
{
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType = DAT_PPGTT;
      pc.PostSyncOperation = WritePSDepthCount;
      pc.DepthStallEnable = true;
      pc.Address = addr;

      if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
         pc.CommandStreamerStallEnable = true;
   }
}

static void
emit_query_mi_availability(struct gen_mi_builder *b,
                           struct anv_address addr,
                           bool available)
{
   gen_mi_store(b, gen_mi_mem64(addr), gen_mi_imm(available));
}

static void
emit_query_pc_availability(struct anv_cmd_buffer *cmd_buffer,
                           struct anv_address addr,
                           bool available)
{
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType = DAT_PPGTT;
      pc.PostSyncOperation = WriteImmediateData;
      pc.Address = addr;
      pc.ImmediateData = available;
   }
}

/**
 * Goes through a series of consecutive query indices in the given pool,
 * setting all element values to 0 and marking them available.
 */
static void
emit_zero_queries(struct anv_cmd_buffer *cmd_buffer,
                  struct gen_mi_builder *b, struct anv_query_pool *pool,
                  uint32_t first_index, uint32_t num_queries)
{
   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
      /* These queries are written with a PIPE_CONTROL, so clear them with a
       * PIPE_CONTROL as well so that we don't have to synchronize between
       * two types of operations.
       */
      assert((pool->stride % 8) == 0);
      for (uint32_t i = 0; i < num_queries; i++) {
         struct anv_address slot_addr =
            anv_query_address(pool, first_index + i);

         for (uint32_t qword = 1; qword < (pool->stride / 8); qword++) {
            emit_query_pc_availability(cmd_buffer,
                                       anv_address_add(slot_addr, qword * 8),
                                       false);
         }
         emit_query_pc_availability(cmd_buffer, slot_addr, true);
      }
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      for (uint32_t i = 0; i < num_queries; i++) {
         struct anv_address slot_addr =
            anv_query_address(pool, first_index + i);
         gen_mi_memset(b, anv_address_add(slot_addr, 8), 0, pool->stride - 8);
         emit_query_mi_availability(b, slot_addr, true);
      }
      break;

   default:
      unreachable("Unsupported query type");
   }
}

void genX(CmdResetQueryPool)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
      for (uint32_t i = 0; i < queryCount; i++) {
         emit_query_pc_availability(cmd_buffer,
                                    anv_query_address(pool, firstQuery + i),
                                    false);
      }
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
      struct gen_mi_builder b;
      gen_mi_builder_init(&b, &cmd_buffer->batch);

      for (uint32_t i = 0; i < queryCount; i++)
         emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), false);
      break;
   }

   default:
      unreachable("Unsupported query type");
   }
}

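/* Host-side query reset (VK_EXT_host_query_reset): a query is reset simply by
 * clearing its availability qword from the CPU; the remaining values in the
 * slot are rewritten the next time the query is used.
 */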
void genX(ResetQueryPoolEXT)(
    VkDevice                                    _device,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount)
{
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   for (uint32_t i = 0; i < queryCount; i++) {
      uint64_t *slot = pool->bo.map + (firstQuery + i) * pool->stride;
      *slot = 0;
   }
}

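/* Indexed by the bit position of the corresponding
 * VK_QUERY_PIPELINE_STATISTIC_* flag (see emit_pipeline_stat() below).
 */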
static const uint32_t vk_pipeline_stat_to_reg[] = {
   GENX(IA_VERTICES_COUNT_num),
   GENX(IA_PRIMITIVES_COUNT_num),
   GENX(VS_INVOCATION_COUNT_num),
   GENX(GS_INVOCATION_COUNT_num),
   GENX(GS_PRIMITIVES_COUNT_num),
   GENX(CL_INVOCATION_COUNT_num),
   GENX(CL_PRIMITIVES_COUNT_num),
   GENX(PS_INVOCATION_COUNT_num),
   GENX(HS_INVOCATION_COUNT_num),
   GENX(DS_INVOCATION_COUNT_num),
   GENX(CS_INVOCATION_COUNT_num),
};

static void
emit_pipeline_stat(struct gen_mi_builder *b, uint32_t stat,
                   struct anv_address addr)
{
   STATIC_ASSERT(ANV_PIPELINE_STATISTICS_MASK ==
                 (1 << ARRAY_SIZE(vk_pipeline_stat_to_reg)) - 1);

   assert(stat < ARRAY_SIZE(vk_pipeline_stat_to_reg));
   gen_mi_store(b, gen_mi_mem64(addr),
                gen_mi_reg64(vk_pipeline_stat_to_reg[stat]));
}

static void
emit_xfb_query(struct gen_mi_builder *b, uint32_t stream,
               struct anv_address addr)
{
   assert(stream < MAX_XFB_STREAMS);

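   /* Each counter gets a begin/end pair of qwords in the query slot: the
    * 16-byte spacing between the two stores below leaves room for the
    * matching end-of-query value, which CmdEndQueryIndexedEXT() writes 8
    * bytes further in.
    */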
   gen_mi_store(b, gen_mi_mem64(anv_address_add(addr, 0)),
                gen_mi_reg64(GENX(SO_NUM_PRIMS_WRITTEN0_num) + stream * 8));
   gen_mi_store(b, gen_mi_mem64(anv_address_add(addr, 16)),
                gen_mi_reg64(GENX(SO_PRIM_STORAGE_NEEDED0_num) + stream * 8));
}

void genX(CmdBeginQuery)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query,
    VkQueryControlFlags                         flags)
{
   genX(CmdBeginQueryIndexedEXT)(commandBuffer, queryPool, query, flags, 0);
}

void genX(CmdBeginQueryIndexedEXT)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query,
    VkQueryControlFlags                         flags,
    uint32_t                                    index)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   struct anv_address query_addr = anv_query_address(pool, query);

   struct gen_mi_builder b;
   gen_mi_builder_init(&b, &cmd_buffer->batch);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 8));
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      /* TODO: This might only be necessary for certain stats */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      uint32_t statistics = pool->pipeline_statistics;
      uint32_t offset = 8;
      while (statistics) {
         uint32_t stat = u_bit_scan(&statistics);
         emit_pipeline_stat(&b, stat, anv_address_add(query_addr, offset));
         offset += 16;
      }
      break;
   }

   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
      emit_xfb_query(&b, index, anv_address_add(query_addr, 8));
      break;

   default:
      unreachable("");
   }
}

void genX(CmdEndQuery)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query)
{
   genX(CmdEndQueryIndexedEXT)(commandBuffer, queryPool, query, 0);
}

void genX(CmdEndQueryIndexedEXT)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query,
    uint32_t                                    index)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   struct anv_address query_addr = anv_query_address(pool, query);

   struct gen_mi_builder b;
   gen_mi_builder_init(&b, &cmd_buffer->batch);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 16));
      emit_query_pc_availability(cmd_buffer, query_addr, true);
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      /* TODO: This might only be necessary for certain stats */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      uint32_t statistics = pool->pipeline_statistics;
      uint32_t offset = 16;
      while (statistics) {
         uint32_t stat = u_bit_scan(&statistics);
         emit_pipeline_stat(&b, stat, anv_address_add(query_addr, offset));
         offset += 16;
      }

      emit_query_mi_availability(&b, query_addr, true);
      break;
   }

   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      emit_xfb_query(&b, index, anv_address_add(query_addr, 16));
      emit_query_mi_availability(&b, query_addr, true);
      break;

   default:
      unreachable("");
   }

   /* When multiview is active, the spec requires that N consecutive query
    * indices are used, where N is the number of active views in the subpass.
    * The spec allows us to write the results to only one of the queries, but
    * we still need to manage result availability for all the query indices.
    * Since we emit a single query for all active views in the first index,
    * mark the remaining query indices as already available with result 0.
    */
   if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
      const uint32_t num_queries =
         util_bitcount(cmd_buffer->state.subpass->view_mask);
      if (num_queries > 1)
         emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1);
   }
}

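/* MMIO offset of the command streamer's TIMESTAMP register, read directly
 * through the MI builder for the top-of-pipe timestamp path below.
 */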
#define TIMESTAMP 0x2358

void genX(CmdWriteTimestamp)(
    VkCommandBuffer                             commandBuffer,
    VkPipelineStageFlagBits                     pipelineStage,
    VkQueryPool                                 queryPool,
    uint32_t                                    query)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   struct anv_address query_addr = anv_query_address(pool, query);

   assert(pool->type == VK_QUERY_TYPE_TIMESTAMP);

   struct gen_mi_builder b;
   gen_mi_builder_init(&b, &cmd_buffer->batch);

   switch (pipelineStage) {
   case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
      gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, 8)),
                   gen_mi_reg64(TIMESTAMP));
      break;

   default:
      /* Everything else is bottom-of-pipe */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.DestinationAddressType = DAT_PPGTT;
         pc.PostSyncOperation = WriteTimestamp;
         pc.Address = anv_address_add(query_addr, 8);

         if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
            pc.CommandStreamerStallEnable = true;
      }
      break;
   }

   emit_query_pc_availability(cmd_buffer, query_addr, true);

   /* When multiview is active, the spec requires that N consecutive query
    * indices are used, where N is the number of active views in the subpass.
    * The spec allows us to write the results to only one of the queries, but
    * we still need to manage result availability for all the query indices.
    * Since we emit a single query for all active views in the first index,
    * mark the remaining query indices as already available with result 0.
    */
   if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
      const uint32_t num_queries =
         util_bitcount(cmd_buffer->state.subpass->view_mask);
      if (num_queries > 1)
         emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1);
   }
}

#if GEN_GEN > 7 || GEN_IS_HASWELL

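/* Writes a single query value into the destination buffer, honoring the
 * 32-bit vs. 64-bit packing requested through VK_QUERY_RESULT_64_BIT.
 */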
static void
gpu_write_query_result(struct gen_mi_builder *b,
                       struct anv_address dst_addr,
                       VkQueryResultFlags flags,
                       uint32_t value_index,
                       struct gen_mi_value query_result)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      struct anv_address res_addr = anv_address_add(dst_addr, value_index * 8);
      gen_mi_store(b, gen_mi_mem64(res_addr), query_result);
   } else {
      struct anv_address res_addr = anv_address_add(dst_addr, value_index * 4);
      gen_mi_store(b, gen_mi_mem32(res_addr), query_result);
   }
}

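/* Computes end minus begin for a counter stored as two consecutive qwords,
 * with addr pointing at the begin value.
 */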
static struct gen_mi_value
compute_query_result(struct gen_mi_builder *b, struct anv_address addr)
{
   return gen_mi_isub(b, gen_mi_mem64(anv_address_add(addr, 8)),
                      gen_mi_mem64(anv_address_add(addr, 0)));
}

void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    VkBuffer                                    destBuffer,
    VkDeviceSize                                destOffset,
    VkDeviceSize                                destStride,
    VkQueryResultFlags                          flags)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);

   struct gen_mi_builder b;
   gen_mi_builder_init(&b, &cmd_buffer->batch);
   struct gen_mi_value result;

   /* If render target writes are ongoing, request a render target cache flush
    * to ensure proper ordering of the commands from the 3d pipe and the
    * command streamer.
    */
   if (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_RENDER_TARGET_BUFFER_WRITES) {
      cmd_buffer->state.pending_pipe_bits |=
         ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
   }

   if ((flags & VK_QUERY_RESULT_WAIT_BIT) ||
       (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_FLUSH_BITS) ||
       /* Occlusion & timestamp queries are written using a PIPE_CONTROL, and
        * because we're about to copy the values with MI commands, we need to
        * stall the command streamer to make sure the PIPE_CONTROL values have
        * landed; otherwise we could see inconsistent values & availability.
        *
        * From the Vulkan spec:
        *
        *    "vkCmdCopyQueryPoolResults is guaranteed to see the effect of
        *    previous uses of vkCmdResetQueryPool in the same queue, without
        *    any additional synchronization."
        */
       pool->type == VK_QUERY_TYPE_OCCLUSION ||
       pool->type == VK_QUERY_TYPE_TIMESTAMP) {
      cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT;
      genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
   }

   struct anv_address dest_addr = anv_address_add(buffer->address, destOffset);
   for (uint32_t i = 0; i < queryCount; i++) {
      struct anv_address query_addr = anv_query_address(pool, firstQuery + i);
      uint32_t idx = 0;
      switch (pool->type) {
      case VK_QUERY_TYPE_OCCLUSION:
         result = compute_query_result(&b, anv_address_add(query_addr, 8));
         gpu_write_query_result(&b, dest_addr, flags, idx++, result);
         break;

      case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
         uint32_t statistics = pool->pipeline_statistics;
         while (statistics) {
            uint32_t stat = u_bit_scan(&statistics);

            result = compute_query_result(&b, anv_address_add(query_addr,
                                                              idx * 16 + 8));

            /* WaDividePSInvocationCountBy4:HSW,BDW */
            if ((cmd_buffer->device->info.gen == 8 ||
                 cmd_buffer->device->info.is_haswell) &&
                (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) {
               result = gen_mi_ushr32_imm(&b, result, 2);
            }

            gpu_write_query_result(&b, dest_addr, flags, idx++, result);
         }
         assert(idx == util_bitcount(pool->pipeline_statistics));
         break;
      }

      case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
         result = compute_query_result(&b, anv_address_add(query_addr, 8));
         gpu_write_query_result(&b, dest_addr, flags, idx++, result);
         result = compute_query_result(&b, anv_address_add(query_addr, 24));
         gpu_write_query_result(&b, dest_addr, flags, idx++, result);
         break;

      case VK_QUERY_TYPE_TIMESTAMP:
         result = gen_mi_mem64(anv_address_add(query_addr, 8));
         gpu_write_query_result(&b, dest_addr, flags, 0, result);
         break;

      default:
         unreachable("unhandled query type");
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         gpu_write_query_result(&b, dest_addr, flags, idx,
                                gen_mi_mem64(query_addr));
      }

      dest_addr = anv_address_add(dest_addr, destStride);
   }
}

#else
void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    VkBuffer                                    destBuffer,
    VkDeviceSize                                destOffset,
    VkDeviceSize                                destStride,
    VkQueryResultFlags                          flags)
{
   anv_finishme("Queries not yet supported on Ivy Bridge");
}
#endif