/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <assert.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>

#include "anv_private.h"

#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"

VkResult genX(CreateQueryPool)(
    VkDevice _device,
    const VkQueryPoolCreateInfo* pCreateInfo,
    const VkAllocationCallbacks* pAllocator,
    VkQueryPool* pQueryPool)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   const struct anv_physical_device *pdevice = &device->instance->physicalDevice;
   struct anv_query_pool *pool;
   VkResult result;

   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);

   /* Query pool slots are made up of some number of 64-bit values packed
    * tightly together. The first 64-bit value is always the "available" bit
    * which is 0 when the query is unavailable and 1 when it is available.
    * The 64-bit values that follow are determined by the type of query.
    */
   uint32_t uint64s_per_slot = 1;

   VkQueryPipelineStatisticFlags pipeline_statistics = 0;
   switch (pCreateInfo->queryType) {
   case VK_QUERY_TYPE_OCCLUSION:
      /* Occlusion queries have two values: begin and end. */
      uint64s_per_slot += 2;
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
      /* Timestamps just have the one timestamp value */
      uint64s_per_slot += 1;
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      pipeline_statistics = pCreateInfo->pipelineStatistics;
      /* We're going to trust this field implicitly so we need to ensure that
       * no unhandled extension bits leak in.
       */
      pipeline_statistics &= ANV_PIPELINE_STATISTICS_MASK;

      /* Statistics queries have a begin and end snapshot for every statistic */
      uint64s_per_slot += 2 * util_bitcount(pipeline_statistics);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      /* Transform feedback queries are 4 values: begin/end for primitives
       * written and begin/end for primitives needed.
       */
      uint64s_per_slot += 4;
      break;
   default:
      assert(!"Invalid query type");
   }
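
   /* For illustration, the resulting slot layouts (in 64-bit values) are:
    *
    *    OCCLUSION:           { available, begin, end }              stride 24 B
    *    TIMESTAMP:           { available, timestamp }               stride 16 B
    *    PIPELINE_STATISTICS: { available, begin/end per statistic } stride (1 + 2 * N) * 8 B
    *    TRANSFORM_FEEDBACK:  { available, written begin/end,
    *                           needed begin/end }                   stride 40 B
    *
    * where N is the number of statistics enabled in pipelineStatistics.
    */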

   pool = vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
                    VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (pool == NULL)
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

   pool->type = pCreateInfo->queryType;
   pool->pipeline_statistics = pipeline_statistics;
   pool->stride = uint64s_per_slot * sizeof(uint64_t);
   pool->slots = pCreateInfo->queryCount;

   uint64_t size = pool->slots * pool->stride;
   result = anv_bo_init_new(&pool->bo, device, size);
   if (result != VK_SUCCESS)
      goto fail;

   if (pdevice->supports_48bit_addresses)
      pool->bo.flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS;

   if (pdevice->use_softpin)
      pool->bo.flags |= EXEC_OBJECT_PINNED;

   if (pdevice->has_exec_async)
      pool->bo.flags |= EXEC_OBJECT_ASYNC;

   anv_vma_alloc(device, &pool->bo);

   /* For query pools, we set the caching mode to I915_CACHING_CACHED. On LLC
    * platforms, this does nothing. On non-LLC platforms, this means snooping
    * which comes at a slight cost. However, the buffers aren't big, won't be
    * written frequently, and trying to handle the flushing manually without
    * doing too much flushing is extremely painful.
    */
   anv_gem_set_caching(device, pool->bo.gem_handle, I915_CACHING_CACHED);
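
   /* The snooped/LLC-coherent caching requested above is what allows
    * GetQueryPoolResults() to read results straight through the CPU mapping
    * created below, without any explicit cache flushing.
    */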

   pool->bo.map = anv_gem_mmap(device, pool->bo.gem_handle, 0, size, 0);

   *pQueryPool = anv_query_pool_to_handle(pool);

   return VK_SUCCESS;

 fail:
   vk_free2(&device->alloc, pAllocator, pool);

   return result;
}

void genX(DestroyQueryPool)(
    VkDevice _device,
    VkQueryPool _pool,
    const VkAllocationCallbacks* pAllocator)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_query_pool, pool, _pool);

   if (!pool)
      return;

   anv_gem_munmap(pool->bo.map, pool->bo.size);
   anv_vma_free(device, &pool->bo);
   anv_gem_close(device, pool->bo.gem_handle);
   vk_free2(&device->alloc, pAllocator, pool);
}

static struct anv_address
anv_query_address(struct anv_query_pool *pool, uint32_t query)
{
   return (struct anv_address) {
      .bo = &pool->bo,
      .offset = query * pool->stride,
   };
}

static void
cpu_write_query_result(void *dst_slot, VkQueryResultFlags flags,
                       uint32_t value_index, uint64_t result)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      uint64_t *dst64 = dst_slot;
      dst64[value_index] = result;
   } else {
      uint32_t *dst32 = dst_slot;
      dst32[value_index] = result;
   }
}

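/* The availability word is written by the GPU (PIPE_CONTROL with an
 * immediate-data post-sync op, or MI_STORE_DATA_IMM from CmdResetQueryPool)
 * while the CPU polls it through the coherent mapping, hence the volatile
 * read below.
 */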
static bool
query_is_available(uint64_t *slot)
{
   return *(volatile uint64_t *)slot;
}

static VkResult
wait_for_available(struct anv_device *device,
                   struct anv_query_pool *pool, uint64_t *slot)
{
   while (true) {
      if (query_is_available(slot))
         return VK_SUCCESS;

      int ret = anv_gem_busy(device, pool->bo.gem_handle);
      if (ret == 1) {
         /* The BO is still busy, keep waiting. */
         continue;
      } else if (ret == -1) {
         /* We don't know the real error. */
         return anv_device_set_lost(device, "gem wait failed: %m");
      } else {
         assert(ret == 0);
         /* The BO is no longer busy. */
         if (query_is_available(slot)) {
            return VK_SUCCESS;
         } else {
            VkResult status = anv_device_query_status(device);
            if (status != VK_SUCCESS)
               return status;

            /* If we haven't seen availability yet, then we never will. This
             * can only happen if we have a client error where they call
             * GetQueryPoolResults on a query that they haven't submitted to
             * the GPU yet. The spec allows us to do anything in this case,
             * but returning VK_SUCCESS doesn't seem right and we shouldn't
             * just keep spinning.
             */
            return VK_NOT_READY;
         }
      }
   }
}

VkResult genX(GetQueryPoolResults)(
    VkDevice _device,
    VkQueryPool queryPool,
    uint32_t firstQuery,
    uint32_t queryCount,
    size_t dataSize,
    void* pData,
    VkDeviceSize stride,
    VkQueryResultFlags flags)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   assert(pool->type == VK_QUERY_TYPE_OCCLUSION ||
          pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
          pool->type == VK_QUERY_TYPE_TIMESTAMP ||
          pool->type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT);

   if (anv_device_is_lost(device))
      return VK_ERROR_DEVICE_LOST;

   if (pData == NULL)
      return VK_SUCCESS;

   void *data_end = pData + dataSize;

   VkResult status = VK_SUCCESS;
   for (uint32_t i = 0; i < queryCount; i++) {
      uint64_t *slot = pool->bo.map + (firstQuery + i) * pool->stride;

      /* Availability is always at the start of the slot */
      bool available = slot[0];

      if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) {
         status = wait_for_available(device, pool, slot);
         if (status != VK_SUCCESS)
            return status;

         available = true;
      }

      /* From the Vulkan 1.0.42 spec:
       *
       *    "If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
       *    both not set then no result values are written to pData for
       *    queries that are in the unavailable state at the time of the call,
       *    and vkGetQueryPoolResults returns VK_NOT_READY. However,
       *    availability state is still written to pData for those queries if
       *    VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set."
       */
      bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT);

      uint32_t idx = 0;
      switch (pool->type) {
      case VK_QUERY_TYPE_OCCLUSION:
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]);
         idx++;
         break;

      case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
         uint32_t statistics = pool->pipeline_statistics;
         while (statistics) {
            uint32_t stat = u_bit_scan(&statistics);
            if (write_results) {
               uint64_t result = slot[idx * 2 + 2] - slot[idx * 2 + 1];

               /* WaDividePSInvocationCountBy4:HSW,BDW */
               if ((device->info.gen == 8 || device->info.is_haswell) &&
                   (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT)
                  result >>= 2;

               cpu_write_query_result(pData, flags, idx, result);
            }
            idx++;
         }
         assert(idx == util_bitcount(pool->pipeline_statistics));
         break;
      }

      case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]);
         idx++;
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[4] - slot[3]);
         idx++;
         break;

      case VK_QUERY_TYPE_TIMESTAMP:
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[1]);
         idx++;
         break;

      default:
         unreachable("invalid pool type");
      }

      if (!write_results)
         status = VK_NOT_READY;

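      /* Per the spec language quoted above, the availability value (when
       * requested) is written as one extra value immediately after the
       * query's result values, i.e. at index idx, and it is written even
       * when the result values themselves were not.
       */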
      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
         cpu_write_query_result(pData, flags, idx, available);

      pData += stride;
      if (pData >= data_end)
         break;
   }

   return status;
}

static void
emit_srm32(struct anv_batch *batch, struct anv_address addr, uint32_t reg)
{
   anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
      srm.MemoryAddress = addr;
      srm.RegisterAddress = reg;
   }
}

static void
emit_srm64(struct anv_batch *batch, struct anv_address addr, uint32_t reg)
{
   emit_srm32(batch, anv_address_add(addr, 0), reg + 0);
   emit_srm32(batch, anv_address_add(addr, 4), reg + 4);
}

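/* Write a snapshot of the pipeline's PS_DEPTH_COUNT (the running count of
 * samples that have passed the depth/stencil tests) to the given address.
 * Occlusion query results are the difference between an end and a begin
 * snapshot.
 */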
static void
emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer,
                    struct anv_address addr)
{
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType = DAT_PPGTT;
      pc.PostSyncOperation = WritePSDepthCount;
      pc.DepthStallEnable = true;
      pc.Address = addr;

      if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
         pc.CommandStreamerStallEnable = true;
   }
}

static void
emit_query_availability(struct anv_cmd_buffer *cmd_buffer,
                        struct anv_address addr)
{
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType = DAT_PPGTT;
      pc.PostSyncOperation = WriteImmediateData;
      pc.Address = addr;
      pc.ImmediateData = 1;
   }
}

/**
 * Goes through a series of consecutive query indices in the given pool
 * setting all element values to 0 and emitting them as available.
 */
static void
emit_zero_queries(struct anv_cmd_buffer *cmd_buffer,
                  struct anv_query_pool *pool,
                  uint32_t first_index, uint32_t num_queries)
{
   for (uint32_t i = 0; i < num_queries; i++) {
      struct anv_address slot_addr =
         anv_query_address(pool, first_index + i);
      genX(cmd_buffer_mi_memset)(cmd_buffer, anv_address_add(slot_addr, 8),
                                 0, pool->stride - 8);
      emit_query_availability(cmd_buffer, slot_addr);
   }
}

void genX(CmdResetQueryPool)(
    VkCommandBuffer commandBuffer,
    VkQueryPool queryPool,
    uint32_t firstQuery,
    uint32_t queryCount)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   for (uint32_t i = 0; i < queryCount; i++) {
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdm) {
         sdm.Address = anv_query_address(pool, firstQuery + i);
         sdm.ImmediateData = 0;
      }
   }
}

static const uint32_t vk_pipeline_stat_to_reg[] = {
   GENX(IA_VERTICES_COUNT_num),
   GENX(IA_PRIMITIVES_COUNT_num),
   GENX(VS_INVOCATION_COUNT_num),
   GENX(GS_INVOCATION_COUNT_num),
   GENX(GS_PRIMITIVES_COUNT_num),
   GENX(CL_INVOCATION_COUNT_num),
   GENX(CL_PRIMITIVES_COUNT_num),
   GENX(PS_INVOCATION_COUNT_num),
   GENX(HS_INVOCATION_COUNT_num),
   GENX(DS_INVOCATION_COUNT_num),
   GENX(CS_INVOCATION_COUNT_num),
};
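
/* The entries above are indexed by the bit position of the corresponding
 * VkQueryPipelineStatisticFlagBits flag (see the u_bit_scan() walks in this
 * file), so they must stay in flag-bit order. The STATIC_ASSERT below checks
 * that the table covers exactly ANV_PIPELINE_STATISTICS_MASK.
 */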

static void
emit_pipeline_stat(struct anv_cmd_buffer *cmd_buffer, uint32_t stat,
                   struct anv_address addr)
{
   STATIC_ASSERT(ANV_PIPELINE_STATISTICS_MASK ==
                 (1 << ARRAY_SIZE(vk_pipeline_stat_to_reg)) - 1);

   assert(stat < ARRAY_SIZE(vk_pipeline_stat_to_reg));
   emit_srm64(&cmd_buffer->batch, addr, vk_pipeline_stat_to_reg[stat]);
}

static void
emit_xfb_query(struct anv_cmd_buffer *cmd_buffer, uint32_t stream,
               struct anv_address addr)
{
   assert(stream < MAX_XFB_STREAMS);

   anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), lrm) {
      lrm.RegisterAddress = GENX(SO_NUM_PRIMS_WRITTEN0_num) + 0 + stream * 8;
      lrm.MemoryAddress = anv_address_add(addr, 0);
   }
   anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), lrm) {
      lrm.RegisterAddress = GENX(SO_NUM_PRIMS_WRITTEN0_num) + 4 + stream * 8;
      lrm.MemoryAddress = anv_address_add(addr, 4);
   }

   anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), lrm) {
      lrm.RegisterAddress = GENX(SO_PRIM_STORAGE_NEEDED0_num) + 0 + stream * 8;
      lrm.MemoryAddress = anv_address_add(addr, 16);
   }
   anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), lrm) {
      lrm.RegisterAddress = GENX(SO_PRIM_STORAGE_NEEDED0_num) + 4 + stream * 8;
      lrm.MemoryAddress = anv_address_add(addr, 20);
   }
}
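
/* emit_xfb_query() stores SO_NUM_PRIMS_WRITTEN[stream] at addr + 0 and
 * SO_PRIM_STORAGE_NEEDED[stream] at addr + 16. With begin snapshots taken at
 * slot offset 8 and end snapshots at slot offset 16 (see CmdBegin/EndQuery
 * below), the slot ends up as { available, written begin, written end,
 * needed begin, needed end }, matching the readback in GetQueryPoolResults.
 */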

void genX(CmdBeginQuery)(
    VkCommandBuffer commandBuffer,
    VkQueryPool queryPool,
    uint32_t query,
    VkQueryControlFlags flags)
{
   genX(CmdBeginQueryIndexedEXT)(commandBuffer, queryPool, query, flags, 0);
}

void genX(CmdBeginQueryIndexedEXT)(
    VkCommandBuffer commandBuffer,
    VkQueryPool queryPool,
    uint32_t query,
    VkQueryControlFlags flags,
    uint32_t index)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   struct anv_address query_addr = anv_query_address(pool, query);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 8));
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      /* TODO: This might only be necessary for certain stats */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      uint32_t statistics = pool->pipeline_statistics;
      uint32_t offset = 8;
      while (statistics) {
         uint32_t stat = u_bit_scan(&statistics);
         emit_pipeline_stat(cmd_buffer, stat,
                            anv_address_add(query_addr, offset));
         offset += 16;
      }
      break;
   }

   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
      emit_xfb_query(cmd_buffer, index, anv_address_add(query_addr, 8));
      break;

   default:
      unreachable("");
   }
}

void genX(CmdEndQuery)(
    VkCommandBuffer commandBuffer,
    VkQueryPool queryPool,
    uint32_t query)
{
   genX(CmdEndQueryIndexedEXT)(commandBuffer, queryPool, query, 0);
}

void genX(CmdEndQueryIndexedEXT)(
    VkCommandBuffer commandBuffer,
    VkQueryPool queryPool,
    uint32_t query,
    uint32_t index)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   struct anv_address query_addr = anv_query_address(pool, query);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 16));
      emit_query_availability(cmd_buffer, query_addr);
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      /* TODO: This might only be necessary for certain stats */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      uint32_t statistics = pool->pipeline_statistics;
      uint32_t offset = 16;
      while (statistics) {
         uint32_t stat = u_bit_scan(&statistics);
         emit_pipeline_stat(cmd_buffer, stat,
                            anv_address_add(query_addr, offset));
         offset += 16;
      }

      emit_query_availability(cmd_buffer, query_addr);
      break;
   }

   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      emit_xfb_query(cmd_buffer, index, anv_address_add(query_addr, 16));
      emit_query_availability(cmd_buffer, query_addr);
      break;

   default:
      unreachable("");
   }

   /* When multiview is active the spec requires that N consecutive query
    * indices are used, where N is the number of active views in the subpass.
    * The spec allows that we only write the results to one of the queries
    * but we still need to manage result availability for all the query indices.
    * Since we only emit a single query for all active views in the
    * first index, mark the other query indices as being already available
    * with result 0.
    */
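   /* For example, with view_mask = 0x7 (three active views), ending query N
    * here also marks queries N + 1 and N + 2 as available with all-zero
    * results via emit_zero_queries() below.
    */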
   if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
      const uint32_t num_queries =
         util_bitcount(cmd_buffer->state.subpass->view_mask);
      if (num_queries > 1)
         emit_zero_queries(cmd_buffer, pool, query + 1, num_queries - 1);
   }
}

#define TIMESTAMP 0x2358

void genX(CmdWriteTimestamp)(
    VkCommandBuffer commandBuffer,
    VkPipelineStageFlagBits pipelineStage,
    VkQueryPool queryPool,
    uint32_t query)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   struct anv_address query_addr = anv_query_address(pool, query);

   assert(pool->type == VK_QUERY_TYPE_TIMESTAMP);

   switch (pipelineStage) {
   case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
      emit_srm64(&cmd_buffer->batch, anv_address_add(query_addr, 8), TIMESTAMP);
      break;

   default:
      /* Everything else is bottom-of-pipe */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.DestinationAddressType = DAT_PPGTT;
         pc.PostSyncOperation = WriteTimestamp;
         pc.Address = anv_address_add(query_addr, 8);

         if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
            pc.CommandStreamerStallEnable = true;
      }
      break;
   }

   emit_query_availability(cmd_buffer, query_addr);

   /* When multiview is active the spec requires that N consecutive query
    * indices are used, where N is the number of active views in the subpass.
    * The spec allows that we only write the results to one of the queries
    * but we still need to manage result availability for all the query indices.
    * Since we only emit a single query for all active views in the
    * first index, mark the other query indices as being already available
    * with result 0.
    */
   if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
      const uint32_t num_queries =
         util_bitcount(cmd_buffer->state.subpass->view_mask);
      if (num_queries > 1)
         emit_zero_queries(cmd_buffer, pool, query + 1, num_queries - 1);
   }
}

#if GEN_GEN > 7 || GEN_IS_HASWELL

static uint32_t
mi_alu(uint32_t opcode, uint32_t operand1, uint32_t operand2)
{
   struct GENX(MI_MATH_ALU_INSTRUCTION) instr = {
      .ALUOpcode = opcode,
      .Operand1 = operand1,
      .Operand2 = operand2,
   };

   uint32_t dw;
   GENX(MI_MATH_ALU_INSTRUCTION_pack)(NULL, &dw, &instr);

   return dw;
}

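/* The command streamer's general-purpose registers used as MI_MATH operands.
 * GPR n is 64 bits wide; its low dword sits at register offset 0x2600 + n * 8
 * and its high dword 4 bytes above that (hence the CS_GPR(n) + 4 accesses
 * below).
 */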
#define CS_GPR(n) (0x2600 + (n) * 8)

static void
emit_load_alu_reg_u64(struct anv_batch *batch, uint32_t reg,
                      struct anv_address addr)
{
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
      lrm.RegisterAddress = reg;
      lrm.MemoryAddress = anv_address_add(addr, 0);
   }
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
      lrm.RegisterAddress = reg + 4;
      lrm.MemoryAddress = anv_address_add(addr, 4);
   }
}

static void
emit_load_alu_reg_imm32(struct anv_batch *batch, uint32_t reg, uint32_t imm)
{
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
      lri.RegisterOffset = reg;
      lri.DataDWord = imm;
   }
}

static void
emit_load_alu_reg_imm64(struct anv_batch *batch, uint32_t reg, uint64_t imm)
{
   emit_load_alu_reg_imm32(batch, reg, (uint32_t)imm);
   emit_load_alu_reg_imm32(batch, reg + 4, (uint32_t)(imm >> 32));
}

static void
emit_load_alu_reg_reg32(struct anv_batch *batch, uint32_t src, uint32_t dst)
{
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_REG), lrr) {
      lrr.SourceRegisterAddress = src;
      lrr.DestinationRegisterAddress = dst;
   }
}

/*
 * GPR0 = GPR0 & ((1ull << n) - 1);
 */
static void
keep_gpr0_lower_n_bits(struct anv_batch *batch, uint32_t n)
{
   assert(n < 64);
   emit_load_alu_reg_imm64(batch, CS_GPR(1), (1ull << n) - 1);

   uint32_t *dw = anv_batch_emitn(batch, 5, GENX(MI_MATH));
   if (!dw) {
      anv_batch_set_error(batch, VK_ERROR_OUT_OF_HOST_MEMORY);
      return;
   }

   dw[1] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCA, MI_ALU_REG0);
   dw[2] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCB, MI_ALU_REG1);
   dw[3] = mi_alu(MI_ALU_AND, 0, 0);
   dw[4] = mi_alu(MI_ALU_STORE, MI_ALU_REG0, MI_ALU_ACCU);
}

/*
 * GPR0 = GPR0 << 30;
 */
static void
shl_gpr0_by_30_bits(struct anv_batch *batch)
{
   /* First we mask 34 bits of GPR0 to prevent overflow */
   keep_gpr0_lower_n_bits(batch, 34);

   const uint32_t outer_count = 5;
   const uint32_t inner_count = 6;
   STATIC_ASSERT(outer_count * inner_count == 30);
   const uint32_t cmd_len = 1 + inner_count * 4;

   /* We'll emit 5 commands, each shifting GPR0 left by 6 bits, for a total of
    * 30 left shifts.
    */
   for (int o = 0; o < outer_count; o++) {
      /* Submit one MI_MATH to shift left by 6 bits */
      uint32_t *dw = anv_batch_emitn(batch, cmd_len, GENX(MI_MATH));
      if (!dw) {
         anv_batch_set_error(batch, VK_ERROR_OUT_OF_HOST_MEMORY);
         return;
      }

      dw++;
      for (int i = 0; i < inner_count; i++, dw += 4) {
         dw[0] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCA, MI_ALU_REG0);
         dw[1] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCB, MI_ALU_REG0);
         dw[2] = mi_alu(MI_ALU_ADD, 0, 0);
         dw[3] = mi_alu(MI_ALU_STORE, MI_ALU_REG0, MI_ALU_ACCU);
      }
   }
}
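
/* Each LOAD/LOAD/ADD/STORE group above computes GPR0 = GPR0 + GPR0, i.e. it
 * doubles GPR0, which is a left shift by one bit. Six doublings per MI_MATH
 * times five MI_MATH commands yields the shift by 30.
 */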

/*
 * GPR0 = GPR0 >> 2;
 *
 * Note that the upper 30 bits of GPR are lost!
 */
static void
shr_gpr0_by_2_bits(struct anv_batch *batch)
{
   shl_gpr0_by_30_bits(batch);
   emit_load_alu_reg_reg32(batch, CS_GPR(0) + 4, CS_GPR(0));
   emit_load_alu_reg_imm32(batch, CS_GPR(0) + 4, 0);
}
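
/* Why this works: after the left shift by 30, bits [2..33] of the original
 * value end up in the high dword of GPR0, so high_dword == original >> 2 for
 * any value that fits in 34 bits. Copying that dword into the low dword and
 * zeroing the high dword therefore divides GPR0 by 4, which is what
 * WaDividePSInvocationCountBy4 needs in CmdCopyQueryPoolResults below.
 */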

static void
gpu_write_query_result(struct anv_batch *batch,
                       struct anv_address dst_addr,
                       VkQueryResultFlags flags,
                       uint32_t value_index, uint32_t reg)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      emit_srm64(batch, anv_address_add(dst_addr, value_index * 8), reg);
   } else {
      emit_srm32(batch, anv_address_add(dst_addr, value_index * 4), reg);
   }
}

static void
compute_query_result(struct anv_batch *batch, uint32_t dst_reg,
                     struct anv_address addr)
{
   emit_load_alu_reg_u64(batch, CS_GPR(0), anv_address_add(addr, 0));
   emit_load_alu_reg_u64(batch, CS_GPR(1), anv_address_add(addr, 8));

   /* FIXME: We need to clamp the result for 32 bit. */

   uint32_t *dw = anv_batch_emitn(batch, 5, GENX(MI_MATH));
   if (!dw) {
      anv_batch_set_error(batch, VK_ERROR_OUT_OF_HOST_MEMORY);
      return;
   }

   dw[1] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCA, MI_ALU_REG1);
   dw[2] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCB, MI_ALU_REG0);
   dw[3] = mi_alu(MI_ALU_SUB, 0, 0);
   dw[4] = mi_alu(MI_ALU_STORE, dst_reg, MI_ALU_ACCU);
}
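
/* compute_query_result() loads the begin value at addr into GPR0 and the end
 * value at addr + 8 into GPR1, then stores ACCU = GPR1 - GPR0 into dst_reg:
 * the same end - begin subtraction the CPU path does in GetQueryPoolResults.
 */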

void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer commandBuffer,
    VkQueryPool queryPool,
    uint32_t firstQuery,
    uint32_t queryCount,
    VkBuffer destBuffer,
    VkDeviceSize destOffset,
    VkDeviceSize destStride,
    VkQueryResultFlags flags)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);

   /* If render target writes are ongoing, request a render target cache flush
    * to ensure proper ordering of the commands from the 3d pipe and the
    * command streamer.
    */
   if (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_RENDER_TARGET_BUFFER_WRITES) {
      cmd_buffer->state.pending_pipe_bits |=
         ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
   }

   if ((flags & VK_QUERY_RESULT_WAIT_BIT) ||
       (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_FLUSH_BITS)) {
      cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT;
      genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
   }

   struct anv_address dest_addr = anv_address_add(buffer->address, destOffset);
   for (uint32_t i = 0; i < queryCount; i++) {
      struct anv_address query_addr = anv_query_address(pool, firstQuery + i);
      uint32_t idx = 0;
      switch (pool->type) {
      case VK_QUERY_TYPE_OCCLUSION:
         compute_query_result(&cmd_buffer->batch, MI_ALU_REG2,
                              anv_address_add(query_addr, 8));
         gpu_write_query_result(&cmd_buffer->batch, dest_addr,
                                flags, idx++, CS_GPR(2));
         break;

      case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
         uint32_t statistics = pool->pipeline_statistics;
         while (statistics) {
            uint32_t stat = u_bit_scan(&statistics);

            compute_query_result(&cmd_buffer->batch, MI_ALU_REG0,
                                 anv_address_add(query_addr, idx * 16 + 8));

            /* WaDividePSInvocationCountBy4:HSW,BDW */
            if ((cmd_buffer->device->info.gen == 8 ||
                 cmd_buffer->device->info.is_haswell) &&
                (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) {
               shr_gpr0_by_2_bits(&cmd_buffer->batch);
            }

            gpu_write_query_result(&cmd_buffer->batch, dest_addr,
                                   flags, idx++, CS_GPR(0));
         }
         assert(idx == util_bitcount(pool->pipeline_statistics));
         break;
      }

      case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
         compute_query_result(&cmd_buffer->batch, MI_ALU_REG2,
                              anv_address_add(query_addr, 8));
         gpu_write_query_result(&cmd_buffer->batch, dest_addr,
                                flags, idx++, CS_GPR(2));
         compute_query_result(&cmd_buffer->batch, MI_ALU_REG2,
                              anv_address_add(query_addr, 24));
         gpu_write_query_result(&cmd_buffer->batch, dest_addr,
                                flags, idx++, CS_GPR(2));
         break;

      case VK_QUERY_TYPE_TIMESTAMP:
         emit_load_alu_reg_u64(&cmd_buffer->batch,
                               CS_GPR(2), anv_address_add(query_addr, 8));
         gpu_write_query_result(&cmd_buffer->batch, dest_addr,
                                flags, 0, CS_GPR(2));
         break;

      default:
         unreachable("unhandled query type");
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         emit_load_alu_reg_u64(&cmd_buffer->batch, CS_GPR(0), query_addr);
         gpu_write_query_result(&cmd_buffer->batch, dest_addr,
                                flags, idx, CS_GPR(0));
      }

      dest_addr = anv_address_add(dest_addr, destStride);
   }
}

#else
void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer commandBuffer,
    VkQueryPool queryPool,
    uint32_t firstQuery,
    uint32_t queryCount,
    VkBuffer destBuffer,
    VkDeviceSize destOffset,
    VkDeviceSize destStride,
    VkQueryResultFlags flags)
{
   anv_finishme("Queries not yet supported on Ivy Bridge");
}
#endif