anv/query: handle out of host memory without crashing in compute_query_result()
src/intel/vulkan/genX_query.c
/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <assert.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>

#include "anv_private.h"

#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"

VkResult genX(CreateQueryPool)(
    VkDevice                                    _device,
    const VkQueryPoolCreateInfo*                pCreateInfo,
    const VkAllocationCallbacks*                pAllocator,
    VkQueryPool*                                pQueryPool)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   struct anv_query_pool *pool;
   VkResult result;

   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);

   /* Query pool slots are made up of some number of 64-bit values packed
    * tightly together. The first 64-bit value is always the "available" bit
    * which is 0 when the query is unavailable and 1 when it is available.
    * The 64-bit values that follow are determined by the type of query.
    */
   uint32_t uint64s_per_slot = 1;

   VkQueryPipelineStatisticFlags pipeline_statistics = 0;
   switch (pCreateInfo->queryType) {
   case VK_QUERY_TYPE_OCCLUSION:
      /* Occlusion queries have two values: begin and end. */
      uint64s_per_slot += 2;
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
      /* Timestamps just have the one timestamp value */
      uint64s_per_slot += 1;
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      pipeline_statistics = pCreateInfo->pipelineStatistics;
      /* We're going to trust this field implicitly so we need to ensure that
       * no unhandled extension bits leak in.
       */
      pipeline_statistics &= ANV_PIPELINE_STATISTICS_MASK;

      /* Statistics queries have a begin and end value for every statistic */
      uint64s_per_slot += 2 * _mesa_bitcount(pipeline_statistics);
      break;
   default:
      assert(!"Invalid query type");
   }

   pool = vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
                    VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (pool == NULL)
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

   pool->type = pCreateInfo->queryType;
   pool->pipeline_statistics = pipeline_statistics;
   pool->stride = uint64s_per_slot * sizeof(uint64_t);
   pool->slots = pCreateInfo->queryCount;

   uint64_t size = pool->slots * pool->stride;
   result = anv_bo_init_new(&pool->bo, device, size);
   if (result != VK_SUCCESS)
      goto fail;

   pool->bo.map = anv_gem_mmap(device, pool->bo.gem_handle, 0, size, 0);

   *pQueryPool = anv_query_pool_to_handle(pool);

   return VK_SUCCESS;

 fail:
   vk_free2(&device->alloc, pAllocator, pool);

   return result;
}

void genX(DestroyQueryPool)(
    VkDevice                                    _device,
    VkQueryPool                                 _pool,
    const VkAllocationCallbacks*                pAllocator)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_query_pool, pool, _pool);

   if (!pool)
      return;

   anv_gem_munmap(pool->bo.map, pool->bo.size);
   anv_gem_close(device, pool->bo.gem_handle);
   vk_free2(&device->alloc, pAllocator, pool);
}

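/* Writes a single query result value into the client-provided buffer,
 * honoring VK_QUERY_RESULT_64_BIT: results are stored as consecutive
 * 64-bit or 32-bit values starting at dst_slot.
 */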
static void
cpu_write_query_result(void *dst_slot, VkQueryResultFlags flags,
                       uint32_t value_index, uint64_t result)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      uint64_t *dst64 = dst_slot;
      dst64[value_index] = result;
   } else {
      uint32_t *dst32 = dst_slot;
      dst32[value_index] = result;
   }
}

VkResult genX(GetQueryPoolResults)(
    VkDevice                                    _device,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    size_t                                      dataSize,
    void*                                       pData,
    VkDeviceSize                                stride,
    VkQueryResultFlags                          flags)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   int64_t timeout = INT64_MAX;
   int ret;

   assert(pool->type == VK_QUERY_TYPE_OCCLUSION ||
          pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
          pool->type == VK_QUERY_TYPE_TIMESTAMP);

   if (unlikely(device->lost))
      return VK_ERROR_DEVICE_LOST;

   if (pData == NULL)
      return VK_SUCCESS;

   if (flags & VK_QUERY_RESULT_WAIT_BIT) {
      ret = anv_gem_wait(device, pool->bo.gem_handle, &timeout);
      if (ret == -1) {
         /* We don't know the real error. */
         return vk_errorf(VK_ERROR_OUT_OF_DEVICE_MEMORY,
                          "gem_wait failed %m");
      }
   }

   void *data_end = pData + dataSize;

   if (!device->info.has_llc) {
      uint64_t offset = firstQuery * pool->stride;
      uint64_t size = queryCount * pool->stride;
      anv_invalidate_range(pool->bo.map + offset,
                           MIN2(size, pool->bo.size - offset));
   }

   VkResult status = VK_SUCCESS;
   for (uint32_t i = 0; i < queryCount; i++) {
      uint64_t *slot = pool->bo.map + (firstQuery + i) * pool->stride;

      /* Availability is always at the start of the slot */
      bool available = slot[0];

      /* From the Vulkan 1.0.42 spec:
       *
       *    "If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
       *    both not set then no result values are written to pData for
       *    queries that are in the unavailable state at the time of the call,
       *    and vkGetQueryPoolResults returns VK_NOT_READY. However,
       *    availability state is still written to pData for those queries if
       *    VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set."
       */
      bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT);

      if (write_results) {
         switch (pool->type) {
         case VK_QUERY_TYPE_OCCLUSION: {
            cpu_write_query_result(pData, flags, 0, slot[2] - slot[1]);
            break;
         }

         case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
            uint32_t statistics = pool->pipeline_statistics;
            uint32_t idx = 0;
            while (statistics) {
               uint32_t stat = u_bit_scan(&statistics);
               uint64_t result = slot[idx * 2 + 2] - slot[idx * 2 + 1];

               /* WaDividePSInvocationCountBy4:HSW,BDW */
               if ((device->info.gen == 8 || device->info.is_haswell) &&
                   (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT)
                  result >>= 2;

               cpu_write_query_result(pData, flags, idx, result);

               idx++;
            }
            assert(idx == _mesa_bitcount(pool->pipeline_statistics));
            break;
         }

         case VK_QUERY_TYPE_TIMESTAMP: {
            cpu_write_query_result(pData, flags, 0, slot[1]);
            break;
         }
         default:
            unreachable("invalid pool type");
         }
      } else {
         status = VK_NOT_READY;
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         uint32_t idx = (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) ?
                        _mesa_bitcount(pool->pipeline_statistics) : 1;
         cpu_write_query_result(pData, flags, idx, available);
      }

      pData += stride;
      if (pData >= data_end)
         break;
   }

   return status;
}

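/* Snapshots the PS depth count (the occlusion counter) into the given BO
 * offset via a PIPE_CONTROL with a WritePSDepthCount post-sync operation and
 * a depth stall.
 */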
static void
emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer,
                    struct anv_bo *bo, uint32_t offset)
{
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType = DAT_PPGTT;
      pc.PostSyncOperation = WritePSDepthCount;
      pc.DepthStallEnable = true;
      pc.Address = (struct anv_address) { bo, offset };

      if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
         pc.CommandStreamerStallEnable = true;
   }
}

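/* Marks a query slot as available by using a PIPE_CONTROL post-sync write to
 * store the immediate value 1 into the slot's availability word.
 */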
static void
emit_query_availability(struct anv_cmd_buffer *cmd_buffer,
                        struct anv_bo *bo, uint32_t offset)
{
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType = DAT_PPGTT;
      pc.PostSyncOperation = WriteImmediateData;
      pc.Address = (struct anv_address) { bo, offset };
      pc.ImmediateData = 1;
   }
}

void genX(CmdResetQueryPool)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   for (uint32_t i = 0; i < queryCount; i++) {
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdm) {
         sdm.Address = (struct anv_address) {
            .bo = &pool->bo,
            .offset = (firstQuery + i) * pool->stride,
         };
         sdm.ImmediateData = 0;
      }
   }
}

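/* Maps each VkQueryPipelineStatisticFlagBits bit (in bit order) to the MMIO
 * counter register that backs it. The STATIC_ASSERT in emit_pipeline_stat()
 * below checks that this table covers exactly ANV_PIPELINE_STATISTICS_MASK.
 */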
static const uint32_t vk_pipeline_stat_to_reg[] = {
   GENX(IA_VERTICES_COUNT_num),
   GENX(IA_PRIMITIVES_COUNT_num),
   GENX(VS_INVOCATION_COUNT_num),
   GENX(GS_INVOCATION_COUNT_num),
   GENX(GS_PRIMITIVES_COUNT_num),
   GENX(CL_INVOCATION_COUNT_num),
   GENX(CL_PRIMITIVES_COUNT_num),
   GENX(PS_INVOCATION_COUNT_num),
   GENX(HS_INVOCATION_COUNT_num),
   GENX(DS_INVOCATION_COUNT_num),
   GENX(CS_INVOCATION_COUNT_num),
};

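/* Reads the 64-bit counter register for one pipeline statistic and stores it
 * to bo/offset, using two MI_STORE_REGISTER_MEM commands since each one only
 * moves a single 32-bit dword.
 */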
static void
emit_pipeline_stat(struct anv_cmd_buffer *cmd_buffer, uint32_t stat,
                   struct anv_bo *bo, uint32_t offset)
{
   STATIC_ASSERT(ANV_PIPELINE_STATISTICS_MASK ==
                 (1 << ARRAY_SIZE(vk_pipeline_stat_to_reg)) - 1);

   assert(stat < ARRAY_SIZE(vk_pipeline_stat_to_reg));
   uint32_t reg = vk_pipeline_stat_to_reg[stat];

   anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), lrm) {
      lrm.RegisterAddress = reg;
      lrm.MemoryAddress = (struct anv_address) { bo, offset };
   }
   anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), lrm) {
      lrm.RegisterAddress = reg + 4;
      lrm.MemoryAddress = (struct anv_address) { bo, offset + 4 };
   }
}

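/* vkCmdBeginQuery/vkCmdEndQuery snapshot the relevant counters into the
 * query slot: "begin" values start at slot offset 8 and "end" values at
 * offset 16 (interleaved per statistic for pipeline statistics queries).
 * The final result is computed as end - begin, either on the CPU in
 * GetQueryPoolResults() or on the GPU in compute_query_result().
 */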
void genX(CmdBeginQuery)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query,
    VkQueryControlFlags                         flags)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   /* Workaround: When meta uses the pipeline with the VS disabled, it seems
    * that the pipelining of the depth write breaks. What we see is that
    * samples from the render pass clear leak into the first query
    * immediately after the clear. Doing a PIPE_CONTROL with a post-sync
    * operation and DepthStallEnable seems to work around the issue.
    */
   if (cmd_buffer->state.need_query_wa) {
      cmd_buffer->state.need_query_wa = false;
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.DepthCacheFlushEnable = true;
         pc.DepthStallEnable = true;
      }
   }

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_ps_depth_count(cmd_buffer, &pool->bo, query * pool->stride + 8);
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      /* TODO: This might only be necessary for certain stats */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      uint32_t statistics = pool->pipeline_statistics;
      uint32_t offset = query * pool->stride + 8;
      while (statistics) {
         uint32_t stat = u_bit_scan(&statistics);
         emit_pipeline_stat(cmd_buffer, stat, &pool->bo, offset);
         offset += 16;
      }
      break;
   }

   default:
      unreachable("");
   }
}

void genX(CmdEndQuery)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_ps_depth_count(cmd_buffer, &pool->bo, query * pool->stride + 16);
      emit_query_availability(cmd_buffer, &pool->bo, query * pool->stride);
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      /* TODO: This might only be necessary for certain stats */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      uint32_t statistics = pool->pipeline_statistics;
      uint32_t offset = query * pool->stride + 16;
      while (statistics) {
         uint32_t stat = u_bit_scan(&statistics);
         emit_pipeline_stat(cmd_buffer, stat, &pool->bo, offset);
         offset += 16;
      }

      emit_query_availability(cmd_buffer, &pool->bo, query * pool->stride);
      break;
   }

   default:
      unreachable("");
   }
}

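/* MMIO address of the render command streamer's TIMESTAMP register, read
 * directly with MI_STORE_REGISTER_MEM for top-of-pipe timestamps below.
 */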
#define TIMESTAMP 0x2358

void genX(CmdWriteTimestamp)(
    VkCommandBuffer                             commandBuffer,
    VkPipelineStageFlagBits                     pipelineStage,
    VkQueryPool                                 queryPool,
    uint32_t                                    query)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   uint32_t offset = query * pool->stride;

   assert(pool->type == VK_QUERY_TYPE_TIMESTAMP);

   switch (pipelineStage) {
   case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
         srm.RegisterAddress = TIMESTAMP;
         srm.MemoryAddress = (struct anv_address) { &pool->bo, offset + 8 };
      }
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
         srm.RegisterAddress = TIMESTAMP + 4;
         srm.MemoryAddress = (struct anv_address) { &pool->bo, offset + 12 };
      }
      break;

   default:
      /* Everything else is bottom-of-pipe */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.DestinationAddressType = DAT_PPGTT;
         pc.PostSyncOperation = WriteTimestamp;
         pc.Address = (struct anv_address) { &pool->bo, offset + 8 };

         if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
            pc.CommandStreamerStallEnable = true;
      }
      break;
   }

   emit_query_availability(cmd_buffer, &pool->bo, offset);
}

#if GEN_GEN > 7 || GEN_IS_HASWELL

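/* Helpers for building MI_MATH ALU instructions: each instruction dword
 * packs an opcode in bits 31:20 and two operand encodings in bits 19:10 and
 * 9:0. The OPCODE_* and OPERAND_* values below follow the command streamer
 * ALU encoding; CS_GPR(n) is the MMIO address of general-purpose register n.
 */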
#define alu_opcode(v)   __gen_uint((v), 20, 31)
#define alu_operand1(v) __gen_uint((v), 10, 19)
#define alu_operand2(v) __gen_uint((v),  0,  9)
#define alu(opcode, operand1, operand2) \
   alu_opcode(opcode) | alu_operand1(operand1) | alu_operand2(operand2)

#define OPCODE_NOOP     0x000
#define OPCODE_LOAD     0x080
#define OPCODE_LOADINV  0x480
#define OPCODE_LOAD0    0x081
#define OPCODE_LOAD1    0x481
#define OPCODE_ADD      0x100
#define OPCODE_SUB      0x101
#define OPCODE_AND      0x102
#define OPCODE_OR       0x103
#define OPCODE_XOR      0x104
#define OPCODE_STORE    0x180
#define OPCODE_STOREINV 0x580

#define OPERAND_R0   0x00
#define OPERAND_R1   0x01
#define OPERAND_R2   0x02
#define OPERAND_R3   0x03
#define OPERAND_R4   0x04
#define OPERAND_SRCA 0x20
#define OPERAND_SRCB 0x21
#define OPERAND_ACCU 0x31
#define OPERAND_ZF   0x32
#define OPERAND_CF   0x33

#define CS_GPR(n) (0x2600 + (n) * 8)

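/* Loads a 64-bit value from bo/offset into a GPR pair with two
 * MI_LOAD_REGISTER_MEM commands (low dword at reg, high dword at reg + 4).
 */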
static void
emit_load_alu_reg_u64(struct anv_batch *batch, uint32_t reg,
                      struct anv_bo *bo, uint32_t offset)
{
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
      lrm.RegisterAddress = reg;
      lrm.MemoryAddress = (struct anv_address) { bo, offset };
   }
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
      lrm.RegisterAddress = reg + 4;
      lrm.MemoryAddress = (struct anv_address) { bo, offset + 4 };
   }
}

static void
emit_load_alu_reg_imm32(struct anv_batch *batch, uint32_t reg, uint32_t imm)
{
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
      lri.RegisterOffset = reg;
      lri.DataDWord = imm;
   }
}

static void
emit_load_alu_reg_imm64(struct anv_batch *batch, uint32_t reg, uint64_t imm)
{
   emit_load_alu_reg_imm32(batch, reg, (uint32_t)imm);
   emit_load_alu_reg_imm32(batch, reg + 4, (uint32_t)(imm >> 32));
}

static void
emit_load_alu_reg_reg32(struct anv_batch *batch, uint32_t src, uint32_t dst)
{
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_REG), lrr) {
      lrr.SourceRegisterAddress = src;
      lrr.DestinationRegisterAddress = dst;
   }
}

/*
 * GPR0 = GPR0 & ((1ull << n) - 1);
 */
static void
keep_gpr0_lower_n_bits(struct anv_batch *batch, uint32_t n)
{
   assert(n < 64);
   emit_load_alu_reg_imm64(batch, CS_GPR(1), (1ull << n) - 1);

   uint32_t *dw = anv_batch_emitn(batch, 5, GENX(MI_MATH));
   dw[1] = alu(OPCODE_LOAD, OPERAND_SRCA, OPERAND_R0);
   dw[2] = alu(OPCODE_LOAD, OPERAND_SRCB, OPERAND_R1);
   dw[3] = alu(OPCODE_AND, 0, 0);
   dw[4] = alu(OPCODE_STORE, OPERAND_R0, OPERAND_ACCU);
}

/*
 * GPR0 = GPR0 << 30;
 */
static void
shl_gpr0_by_30_bits(struct anv_batch *batch)
{
   /* First we mask GPR0 down to its lower 34 bits to prevent overflow */
   keep_gpr0_lower_n_bits(batch, 34);

   const uint32_t outer_count = 5;
   const uint32_t inner_count = 6;
   STATIC_ASSERT(outer_count * inner_count == 30);
   const uint32_t cmd_len = 1 + inner_count * 4;

   /* We'll emit 5 commands, each shifting GPR0 left by 6 bits, for a total of
    * 30 left shifts.
    */
   for (int o = 0; o < outer_count; o++) {
      /* Submit one MI_MATH to shift left by 6 bits */
      uint32_t *dw = anv_batch_emitn(batch, cmd_len, GENX(MI_MATH));
      dw++;
      for (int i = 0; i < inner_count; i++, dw += 4) {
         dw[0] = alu(OPCODE_LOAD, OPERAND_SRCA, OPERAND_R0);
         dw[1] = alu(OPCODE_LOAD, OPERAND_SRCB, OPERAND_R0);
         dw[2] = alu(OPCODE_ADD, 0, 0);
         dw[3] = alu(OPCODE_STORE, OPERAND_R0, OPERAND_ACCU);
      }
   }
}

/*
 * GPR0 = GPR0 >> 2;
 *
 * The command streamer ALU used here has no shift-right operation, so we
 * shift left by 30 (landing the wanted bits in the upper dword), then copy
 * the upper dword of GPR0 into its lower dword and zero the upper dword.
 *
 * Note that the upper 30 bits of GPR0 are lost!
 */
static void
shr_gpr0_by_2_bits(struct anv_batch *batch)
{
   shl_gpr0_by_30_bits(batch);
   emit_load_alu_reg_reg32(batch, CS_GPR(0) + 4, CS_GPR(0));
   emit_load_alu_reg_imm32(batch, CS_GPR(0) + 4, 0);
}

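/* Stores the query result held in a command-streamer register out to the
 * destination buffer with MI_STORE_REGISTER_MEM, writing one dword for
 * 32-bit results and both dwords of the GPR for 64-bit results.
 */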
static void
gpu_write_query_result(struct anv_batch *batch,
                       struct anv_buffer *dst_buffer, uint32_t dst_offset,
                       VkQueryResultFlags flags,
                       uint32_t value_index, uint32_t reg)
{
   if (flags & VK_QUERY_RESULT_64_BIT)
      dst_offset += value_index * 8;
   else
      dst_offset += value_index * 4;

   anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
      srm.RegisterAddress = reg;
      srm.MemoryAddress = (struct anv_address) {
         .bo = dst_buffer->bo,
         .offset = dst_buffer->offset + dst_offset,
      };
   }

   if (flags & VK_QUERY_RESULT_64_BIT) {
      anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
         srm.RegisterAddress = reg + 4;
         srm.MemoryAddress = (struct anv_address) {
            .bo = dst_buffer->bo,
            .offset = dst_buffer->offset + dst_offset + 4,
         };
      }
   }
}

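/* Computes end - begin for the 64-bit snapshot pair at bo/offset using
 * MI_MATH and leaves the result in dst_reg. If the batch has run out of
 * space (i.e. out of host memory), anv_batch_emitn() returns NULL; in that
 * case we flag the error on the batch and bail instead of crashing.
 */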
static void
compute_query_result(struct anv_batch *batch, uint32_t dst_reg,
                     struct anv_bo *bo, uint32_t offset)
{
   emit_load_alu_reg_u64(batch, CS_GPR(0), bo, offset);
   emit_load_alu_reg_u64(batch, CS_GPR(1), bo, offset + 8);

   /* FIXME: We need to clamp the result for 32 bit. */

   uint32_t *dw = anv_batch_emitn(batch, 5, GENX(MI_MATH));
   if (!dw) {
      anv_batch_set_error(batch, VK_ERROR_OUT_OF_HOST_MEMORY);
      return;
   }

   dw[1] = alu(OPCODE_LOAD, OPERAND_SRCA, OPERAND_R1);
   dw[2] = alu(OPCODE_LOAD, OPERAND_SRCB, OPERAND_R0);
   dw[3] = alu(OPCODE_SUB, 0, 0);
   dw[4] = alu(OPCODE_STORE, dst_reg, OPERAND_ACCU);
}

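/* GPU-side equivalent of GetQueryPoolResults(): for each query, the result
 * is computed (or, for timestamps, simply loaded) into a GPR and then
 * streamed into the destination buffer, followed by an optional availability
 * word.
 */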
void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    VkBuffer                                    destBuffer,
    VkDeviceSize                                destOffset,
    VkDeviceSize                                destStride,
    VkQueryResultFlags                          flags)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);
   uint32_t slot_offset;

   if (flags & VK_QUERY_RESULT_WAIT_BIT) {
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
   }

   for (uint32_t i = 0; i < queryCount; i++) {
      slot_offset = (firstQuery + i) * pool->stride;
      switch (pool->type) {
      case VK_QUERY_TYPE_OCCLUSION:
         compute_query_result(&cmd_buffer->batch, OPERAND_R2,
                              &pool->bo, slot_offset + 8);
         gpu_write_query_result(&cmd_buffer->batch, buffer, destOffset,
                                flags, 0, CS_GPR(2));
         break;

      case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
         uint32_t statistics = pool->pipeline_statistics;
         uint32_t idx = 0;
         while (statistics) {
            uint32_t stat = u_bit_scan(&statistics);

            compute_query_result(&cmd_buffer->batch, OPERAND_R0,
                                 &pool->bo, slot_offset + idx * 16 + 8);

            /* WaDividePSInvocationCountBy4:HSW,BDW */
            if ((cmd_buffer->device->info.gen == 8 ||
                 cmd_buffer->device->info.is_haswell) &&
                (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) {
               shr_gpr0_by_2_bits(&cmd_buffer->batch);
            }

            gpu_write_query_result(&cmd_buffer->batch, buffer, destOffset,
                                   flags, idx, CS_GPR(0));

            idx++;
         }
         assert(idx == _mesa_bitcount(pool->pipeline_statistics));
         break;
      }

      case VK_QUERY_TYPE_TIMESTAMP:
         emit_load_alu_reg_u64(&cmd_buffer->batch,
                               CS_GPR(2), &pool->bo, slot_offset + 8);
         gpu_write_query_result(&cmd_buffer->batch, buffer, destOffset,
                                flags, 0, CS_GPR(2));
         break;

      default:
         unreachable("unhandled query type");
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         uint32_t idx = (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) ?
                        _mesa_bitcount(pool->pipeline_statistics) : 1;

         emit_load_alu_reg_u64(&cmd_buffer->batch, CS_GPR(0),
                               &pool->bo, slot_offset);
         gpu_write_query_result(&cmd_buffer->batch, buffer, destOffset,
                                flags, idx, CS_GPR(0));
      }

      destOffset += destStride;
   }
}

#else
void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    VkBuffer                                    destBuffer,
    VkDeviceSize                                destOffset,
    VkDeviceSize                                destStride,
    VkQueryResultFlags                          flags)
{
   anv_finishme("Queries not yet supported on Ivy Bridge");
}
#endif