src/intel/vulkan/genX_query.c
/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <assert.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>

#include "anv_private.h"

#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"
VkResult genX(CreateQueryPool)(
    VkDevice                                    _device,
    const VkQueryPoolCreateInfo*                pCreateInfo,
    const VkAllocationCallbacks*                pAllocator,
    VkQueryPool*                                pQueryPool)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   struct anv_query_pool *pool;
   VkResult result;

   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);

   /* Query pool slots are made up of some number of 64-bit values packed
    * tightly together. The first 64-bit value is always the "available" bit
    * which is 0 when the query is unavailable and 1 when it is available.
    * The 64-bit values that follow are determined by the type of query.
    */
   uint32_t uint64s_per_slot = 1;

   VkQueryPipelineStatisticFlags pipeline_statistics = 0;
   switch (pCreateInfo->queryType) {
   case VK_QUERY_TYPE_OCCLUSION:
      /* Occlusion queries have two values: begin and end. */
      uint64s_per_slot += 2;
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
      /* Timestamps just have the one timestamp value */
      uint64s_per_slot += 1;
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      pipeline_statistics = pCreateInfo->pipelineStatistics;
      /* We're going to trust this field implicitly so we need to ensure that
       * no unhandled extension bits leak in.
       */
      pipeline_statistics &= ANV_PIPELINE_STATISTICS_MASK;

      /* Statistics queries have a begin and an end value for every statistic */
      uint64s_per_slot += 2 * _mesa_bitcount(pipeline_statistics);
      break;
   default:
      assert(!"Invalid query type");
   }
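
   /* Resulting slot layouts, in 64-bit words:
    *
    *   OCCLUSION:            { available, begin, end }
    *   TIMESTAMP:            { available, timestamp }
    *   PIPELINE_STATISTICS:  { available, begin[0], end[0], ...,
    *                           begin[N-1], end[N-1] }
    *                         where N is the number of enabled statistic bits.
    */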

   pool = vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
                    VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (pool == NULL)
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

   pool->type = pCreateInfo->queryType;
   pool->pipeline_statistics = pipeline_statistics;
   pool->stride = uint64s_per_slot * sizeof(uint64_t);
   pool->slots = pCreateInfo->queryCount;

   uint64_t size = pool->slots * pool->stride;
   result = anv_bo_init_new(&pool->bo, device, size);
   if (result != VK_SUCCESS)
      goto fail;

   pool->bo.map = anv_gem_mmap(device, pool->bo.gem_handle, 0, size, 0);

   *pQueryPool = anv_query_pool_to_handle(pool);

   return VK_SUCCESS;

 fail:
   vk_free2(&device->alloc, pAllocator, pool);

   return result;
}

void genX(DestroyQueryPool)(
    VkDevice                                    _device,
    VkQueryPool                                 _pool,
    const VkAllocationCallbacks*                pAllocator)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_query_pool, pool, _pool);

   if (!pool)
      return;

   anv_gem_munmap(pool->bo.map, pool->bo.size);
   anv_gem_close(device, pool->bo.gem_handle);
   vk_free2(&device->alloc, pAllocator, pool);
}
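
/* Write one result value into the data that GetQueryPoolResults hands back to
 * the application, honoring VK_QUERY_RESULT_64_BIT to pick a 32-bit or 64-bit
 * destination element.
 */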
static void
cpu_write_query_result(void *dst_slot, VkQueryResultFlags flags,
                       uint32_t value_index, uint64_t result)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      uint64_t *dst64 = dst_slot;
      dst64[value_index] = result;
   } else {
      uint32_t *dst32 = dst_slot;
      dst32[value_index] = result;
   }
}

VkResult genX(GetQueryPoolResults)(
    VkDevice                                    _device,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    size_t                                      dataSize,
    void*                                       pData,
    VkDeviceSize                                stride,
    VkQueryResultFlags                          flags)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   assert(pool->type == VK_QUERY_TYPE_OCCLUSION ||
          pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
          pool->type == VK_QUERY_TYPE_TIMESTAMP);

   if (unlikely(device->lost))
      return VK_ERROR_DEVICE_LOST;

   if (pData == NULL)
      return VK_SUCCESS;

   if (flags & VK_QUERY_RESULT_WAIT_BIT) {
      VkResult result = anv_device_wait(device, &pool->bo, INT64_MAX);
      if (result != VK_SUCCESS)
         return result;
   }

   void *data_end = pData + dataSize;

   if (!device->info.has_llc) {
      uint64_t offset = firstQuery * pool->stride;
      uint64_t size = queryCount * pool->stride;
      anv_invalidate_range(pool->bo.map + offset,
                           MIN2(size, pool->bo.size - offset));
   }

   VkResult status = VK_SUCCESS;
   for (uint32_t i = 0; i < queryCount; i++) {
      uint64_t *slot = pool->bo.map + (firstQuery + i) * pool->stride;

      /* Availability is always at the start of the slot */
      bool available = slot[0];

      /* From the Vulkan 1.0.42 spec:
       *
       *    "If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
       *    both not set then no result values are written to pData for
       *    queries that are in the unavailable state at the time of the call,
       *    and vkGetQueryPoolResults returns VK_NOT_READY. However,
       *    availability state is still written to pData for those queries if
       *    VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set."
       */
      bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT);

      if (write_results) {
         switch (pool->type) {
         case VK_QUERY_TYPE_OCCLUSION: {
            cpu_write_query_result(pData, flags, 0, slot[2] - slot[1]);
            break;
         }

         case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
            uint32_t statistics = pool->pipeline_statistics;
            uint32_t idx = 0;
            while (statistics) {
               uint32_t stat = u_bit_scan(&statistics);
               uint64_t result = slot[idx * 2 + 2] - slot[idx * 2 + 1];

               /* WaDividePSInvocationCountBy4:HSW,BDW */
               if ((device->info.gen == 8 || device->info.is_haswell) &&
                   (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT)
                  result >>= 2;

               cpu_write_query_result(pData, flags, idx, result);

               idx++;
            }
            assert(idx == _mesa_bitcount(pool->pipeline_statistics));
            break;
         }

         case VK_QUERY_TYPE_TIMESTAMP: {
            cpu_write_query_result(pData, flags, 0, slot[1]);
            break;
         }
         default:
            unreachable("invalid pool type");
         }
      } else {
         status = VK_NOT_READY;
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         uint32_t idx = (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) ?
                        _mesa_bitcount(pool->pipeline_statistics) : 1;
         cpu_write_query_result(pData, flags, idx, available);
      }

      pData += stride;
      if (pData >= data_end)
         break;
   }

   return status;
}
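
/* Write the hardware PS_DEPTH_COUNT statistic (samples that passed the depth
 * test) to bo+offset using a PIPE_CONTROL post-sync operation.  Occlusion
 * queries take one such snapshot at begin and one at end and report the
 * difference.
 */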
static void
emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer,
                    struct anv_bo *bo, uint32_t offset)
{
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType = DAT_PPGTT;
      pc.PostSyncOperation = WritePSDepthCount;
      pc.DepthStallEnable = true;
      pc.Address = (struct anv_address) { bo, offset };

      if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
         pc.CommandStreamerStallEnable = true;
   }
}
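
/* Mark a query as available by writing 1 to the availability word at the
 * start of its slot, using a PIPE_CONTROL post-sync immediate write.
 */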
static void
emit_query_availability(struct anv_cmd_buffer *cmd_buffer,
                        struct anv_bo *bo, uint32_t offset)
{
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType = DAT_PPGTT;
      pc.PostSyncOperation = WriteImmediateData;
      pc.Address = (struct anv_address) { bo, offset };
      pc.ImmediateData = 1;
   }
}

void genX(CmdResetQueryPool)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
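
   /* Resetting a query just clears the availability word at the start of each
    * slot; the stale begin/end values are overwritten the next time the query
    * is used.
    */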
   for (uint32_t i = 0; i < queryCount; i++) {
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdm) {
         sdm.Address = (struct anv_address) {
            .bo = &pool->bo,
            .offset = (firstQuery + i) * pool->stride,
         };
         sdm.ImmediateData = 0;
      }
   }
}
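
/* Maps the bit position of each VkQueryPipelineStatisticFlagBits flag to the
 * MMIO address of the corresponding pipeline statistics counter register, so
 * the order has to match the order of the Vulkan flag bits.  Each counter is
 * 64 bits wide, which is why emit_pipeline_stat stores it with two 32-bit
 * MI_STORE_REGISTER_MEM writes.
 */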
static const uint32_t vk_pipeline_stat_to_reg[] = {
   GENX(IA_VERTICES_COUNT_num),
   GENX(IA_PRIMITIVES_COUNT_num),
   GENX(VS_INVOCATION_COUNT_num),
   GENX(GS_INVOCATION_COUNT_num),
   GENX(GS_PRIMITIVES_COUNT_num),
   GENX(CL_INVOCATION_COUNT_num),
   GENX(CL_PRIMITIVES_COUNT_num),
   GENX(PS_INVOCATION_COUNT_num),
   GENX(HS_INVOCATION_COUNT_num),
   GENX(DS_INVOCATION_COUNT_num),
   GENX(CS_INVOCATION_COUNT_num),
};

static void
emit_pipeline_stat(struct anv_cmd_buffer *cmd_buffer, uint32_t stat,
                   struct anv_bo *bo, uint32_t offset)
{
   STATIC_ASSERT(ANV_PIPELINE_STATISTICS_MASK ==
                 (1 << ARRAY_SIZE(vk_pipeline_stat_to_reg)) - 1);

   assert(stat < ARRAY_SIZE(vk_pipeline_stat_to_reg));
   uint32_t reg = vk_pipeline_stat_to_reg[stat];

   anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), lrm) {
      lrm.RegisterAddress = reg;
      lrm.MemoryAddress = (struct anv_address) { bo, offset };
   }
   anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), lrm) {
      lrm.RegisterAddress = reg + 4;
      lrm.MemoryAddress = (struct anv_address) { bo, offset + 4 };
   }
}
void genX(CmdBeginQuery)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query,
    VkQueryControlFlags                         flags)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   /* Workaround: When meta uses the pipeline with the VS disabled, it seems
    * that the pipelining of the depth write breaks. What we see is that
    * samples from the render pass clear leak into the first query
    * immediately after the clear. Doing a PIPE_CONTROL with a post-sync
    * operation and DepthStallEnable seems to work around the issue.
    */
   if (cmd_buffer->state.need_query_wa) {
      cmd_buffer->state.need_query_wa = false;
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.DepthCacheFlushEnable = true;
         pc.DepthStallEnable = true;
      }
   }

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_ps_depth_count(cmd_buffer, &pool->bo, query * pool->stride + 8);
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      /* TODO: This might only be necessary for certain stats */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      uint32_t statistics = pool->pipeline_statistics;
      uint32_t offset = query * pool->stride + 8;
      while (statistics) {
         uint32_t stat = u_bit_scan(&statistics);
         emit_pipeline_stat(cmd_buffer, stat, &pool->bo, offset);
         offset += 16;
      }
      break;
   }

   default:
      unreachable("invalid query type");
   }
}
void genX(CmdEndQuery)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_ps_depth_count(cmd_buffer, &pool->bo, query * pool->stride + 16);
      emit_query_availability(cmd_buffer, &pool->bo, query * pool->stride);
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      /* TODO: This might only be necessary for certain stats */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      uint32_t statistics = pool->pipeline_statistics;
      uint32_t offset = query * pool->stride + 16;
      while (statistics) {
         uint32_t stat = u_bit_scan(&statistics);
         emit_pipeline_stat(cmd_buffer, stat, &pool->bo, offset);
         offset += 16;
      }

      emit_query_availability(cmd_buffer, &pool->bo, query * pool->stride);
      break;
   }

   default:
      unreachable("invalid query type");
   }
}
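
/* MMIO offset of the render command streamer's 64-bit TIMESTAMP register,
 * which backs VK_QUERY_TYPE_TIMESTAMP queries.
 */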
#define TIMESTAMP 0x2358

void genX(CmdWriteTimestamp)(
    VkCommandBuffer                             commandBuffer,
    VkPipelineStageFlagBits                     pipelineStage,
    VkQueryPool                                 queryPool,
    uint32_t                                    query)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   uint32_t offset = query * pool->stride;

   assert(pool->type == VK_QUERY_TYPE_TIMESTAMP);

   switch (pipelineStage) {
   case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
         srm.RegisterAddress = TIMESTAMP;
         srm.MemoryAddress = (struct anv_address) { &pool->bo, offset + 8 };
      }
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
         srm.RegisterAddress = TIMESTAMP + 4;
         srm.MemoryAddress = (struct anv_address) { &pool->bo, offset + 12 };
      }
      break;

   default:
      /* Everything else is bottom-of-pipe */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.DestinationAddressType = DAT_PPGTT;
         pc.PostSyncOperation = WriteTimestamp;
         pc.Address = (struct anv_address) { &pool->bo, offset + 8 };

         if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
            pc.CommandStreamerStallEnable = true;
      }
      break;
   }

   emit_query_availability(cmd_buffer, &pool->bo, offset);
}

#if GEN_GEN > 7 || GEN_IS_HASWELL
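
/* MI_MATH ALU instructions are single dwords: the opcode lives in bits 31:20,
 * operand 1 in bits 19:10 and operand 2 in bits 9:0.  The macros below build
 * those dwords from the opcode and operand encodings that follow.
 */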
#define alu_opcode(v)   __gen_uint((v), 20, 31)
#define alu_operand1(v) __gen_uint((v), 10, 19)
#define alu_operand2(v) __gen_uint((v),  0,  9)
#define alu(opcode, operand1, operand2) \
   alu_opcode(opcode) | alu_operand1(operand1) | alu_operand2(operand2)

#define OPCODE_NOOP      0x000
#define OPCODE_LOAD      0x080
#define OPCODE_LOADINV   0x480
#define OPCODE_LOAD0     0x081
#define OPCODE_LOAD1     0x481
#define OPCODE_ADD       0x100
#define OPCODE_SUB       0x101
#define OPCODE_AND       0x102
#define OPCODE_OR        0x103
#define OPCODE_XOR       0x104
#define OPCODE_STORE     0x180
#define OPCODE_STOREINV  0x580

#define OPERAND_R0   0x00
#define OPERAND_R1   0x01
#define OPERAND_R2   0x02
#define OPERAND_R3   0x03
#define OPERAND_R4   0x04
#define OPERAND_SRCA 0x20
#define OPERAND_SRCB 0x21
#define OPERAND_ACCU 0x31
#define OPERAND_ZF   0x32
#define OPERAND_CF   0x33
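
/* MMIO offset of command streamer general-purpose register n.  Each GPR is
 * 64 bits wide, laid out as two consecutive dwords, hence the stride of 8
 * and the frequent "+ 4" accesses to the high half.
 */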
#define CS_GPR(n) (0x2600 + (n) * 8)

static void
emit_load_alu_reg_u64(struct anv_batch *batch, uint32_t reg,
                      struct anv_bo *bo, uint32_t offset)
{
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
      lrm.RegisterAddress = reg;
      lrm.MemoryAddress = (struct anv_address) { bo, offset };
   }
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
      lrm.RegisterAddress = reg + 4;
      lrm.MemoryAddress = (struct anv_address) { bo, offset + 4 };
   }
}

static void
emit_load_alu_reg_imm32(struct anv_batch *batch, uint32_t reg, uint32_t imm)
{
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
      lri.RegisterOffset = reg;
      lri.DataDWord = imm;
   }
}

static void
emit_load_alu_reg_imm64(struct anv_batch *batch, uint32_t reg, uint64_t imm)
{
   emit_load_alu_reg_imm32(batch, reg, (uint32_t)imm);
   emit_load_alu_reg_imm32(batch, reg + 4, (uint32_t)(imm >> 32));
}

static void
emit_load_alu_reg_reg32(struct anv_batch *batch, uint32_t src, uint32_t dst)
{
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_REG), lrr) {
      lrr.SourceRegisterAddress = src;
      lrr.DestinationRegisterAddress = dst;
   }
}
/*
 * GPR0 = GPR0 & ((1ull << n) - 1);
 */
static void
keep_gpr0_lower_n_bits(struct anv_batch *batch, uint32_t n)
{
   assert(n < 64);
   emit_load_alu_reg_imm64(batch, CS_GPR(1), (1ull << n) - 1);

   uint32_t *dw = anv_batch_emitn(batch, 5, GENX(MI_MATH));
   if (!dw) {
      anv_batch_set_error(batch, VK_ERROR_OUT_OF_HOST_MEMORY);
      return;
   }

   dw[1] = alu(OPCODE_LOAD, OPERAND_SRCA, OPERAND_R0);
   dw[2] = alu(OPCODE_LOAD, OPERAND_SRCB, OPERAND_R1);
   dw[3] = alu(OPCODE_AND, 0, 0);
   dw[4] = alu(OPCODE_STORE, OPERAND_R0, OPERAND_ACCU);
}

/*
 * GPR0 = GPR0 << 30;
 */
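/* The ALU opcodes above include no shift operation, so the shift is built out
 * of additions: loading R0 into both ALU sources and adding doubles the
 * value, i.e. shifts it left by one bit.  Each MI_MATH below packs six of
 * these doublings, and five such commands give the full 30-bit shift.
 */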
static void
shl_gpr0_by_30_bits(struct anv_batch *batch)
{
   /* First mask GPR0 down to its lower 34 bits to prevent overflow */
   keep_gpr0_lower_n_bits(batch, 34);

   const uint32_t outer_count = 5;
   const uint32_t inner_count = 6;
   STATIC_ASSERT(outer_count * inner_count == 30);
   const uint32_t cmd_len = 1 + inner_count * 4;

   /* We'll emit 5 commands, each shifting GPR0 left by 6 bits, for a total
    * shift of 30 bits.
    */
   for (int o = 0; o < outer_count; o++) {
      /* Submit one MI_MATH to shift left by 6 bits */
      uint32_t *dw = anv_batch_emitn(batch, cmd_len, GENX(MI_MATH));
      if (!dw) {
         anv_batch_set_error(batch, VK_ERROR_OUT_OF_HOST_MEMORY);
         return;
      }

      dw++;
      for (int i = 0; i < inner_count; i++, dw += 4) {
         dw[0] = alu(OPCODE_LOAD, OPERAND_SRCA, OPERAND_R0);
         dw[1] = alu(OPCODE_LOAD, OPERAND_SRCB, OPERAND_R0);
         dw[2] = alu(OPCODE_ADD, 0, 0);
         dw[3] = alu(OPCODE_STORE, OPERAND_R0, OPERAND_ACCU);
      }
   }
}

/*
 * GPR0 = GPR0 >> 2;
 *
 * This works by shifting left by 30 bits and then copying the high dword of
 * GPR0 into its low dword, which amounts to a further right shift by 32, for
 * a net right shift of 2.  Note that the upper 30 bits of GPR0 are lost!
 */
static void
shr_gpr0_by_2_bits(struct anv_batch *batch)
{
   shl_gpr0_by_30_bits(batch);
   emit_load_alu_reg_reg32(batch, CS_GPR(0) + 4, CS_GPR(0));
   emit_load_alu_reg_imm32(batch, CS_GPR(0) + 4, 0);
}
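
/* Store a query result that currently lives in a CS GPR into the destination
 * buffer at the element given by value_index, writing either 32 or 64 bits
 * depending on VK_QUERY_RESULT_64_BIT.
 */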
static void
gpu_write_query_result(struct anv_batch *batch,
                       struct anv_buffer *dst_buffer, uint32_t dst_offset,
                       VkQueryResultFlags flags,
                       uint32_t value_index, uint32_t reg)
{
   if (flags & VK_QUERY_RESULT_64_BIT)
      dst_offset += value_index * 8;
   else
      dst_offset += value_index * 4;

   anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
      srm.RegisterAddress = reg;
      srm.MemoryAddress = (struct anv_address) {
         .bo = dst_buffer->bo,
         .offset = dst_buffer->offset + dst_offset,
      };
   }

   if (flags & VK_QUERY_RESULT_64_BIT) {
      anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
         srm.RegisterAddress = reg + 4;
         srm.MemoryAddress = (struct anv_address) {
            .bo = dst_buffer->bo,
            .offset = dst_buffer->offset + dst_offset + 4,
         };
      }
   }
}
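
/* Load the 64-bit begin/end pair at bo+offset into GPR0/GPR1, compute
 * end - begin with MI_MATH, and leave the difference in the ALU register
 * named by dst_reg (e.g. OPERAND_R2).
 */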
static void
compute_query_result(struct anv_batch *batch, uint32_t dst_reg,
                     struct anv_bo *bo, uint32_t offset)
{
   emit_load_alu_reg_u64(batch, CS_GPR(0), bo, offset);
   emit_load_alu_reg_u64(batch, CS_GPR(1), bo, offset + 8);

   /* FIXME: We need to clamp the result for 32 bit. */

   uint32_t *dw = anv_batch_emitn(batch, 5, GENX(MI_MATH));
   if (!dw) {
      anv_batch_set_error(batch, VK_ERROR_OUT_OF_HOST_MEMORY);
      return;
   }

   dw[1] = alu(OPCODE_LOAD, OPERAND_SRCA, OPERAND_R1);
   dw[2] = alu(OPCODE_LOAD, OPERAND_SRCB, OPERAND_R0);
   dw[3] = alu(OPCODE_SUB, 0, 0);
   dw[4] = alu(OPCODE_STORE, dst_reg, OPERAND_ACCU);
}

void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    VkBuffer                                    destBuffer,
    VkDeviceSize                                destOffset,
    VkDeviceSize                                destStride,
    VkQueryResultFlags                          flags)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);
   uint32_t slot_offset;

   if (flags & VK_QUERY_RESULT_WAIT_BIT) {
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
   }

   for (uint32_t i = 0; i < queryCount; i++) {
      slot_offset = (firstQuery + i) * pool->stride;
      switch (pool->type) {
      case VK_QUERY_TYPE_OCCLUSION:
         compute_query_result(&cmd_buffer->batch, OPERAND_R2,
                              &pool->bo, slot_offset + 8);
         gpu_write_query_result(&cmd_buffer->batch, buffer, destOffset,
                                flags, 0, CS_GPR(2));
         break;

      case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
         uint32_t statistics = pool->pipeline_statistics;
         uint32_t idx = 0;
         while (statistics) {
            uint32_t stat = u_bit_scan(&statistics);

            compute_query_result(&cmd_buffer->batch, OPERAND_R0,
                                 &pool->bo, slot_offset + idx * 16 + 8);

            /* WaDividePSInvocationCountBy4:HSW,BDW */
            if ((cmd_buffer->device->info.gen == 8 ||
                 cmd_buffer->device->info.is_haswell) &&
                (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) {
               shr_gpr0_by_2_bits(&cmd_buffer->batch);
            }

            gpu_write_query_result(&cmd_buffer->batch, buffer, destOffset,
                                   flags, idx, CS_GPR(0));

            idx++;
         }
         assert(idx == _mesa_bitcount(pool->pipeline_statistics));
         break;
      }

      case VK_QUERY_TYPE_TIMESTAMP:
         emit_load_alu_reg_u64(&cmd_buffer->batch,
                               CS_GPR(2), &pool->bo, slot_offset + 8);
         gpu_write_query_result(&cmd_buffer->batch, buffer, destOffset,
                                flags, 0, CS_GPR(2));
         break;

      default:
         unreachable("unhandled query type");
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         uint32_t idx = (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) ?
                        _mesa_bitcount(pool->pipeline_statistics) : 1;

         emit_load_alu_reg_u64(&cmd_buffer->batch, CS_GPR(0),
                               &pool->bo, slot_offset);
         gpu_write_query_result(&cmd_buffer->batch, buffer, destOffset,
                                flags, idx, CS_GPR(0));
      }

      destOffset += destStride;
   }
}

#else
void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    VkBuffer                                    destBuffer,
    VkDeviceSize                                destOffset,
    VkDeviceSize                                destStride,
    VkQueryResultFlags                          flags)
{
   anv_finishme("Queries not yet supported on Ivy Bridge");
}
#endif