/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <assert.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>

#include "anv_private.h"

#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"

VkResult genX(CreateQueryPool)(
    VkDevice                                    _device,
    const VkQueryPoolCreateInfo*                pCreateInfo,
    const VkAllocationCallbacks*                pAllocator,
    VkQueryPool*                                pQueryPool)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   struct anv_query_pool *pool;
   VkResult result;

   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);

   /* Query pool slots are made up of some number of 64-bit values packed
    * tightly together. The first 64-bit value is always the "available" bit
    * which is 0 when the query is unavailable and 1 when it is available.
    * The 64-bit values that follow are determined by the type of query.
    */
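   /* For example, an occlusion query slot ends up as three uint64s,
    *    { available, begin depth count, end depth count },
    * a timestamp slot as { available, timestamp }, and a pipeline
    * statistics slot as the availability value followed by a begin/end
    * pair for each enabled statistic.
    */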
   uint32_t uint64s_per_slot = 1;

   VkQueryPipelineStatisticFlags pipeline_statistics = 0;
   switch (pCreateInfo->queryType) {
   case VK_QUERY_TYPE_OCCLUSION:
      /* Occlusion queries have two values: begin and end. */
      uint64s_per_slot += 2;
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
      /* Timestamps just have the one timestamp value */
      uint64s_per_slot += 1;
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      pipeline_statistics = pCreateInfo->pipelineStatistics;
      /* We're going to trust this field implicitly so we need to ensure that
       * no unhandled extension bits leak in.
       */
      pipeline_statistics &= ANV_PIPELINE_STATISTICS_MASK;

      /* Statistics queries record a begin and an end value for every statistic */
      uint64s_per_slot += 2 * _mesa_bitcount(pipeline_statistics);
      break;
   default:
      assert(!"Invalid query type");
   }

   pool = vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
                    VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (pool == NULL)
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

   pool->type = pCreateInfo->queryType;
   pool->pipeline_statistics = pipeline_statistics;
   pool->stride = uint64s_per_slot * sizeof(uint64_t);
   pool->slots = pCreateInfo->queryCount;

   uint64_t size = pool->slots * pool->stride;
   result = anv_bo_init_new(&pool->bo, device, size);
   if (result != VK_SUCCESS)
      goto fail;

   /* For query pools, we set the caching mode to I915_CACHING_CACHED. On LLC
    * platforms, this does nothing. On non-LLC platforms, this means snooping
    * which comes at a slight cost. However, the buffers aren't big, won't be
    * written frequently, and trying to handle the flushing manually without
    * doing too much flushing is extremely painful.
    */
   anv_gem_set_caching(device, pool->bo.gem_handle, I915_CACHING_CACHED);

   pool->bo.map = anv_gem_mmap(device, pool->bo.gem_handle, 0, size, 0);

   *pQueryPool = anv_query_pool_to_handle(pool);

   return VK_SUCCESS;

 fail:
   vk_free2(&device->alloc, pAllocator, pool);

   return result;
}

void genX(DestroyQueryPool)(
    VkDevice                                    _device,
    VkQueryPool                                 _pool,
    const VkAllocationCallbacks*                pAllocator)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_query_pool, pool, _pool);

   if (!pool)
      return;

   anv_gem_munmap(pool->bo.map, pool->bo.size);
   anv_gem_close(device, pool->bo.gem_handle);
   vk_free2(&device->alloc, pAllocator, pool);
}

static void
cpu_write_query_result(void *dst_slot, VkQueryResultFlags flags,
                       uint32_t value_index, uint64_t result)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      uint64_t *dst64 = dst_slot;
      dst64[value_index] = result;
   } else {
      uint32_t *dst32 = dst_slot;
      dst32[value_index] = result;
   }
}

static bool
query_is_available(uint64_t *slot)
{
   return *(volatile uint64_t *)slot;
}

static VkResult
wait_for_available(struct anv_device *device,
                   struct anv_query_pool *pool, uint64_t *slot)
{
   while (true) {
      if (query_is_available(slot))
         return VK_SUCCESS;

      int ret = anv_gem_busy(device, pool->bo.gem_handle);
      if (ret == 1) {
         /* The BO is still busy, keep waiting. */
         continue;
      } else if (ret == -1) {
         /* We don't know the real error. */
         device->lost = true;
         return vk_errorf(VK_ERROR_DEVICE_LOST, "gem wait failed: %m");
      } else {
         assert(ret == 0);
         /* The BO is no longer busy. */
         if (query_is_available(slot)) {
            return VK_SUCCESS;
         } else {
            VkResult status = anv_device_query_status(device);
            if (status != VK_SUCCESS)
               return status;

            /* If we haven't seen availability yet, then we never will. This
             * can only happen if we have a client error where they call
             * GetQueryPoolResults on a query that they haven't submitted to
             * the GPU yet. The spec allows us to do anything in this case,
             * but returning VK_SUCCESS doesn't seem right and we shouldn't
             * just keep spinning.
             */
            return VK_NOT_READY;
         }
      }
   }
}

VkResult genX(GetQueryPoolResults)(
    VkDevice                                    _device,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    size_t                                      dataSize,
    void*                                       pData,
    VkDeviceSize                                stride,
    VkQueryResultFlags                          flags)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   assert(pool->type == VK_QUERY_TYPE_OCCLUSION ||
          pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
          pool->type == VK_QUERY_TYPE_TIMESTAMP);

   if (unlikely(device->lost))
      return VK_ERROR_DEVICE_LOST;

   if (pData == NULL)
      return VK_SUCCESS;

   void *data_end = pData + dataSize;

   VkResult status = VK_SUCCESS;
   for (uint32_t i = 0; i < queryCount; i++) {
      uint64_t *slot = pool->bo.map + (firstQuery + i) * pool->stride;

      /* Availability is always at the start of the slot */
      bool available = slot[0];

      if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) {
         status = wait_for_available(device, pool, slot);
         if (status != VK_SUCCESS)
            return status;

         available = true;
      }

      /* From the Vulkan 1.0.42 spec:
       *
       *    "If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
       *    both not set then no result values are written to pData for
       *    queries that are in the unavailable state at the time of the call,
       *    and vkGetQueryPoolResults returns VK_NOT_READY. However,
       *    availability state is still written to pData for those queries if
       *    VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set."
       */
      bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT);

      if (write_results) {
         switch (pool->type) {
         case VK_QUERY_TYPE_OCCLUSION: {
            cpu_write_query_result(pData, flags, 0, slot[2] - slot[1]);
            break;
         }

         case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
            uint32_t statistics = pool->pipeline_statistics;
            uint32_t idx = 0;
            while (statistics) {
               uint32_t stat = u_bit_scan(&statistics);
               uint64_t result = slot[idx * 2 + 2] - slot[idx * 2 + 1];

               /* WaDividePSInvocationCountBy4:HSW,BDW */
               if ((device->info.gen == 8 || device->info.is_haswell) &&
                   (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT)
                  result >>= 2;

               cpu_write_query_result(pData, flags, idx, result);

               idx++;
            }
            assert(idx == _mesa_bitcount(pool->pipeline_statistics));
            break;
         }

         case VK_QUERY_TYPE_TIMESTAMP: {
            cpu_write_query_result(pData, flags, 0, slot[1]);
            break;
         }
         default:
            unreachable("invalid pool type");
         }
      } else {
         status = VK_NOT_READY;
      }

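      /* The availability value is written after the query's result values:
       * one result for occlusion and timestamp queries, and one per enabled
       * statistic for pipeline statistics queries.
       */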
      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         uint32_t idx = (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) ?
                        _mesa_bitcount(pool->pipeline_statistics) : 1;
         cpu_write_query_result(pData, flags, idx, available);
      }

      pData += stride;
      if (pData >= data_end)
         break;
   }

   return status;
}

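/* Occlusion queries are implemented with the pixel pipe's depth count: a
 * depth-stalled PIPE_CONTROL writes the current PS_DEPTH_COUNT value to the
 * given address, and the query result is the difference between the end and
 * begin snapshots.
 */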
static void
emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer,
                    struct anv_bo *bo, uint32_t offset)
{
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType = DAT_PPGTT;
      pc.PostSyncOperation = WritePSDepthCount;
      pc.DepthStallEnable = true;
      pc.Address = (struct anv_address) { bo, offset };

      if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
         pc.CommandStreamerStallEnable = true;
   }
}

static void
emit_query_availability(struct anv_cmd_buffer *cmd_buffer,
                        struct anv_bo *bo, uint32_t offset)
{
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType = DAT_PPGTT;
      pc.PostSyncOperation = WriteImmediateData;
      pc.Address = (struct anv_address) { bo, offset };
      pc.ImmediateData = 1;
   }
}

void genX(CmdResetQueryPool)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

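   /* Resetting a query only needs to clear the availability value at the
    * start of each slot; the begin/end values are overwritten the next time
    * the query is used.
    */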
   for (uint32_t i = 0; i < queryCount; i++) {
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdm) {
         sdm.Address = (struct anv_address) {
            .bo = &pool->bo,
            .offset = (firstQuery + i) * pool->stride,
         };
         sdm.ImmediateData = 0;
      }
   }
}

static const uint32_t vk_pipeline_stat_to_reg[] = {
   GENX(IA_VERTICES_COUNT_num),
   GENX(IA_PRIMITIVES_COUNT_num),
   GENX(VS_INVOCATION_COUNT_num),
   GENX(GS_INVOCATION_COUNT_num),
   GENX(GS_PRIMITIVES_COUNT_num),
   GENX(CL_INVOCATION_COUNT_num),
   GENX(CL_PRIMITIVES_COUNT_num),
   GENX(PS_INVOCATION_COUNT_num),
   GENX(HS_INVOCATION_COUNT_num),
   GENX(DS_INVOCATION_COUNT_num),
   GENX(CS_INVOCATION_COUNT_num),
};

static void
emit_pipeline_stat(struct anv_cmd_buffer *cmd_buffer, uint32_t stat,
                   struct anv_bo *bo, uint32_t offset)
{
   STATIC_ASSERT(ANV_PIPELINE_STATISTICS_MASK ==
                 (1 << ARRAY_SIZE(vk_pipeline_stat_to_reg)) - 1);

   assert(stat < ARRAY_SIZE(vk_pipeline_stat_to_reg));
   uint32_t reg = vk_pipeline_stat_to_reg[stat];

   anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), lrm) {
      lrm.RegisterAddress = reg;
      lrm.MemoryAddress = (struct anv_address) { bo, offset };
   }
   anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), lrm) {
      lrm.RegisterAddress = reg + 4;
      lrm.MemoryAddress = (struct anv_address) { bo, offset + 4 };
   }
}

void genX(CmdBeginQuery)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query,
    VkQueryControlFlags                         flags)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   /* Workaround: When meta uses the pipeline with the VS disabled, it seems
    * that the pipelining of the depth write breaks. What we see is that
    * samples from the render-pass clear leak into the first query issued
    * immediately after the clear. Emitting a PIPE_CONTROL with a post-sync
    * operation and DepthStallEnable set seems to work around the issue.
    */
   if (cmd_buffer->state.need_query_wa) {
      cmd_buffer->state.need_query_wa = false;
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.DepthCacheFlushEnable = true;
         pc.DepthStallEnable = true;
      }
   }

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_ps_depth_count(cmd_buffer, &pool->bo, query * pool->stride + 8);
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      /* TODO: This might only be necessary for certain stats */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      uint32_t statistics = pool->pipeline_statistics;
      uint32_t offset = query * pool->stride + 8;
      while (statistics) {
         uint32_t stat = u_bit_scan(&statistics);
         emit_pipeline_stat(cmd_buffer, stat, &pool->bo, offset);
         offset += 16;
      }
      break;
   }

   default:
      unreachable("");
   }
}

void genX(CmdEndQuery)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_ps_depth_count(cmd_buffer, &pool->bo, query * pool->stride + 16);
      emit_query_availability(cmd_buffer, &pool->bo, query * pool->stride);
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      /* TODO: This might only be necessary for certain stats */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      uint32_t statistics = pool->pipeline_statistics;
      uint32_t offset = query * pool->stride + 16;
      while (statistics) {
         uint32_t stat = u_bit_scan(&statistics);
         emit_pipeline_stat(cmd_buffer, stat, &pool->bo, offset);
         offset += 16;
      }

      emit_query_availability(cmd_buffer, &pool->bo, query * pool->stride);
      break;
   }

   default:
      unreachable("");
   }
}

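/* MMIO address of the command streamer TIMESTAMP register; the 64-bit value
 * is stored below as two 32-bit halves (TIMESTAMP and TIMESTAMP + 4).
 */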
#define TIMESTAMP 0x2358

void genX(CmdWriteTimestamp)(
    VkCommandBuffer                             commandBuffer,
    VkPipelineStageFlagBits                     pipelineStage,
    VkQueryPool                                 queryPool,
    uint32_t                                    query)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   uint32_t offset = query * pool->stride;

   assert(pool->type == VK_QUERY_TYPE_TIMESTAMP);

   switch (pipelineStage) {
   case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
         srm.RegisterAddress = TIMESTAMP;
         srm.MemoryAddress = (struct anv_address) { &pool->bo, offset + 8 };
      }
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
         srm.RegisterAddress = TIMESTAMP + 4;
         srm.MemoryAddress = (struct anv_address) { &pool->bo, offset + 12 };
      }
      break;

   default:
      /* Everything else is bottom-of-pipe */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.DestinationAddressType = DAT_PPGTT;
         pc.PostSyncOperation = WriteTimestamp;
         pc.Address = (struct anv_address) { &pool->bo, offset + 8 };

         if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
            pc.CommandStreamerStallEnable = true;
      }
      break;
   }

   emit_query_availability(cmd_buffer, &pool->bo, offset);
}

#if GEN_GEN > 7 || GEN_IS_HASWELL

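/* The GPU-side result computation below relies on MI_MATH and the command
 * streamer general-purpose registers, which are only available on Haswell
 * and later; earlier gen7 parts fall back to the stub at the end of this
 * file.
 */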
#define alu_opcode(v)   __gen_uint((v), 20, 31)
#define alu_operand1(v) __gen_uint((v), 10, 19)
#define alu_operand2(v) __gen_uint((v), 0, 9)
#define alu(opcode, operand1, operand2) \
   alu_opcode(opcode) | alu_operand1(operand1) | alu_operand2(operand2)

#define OPCODE_NOOP     0x000
#define OPCODE_LOAD     0x080
#define OPCODE_LOADINV  0x480
#define OPCODE_LOAD0    0x081
#define OPCODE_LOAD1    0x481
#define OPCODE_ADD      0x100
#define OPCODE_SUB      0x101
#define OPCODE_AND      0x102
#define OPCODE_OR       0x103
#define OPCODE_XOR      0x104
#define OPCODE_STORE    0x180
#define OPCODE_STOREINV 0x580

#define OPERAND_R0   0x00
#define OPERAND_R1   0x01
#define OPERAND_R2   0x02
#define OPERAND_R3   0x03
#define OPERAND_R4   0x04
#define OPERAND_SRCA 0x20
#define OPERAND_SRCB 0x21
#define OPERAND_ACCU 0x31
#define OPERAND_ZF   0x32
#define OPERAND_CF   0x33

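/* The command streamer general-purpose registers are 64 bits wide and spaced
 * 8 bytes apart; they are what MI_MATH's R0..Rn operands refer to.
 */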
#define CS_GPR(n) (0x2600 + (n) * 8)

static void
emit_load_alu_reg_u64(struct anv_batch *batch, uint32_t reg,
                      struct anv_bo *bo, uint32_t offset)
{
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
      lrm.RegisterAddress = reg;
      lrm.MemoryAddress = (struct anv_address) { bo, offset };
   }
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
      lrm.RegisterAddress = reg + 4;
      lrm.MemoryAddress = (struct anv_address) { bo, offset + 4 };
   }
}

static void
emit_load_alu_reg_imm32(struct anv_batch *batch, uint32_t reg, uint32_t imm)
{
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
      lri.RegisterOffset = reg;
      lri.DataDWord = imm;
   }
}

static void
emit_load_alu_reg_imm64(struct anv_batch *batch, uint32_t reg, uint64_t imm)
{
   emit_load_alu_reg_imm32(batch, reg, (uint32_t)imm);
   emit_load_alu_reg_imm32(batch, reg + 4, (uint32_t)(imm >> 32));
}

static void
emit_load_alu_reg_reg32(struct anv_batch *batch, uint32_t src, uint32_t dst)
{
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_REG), lrr) {
      lrr.SourceRegisterAddress = src;
      lrr.DestinationRegisterAddress = dst;
   }
}

/*
 * GPR0 = GPR0 & ((1ull << n) - 1);
 */
static void
keep_gpr0_lower_n_bits(struct anv_batch *batch, uint32_t n)
{
   assert(n < 64);
   emit_load_alu_reg_imm64(batch, CS_GPR(1), (1ull << n) - 1);

   uint32_t *dw = anv_batch_emitn(batch, 5, GENX(MI_MATH));
   dw[1] = alu(OPCODE_LOAD, OPERAND_SRCA, OPERAND_R0);
   dw[2] = alu(OPCODE_LOAD, OPERAND_SRCB, OPERAND_R1);
   dw[3] = alu(OPCODE_AND, 0, 0);
   dw[4] = alu(OPCODE_STORE, OPERAND_R0, OPERAND_ACCU);
}

/*
 * GPR0 = GPR0 << 30;
 */
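/* The MI_MATH ALU has no shift opcode, so the shift is built out of
 * additions: loading GPR0 into both ALU sources and adding doubles the
 * value, i.e. shifts it left by one bit.
 */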
static void
shl_gpr0_by_30_bits(struct anv_batch *batch)
{
   /* First keep only the lower 34 bits of GPR0 so the shift below cannot
    * overflow 64 bits.
    */
   keep_gpr0_lower_n_bits(batch, 34);

   const uint32_t outer_count = 5;
   const uint32_t inner_count = 6;
   STATIC_ASSERT(outer_count * inner_count == 30);
   const uint32_t cmd_len = 1 + inner_count * 4;

   /* We'll emit 5 MI_MATH commands, each shifting GPR0 left by 6 bits, for a
    * total shift of 30 bits.
    */
   for (int o = 0; o < outer_count; o++) {
      /* Submit one MI_MATH to shift left by 6 bits */
      uint32_t *dw = anv_batch_emitn(batch, cmd_len, GENX(MI_MATH));
      dw++;
      for (int i = 0; i < inner_count; i++, dw += 4) {
         dw[0] = alu(OPCODE_LOAD, OPERAND_SRCA, OPERAND_R0);
         dw[1] = alu(OPCODE_LOAD, OPERAND_SRCB, OPERAND_R0);
         dw[2] = alu(OPCODE_ADD, 0, 0);
         dw[3] = alu(OPCODE_STORE, OPERAND_R0, OPERAND_ACCU);
      }
   }
}

/*
 * GPR0 = GPR0 >> 2;
 *
 * Note that the upper 30 bits of GPR0 are lost!
 */
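/* Implemented by shifting left by 30 and then moving the high dword of GPR0
 * into its low dword, which amounts to an additional shift right by 32.
 */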
static void
shr_gpr0_by_2_bits(struct anv_batch *batch)
{
   shl_gpr0_by_30_bits(batch);
   emit_load_alu_reg_reg32(batch, CS_GPR(0) + 4, CS_GPR(0));
   emit_load_alu_reg_imm32(batch, CS_GPR(0) + 4, 0);
}

static void
gpu_write_query_result(struct anv_batch *batch,
                       struct anv_buffer *dst_buffer, uint32_t dst_offset,
                       VkQueryResultFlags flags,
                       uint32_t value_index, uint32_t reg)
{
   if (flags & VK_QUERY_RESULT_64_BIT)
      dst_offset += value_index * 8;
   else
      dst_offset += value_index * 4;

   anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
      srm.RegisterAddress = reg;
      srm.MemoryAddress = (struct anv_address) {
         .bo = dst_buffer->bo,
         .offset = dst_buffer->offset + dst_offset,
      };
   }

   if (flags & VK_QUERY_RESULT_64_BIT) {
      anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
         srm.RegisterAddress = reg + 4;
         srm.MemoryAddress = (struct anv_address) {
            .bo = dst_buffer->bo,
            .offset = dst_buffer->offset + dst_offset + 4,
         };
      }
   }
}

static void
compute_query_result(struct anv_batch *batch, uint32_t dst_reg,
                     struct anv_bo *bo, uint32_t offset)
{
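   /* Load the begin value of the slot into GPR0 and the end value into GPR1,
    * then use the ALU to compute dst_reg = GPR1 - GPR0.
    */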
   emit_load_alu_reg_u64(batch, CS_GPR(0), bo, offset);
   emit_load_alu_reg_u64(batch, CS_GPR(1), bo, offset + 8);

   /* FIXME: We need to clamp the result for 32 bit. */

   uint32_t *dw = anv_batch_emitn(batch, 5, GENX(MI_MATH));
   if (!dw) {
      anv_batch_set_error(batch, VK_ERROR_OUT_OF_HOST_MEMORY);
      return;
   }

   dw[1] = alu(OPCODE_LOAD, OPERAND_SRCA, OPERAND_R1);
   dw[2] = alu(OPCODE_LOAD, OPERAND_SRCB, OPERAND_R0);
   dw[3] = alu(OPCODE_SUB, 0, 0);
   dw[4] = alu(OPCODE_STORE, dst_reg, OPERAND_ACCU);
}

void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    VkBuffer                                    destBuffer,
    VkDeviceSize                                destOffset,
    VkDeviceSize                                destStride,
    VkQueryResultFlags                          flags)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);
   uint32_t slot_offset;

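   /* With VK_QUERY_RESULT_WAIT_BIT, stall the command streamer up front so
    * that the register loads below are not executed until earlier work
    * writing the query results has completed.
    */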
   if (flags & VK_QUERY_RESULT_WAIT_BIT) {
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
   }

   for (uint32_t i = 0; i < queryCount; i++) {
      slot_offset = (firstQuery + i) * pool->stride;
      switch (pool->type) {
      case VK_QUERY_TYPE_OCCLUSION:
         compute_query_result(&cmd_buffer->batch, OPERAND_R2,
                              &pool->bo, slot_offset + 8);
         gpu_write_query_result(&cmd_buffer->batch, buffer, destOffset,
                                flags, 0, CS_GPR(2));
         break;

      case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
         uint32_t statistics = pool->pipeline_statistics;
         uint32_t idx = 0;
         while (statistics) {
            uint32_t stat = u_bit_scan(&statistics);

            compute_query_result(&cmd_buffer->batch, OPERAND_R0,
                                 &pool->bo, slot_offset + idx * 16 + 8);

            /* WaDividePSInvocationCountBy4:HSW,BDW */
            if ((cmd_buffer->device->info.gen == 8 ||
                 cmd_buffer->device->info.is_haswell) &&
                (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) {
               shr_gpr0_by_2_bits(&cmd_buffer->batch);
            }

            gpu_write_query_result(&cmd_buffer->batch, buffer, destOffset,
                                   flags, idx, CS_GPR(0));

            idx++;
         }
         assert(idx == _mesa_bitcount(pool->pipeline_statistics));
         break;
      }

      case VK_QUERY_TYPE_TIMESTAMP:
         emit_load_alu_reg_u64(&cmd_buffer->batch,
                               CS_GPR(2), &pool->bo, slot_offset + 8);
         gpu_write_query_result(&cmd_buffer->batch, buffer, destOffset,
                                flags, 0, CS_GPR(2));
         break;

      default:
         unreachable("unhandled query type");
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         uint32_t idx = (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) ?
                        _mesa_bitcount(pool->pipeline_statistics) : 1;

         emit_load_alu_reg_u64(&cmd_buffer->batch, CS_GPR(0),
                               &pool->bo, slot_offset);
         gpu_write_query_result(&cmd_buffer->batch, buffer, destOffset,
                                flags, idx, CS_GPR(0));
      }

      destOffset += destStride;
   }
}

#else
void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    VkBuffer                                    destBuffer,
    VkDeviceSize                                destOffset,
    VkDeviceSize                                destStride,
    VkQueryResultFlags                          flags)
{
   anv_finishme("Queries not yet supported on Ivy Bridge");
}
#endif