anv: Soft-pin everything else
[mesa.git] / src / intel / vulkan / genX_query.c
/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <assert.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>

#include "anv_private.h"

#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"

VkResult genX(CreateQueryPool)(
    VkDevice                                    _device,
    const VkQueryPoolCreateInfo*                pCreateInfo,
    const VkAllocationCallbacks*                pAllocator,
    VkQueryPool*                                pQueryPool)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   const struct anv_physical_device *pdevice = &device->instance->physicalDevice;
   struct anv_query_pool *pool;
   VkResult result;

   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);

   /* Query pool slots are made up of some number of 64-bit values packed
    * tightly together. The first 64-bit value is always the "available" bit
    * which is 0 when the query is unavailable and 1 when it is available.
    * The 64-bit values that follow are determined by the type of query.
    */
   uint32_t uint64s_per_slot = 1;

   VkQueryPipelineStatisticFlags pipeline_statistics = 0;
   switch (pCreateInfo->queryType) {
   case VK_QUERY_TYPE_OCCLUSION:
      /* Occlusion queries have two values: begin and end. */
      uint64s_per_slot += 2;
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
      /* Timestamps just have the one timestamp value */
      uint64s_per_slot += 1;
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      pipeline_statistics = pCreateInfo->pipelineStatistics;
      /* We're going to trust this field implicitly so we need to ensure that
       * no unhandled extension bits leak in.
       */
      pipeline_statistics &= ANV_PIPELINE_STATISTICS_MASK;

      /* Statistics queries have a begin and end value for every statistic */
      uint64s_per_slot += 2 * _mesa_bitcount(pipeline_statistics);
      break;
   default:
      assert(!"Invalid query type");
   }

   pool = vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
                    VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (pool == NULL)
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

   pool->type = pCreateInfo->queryType;
   pool->pipeline_statistics = pipeline_statistics;
   pool->stride = uint64s_per_slot * sizeof(uint64_t);
   pool->slots = pCreateInfo->queryCount;

   uint64_t size = pool->slots * pool->stride;
   result = anv_bo_init_new(&pool->bo, device, size);
   if (result != VK_SUCCESS)
      goto fail;

   if (pdevice->supports_48bit_addresses)
      pool->bo.flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS;

   if (pdevice->use_softpin)
      pool->bo.flags |= EXEC_OBJECT_PINNED;

   if (pdevice->has_exec_async)
      pool->bo.flags |= EXEC_OBJECT_ASYNC;

   anv_vma_alloc(device, &pool->bo);

   /* For query pools, we set the caching mode to I915_CACHING_CACHED. On LLC
    * platforms, this does nothing. On non-LLC platforms, this means snooping
    * which comes at a slight cost. However, the buffers aren't big, won't be
    * written frequently, and trying to handle the flushing manually without
    * doing too much flushing is extremely painful.
    */
   anv_gem_set_caching(device, pool->bo.gem_handle, I915_CACHING_CACHED);

   pool->bo.map = anv_gem_mmap(device, pool->bo.gem_handle, 0, size, 0);

   *pQueryPool = anv_query_pool_to_handle(pool);

   return VK_SUCCESS;

 fail:
   vk_free2(&device->alloc, pAllocator, pool);

   return result;
}

void genX(DestroyQueryPool)(
    VkDevice                                    _device,
    VkQueryPool                                 _pool,
    const VkAllocationCallbacks*                pAllocator)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_query_pool, pool, _pool);

   if (!pool)
      return;

   anv_gem_munmap(pool->bo.map, pool->bo.size);
   anv_vma_free(device, &pool->bo);
   anv_gem_close(device, pool->bo.gem_handle);
   vk_free2(&device->alloc, pAllocator, pool);
}

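/* Write a single query value into the caller-provided results buffer,
 * honoring the requested 32-bit or 64-bit result width.
 */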
static void
cpu_write_query_result(void *dst_slot, VkQueryResultFlags flags,
                       uint32_t value_index, uint64_t result)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      uint64_t *dst64 = dst_slot;
      dst64[value_index] = result;
   } else {
      uint32_t *dst32 = dst_slot;
      dst32[value_index] = result;
   }
}

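/* Read the availability word through a volatile pointer so the compiler
 * re-reads it from memory each time we poll it in wait_for_available().
 */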
static bool
query_is_available(uint64_t *slot)
{
   return *(volatile uint64_t *)slot;
}

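/* Poll until the query slot becomes available.  The GEM busy ioctl tells us
 * whether the GPU is still working on the pool's BO; if the BO goes idle
 * without availability ever being written, we bail out with VK_NOT_READY
 * instead of spinning forever.
 */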
static VkResult
wait_for_available(struct anv_device *device,
                   struct anv_query_pool *pool, uint64_t *slot)
{
   while (true) {
      if (query_is_available(slot))
         return VK_SUCCESS;

      int ret = anv_gem_busy(device, pool->bo.gem_handle);
      if (ret == 1) {
         /* The BO is still busy, keep waiting. */
         continue;
      } else if (ret == -1) {
         /* We don't know the real error. */
         device->lost = true;
         return vk_errorf(device->instance, device, VK_ERROR_DEVICE_LOST,
                          "gem wait failed: %m");
      } else {
         assert(ret == 0);
         /* The BO is no longer busy. */
         if (query_is_available(slot)) {
            return VK_SUCCESS;
         } else {
            VkResult status = anv_device_query_status(device);
            if (status != VK_SUCCESS)
               return status;

            /* If we haven't seen availability yet, then we never will.  This
             * can only happen if we have a client error where they call
             * GetQueryPoolResults on a query that they haven't submitted to
             * the GPU yet.  The spec allows us to do anything in this case,
             * but returning VK_SUCCESS doesn't seem right and we shouldn't
             * just keep spinning.
             */
            return VK_NOT_READY;
         }
      }
   }
}

VkResult genX(GetQueryPoolResults)(
    VkDevice                                    _device,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    size_t                                      dataSize,
    void*                                       pData,
    VkDeviceSize                                stride,
    VkQueryResultFlags                          flags)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   assert(pool->type == VK_QUERY_TYPE_OCCLUSION ||
          pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
          pool->type == VK_QUERY_TYPE_TIMESTAMP);

   if (unlikely(device->lost))
      return VK_ERROR_DEVICE_LOST;

   if (pData == NULL)
      return VK_SUCCESS;

   void *data_end = pData + dataSize;

   VkResult status = VK_SUCCESS;
   for (uint32_t i = 0; i < queryCount; i++) {
      uint64_t *slot = pool->bo.map + (firstQuery + i) * pool->stride;

      /* Availability is always at the start of the slot */
      bool available = slot[0];

      if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) {
         status = wait_for_available(device, pool, slot);
         if (status != VK_SUCCESS)
            return status;

         available = true;
      }

      /* From the Vulkan 1.0.42 spec:
       *
       *    "If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
       *    both not set then no result values are written to pData for
       *    queries that are in the unavailable state at the time of the call,
       *    and vkGetQueryPoolResults returns VK_NOT_READY.  However,
       *    availability state is still written to pData for those queries if
       *    VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set."
       */
      bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT);

      if (write_results) {
         switch (pool->type) {
         case VK_QUERY_TYPE_OCCLUSION: {
            cpu_write_query_result(pData, flags, 0, slot[2] - slot[1]);
            break;
         }

         case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
            uint32_t statistics = pool->pipeline_statistics;
            uint32_t idx = 0;
            while (statistics) {
               uint32_t stat = u_bit_scan(&statistics);
               uint64_t result = slot[idx * 2 + 2] - slot[idx * 2 + 1];

               /* WaDividePSInvocationCountBy4:HSW,BDW */
               if ((device->info.gen == 8 || device->info.is_haswell) &&
                   (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT)
                  result >>= 2;

               cpu_write_query_result(pData, flags, idx, result);

               idx++;
            }
            assert(idx == _mesa_bitcount(pool->pipeline_statistics));
            break;
         }

         case VK_QUERY_TYPE_TIMESTAMP: {
            cpu_write_query_result(pData, flags, 0, slot[1]);
            break;
         }
         default:
            unreachable("invalid pool type");
         }
      } else {
         status = VK_NOT_READY;
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         uint32_t idx = (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) ?
                        _mesa_bitcount(pool->pipeline_statistics) : 1;
         cpu_write_query_result(pData, flags, idx, available);
      }

      pData += stride;
      if (pData >= data_end)
         break;
   }

   return status;
}

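/* Emit a PIPE_CONTROL that stalls at the depth stage and writes the current
 * PS depth count (the occlusion counter) to the given offset in the BO.
 */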
static void
emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer,
                    struct anv_bo *bo, uint32_t offset)
{
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType = DAT_PPGTT;
      pc.PostSyncOperation = WritePSDepthCount;
      pc.DepthStallEnable = true;
      pc.Address = (struct anv_address) { bo, offset };

      if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
         pc.CommandStreamerStallEnable = true;
   }
}

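/* Mark a query slot as available by writing an immediate 1 to its
 * availability word from a post-sync PIPE_CONTROL.
 */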
static void
emit_query_availability(struct anv_cmd_buffer *cmd_buffer,
                        struct anv_bo *bo, uint32_t offset)
{
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType = DAT_PPGTT;
      pc.PostSyncOperation = WriteImmediateData;
      pc.Address = (struct anv_address) { bo, offset };
      pc.ImmediateData = 1;
   }
}

/**
 * Goes through a series of consecutive query indices in the given pool
 * setting all element values to 0 and emitting them as available.
 */
static void
emit_zero_queries(struct anv_cmd_buffer *cmd_buffer,
                  struct anv_query_pool *pool,
                  uint32_t first_index, uint32_t num_queries)
{
   const uint32_t num_elements = pool->stride / sizeof(uint64_t);

   for (uint32_t i = 0; i < num_queries; i++) {
      uint32_t slot_offset = (first_index + i) * pool->stride;
      for (uint32_t j = 1; j < num_elements; j++) {
         anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
            sdi.Address.bo = &pool->bo;
            sdi.Address.offset = slot_offset + j * sizeof(uint64_t);
            sdi.ImmediateData = 0ull;
         }
      }
      emit_query_availability(cmd_buffer, &pool->bo, slot_offset);
   }
}

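/* Resetting a query only clears the availability value at the start of each
 * slot; the remaining values are overwritten the next time the query is used.
 */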
void genX(CmdResetQueryPool)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   for (uint32_t i = 0; i < queryCount; i++) {
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdm) {
         sdm.Address = (struct anv_address) {
            .bo = &pool->bo,
            .offset = (firstQuery + i) * pool->stride,
         };
         sdm.ImmediateData = 0;
      }
   }
}

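/* Indexed by the bit position of each VkQueryPipelineStatisticFlagBits flag:
 * vk_pipeline_stat_to_reg[bit] is the MMIO counter register backing that
 * statistic.
 */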
static const uint32_t vk_pipeline_stat_to_reg[] = {
   GENX(IA_VERTICES_COUNT_num),
   GENX(IA_PRIMITIVES_COUNT_num),
   GENX(VS_INVOCATION_COUNT_num),
   GENX(GS_INVOCATION_COUNT_num),
   GENX(GS_PRIMITIVES_COUNT_num),
   GENX(CL_INVOCATION_COUNT_num),
   GENX(CL_PRIMITIVES_COUNT_num),
   GENX(PS_INVOCATION_COUNT_num),
   GENX(HS_INVOCATION_COUNT_num),
   GENX(DS_INVOCATION_COUNT_num),
   GENX(CS_INVOCATION_COUNT_num),
};

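/* Snapshot one 64-bit pipeline statistics counter into the BO, low dword
 * first, using two 32-bit MI_STORE_REGISTER_MEM commands.
 */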
static void
emit_pipeline_stat(struct anv_cmd_buffer *cmd_buffer, uint32_t stat,
                   struct anv_bo *bo, uint32_t offset)
{
   STATIC_ASSERT(ANV_PIPELINE_STATISTICS_MASK ==
                 (1 << ARRAY_SIZE(vk_pipeline_stat_to_reg)) - 1);

   assert(stat < ARRAY_SIZE(vk_pipeline_stat_to_reg));
   uint32_t reg = vk_pipeline_stat_to_reg[stat];

   anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), lrm) {
      lrm.RegisterAddress = reg;
      lrm.MemoryAddress = (struct anv_address) { bo, offset };
   }
   anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), lrm) {
      lrm.RegisterAddress = reg + 4;
      lrm.MemoryAddress = (struct anv_address) { bo, offset + 4 };
   }
}

void genX(CmdBeginQuery)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query,
    VkQueryControlFlags                         flags)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_ps_depth_count(cmd_buffer, &pool->bo, query * pool->stride + 8);
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      /* TODO: This might only be necessary for certain stats */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      uint32_t statistics = pool->pipeline_statistics;
      uint32_t offset = query * pool->stride + 8;
      while (statistics) {
         uint32_t stat = u_bit_scan(&statistics);
         emit_pipeline_stat(cmd_buffer, stat, &pool->bo, offset);
         offset += 16;
      }
      break;
   }

   default:
      unreachable("");
   }
}

void genX(CmdEndQuery)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_ps_depth_count(cmd_buffer, &pool->bo, query * pool->stride + 16);
      emit_query_availability(cmd_buffer, &pool->bo, query * pool->stride);
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      /* TODO: This might only be necessary for certain stats */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      uint32_t statistics = pool->pipeline_statistics;
      uint32_t offset = query * pool->stride + 16;
      while (statistics) {
         uint32_t stat = u_bit_scan(&statistics);
         emit_pipeline_stat(cmd_buffer, stat, &pool->bo, offset);
         offset += 16;
      }

      emit_query_availability(cmd_buffer, &pool->bo, query * pool->stride);
      break;
   }

   default:
      unreachable("");
   }

   /* When multiview is active the spec requires that N consecutive query
    * indices are used, where N is the number of active views in the subpass.
    * The spec allows that we only write the results to one of the queries
    * but we still need to manage result availability for all the query
    * indices.  Since we only emit a single query for all active views in the
    * first index, mark the other query indices as being already available
    * with result 0.
    */
   if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
      const uint32_t num_queries =
         _mesa_bitcount(cmd_buffer->state.subpass->view_mask);
      if (num_queries > 1)
         emit_zero_queries(cmd_buffer, pool, query + 1, num_queries - 1);
   }
}

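/* MMIO offset of the command streamer TIMESTAMP register. */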
#define TIMESTAMP 0x2358

void genX(CmdWriteTimestamp)(
    VkCommandBuffer                             commandBuffer,
    VkPipelineStageFlagBits                     pipelineStage,
    VkQueryPool                                 queryPool,
    uint32_t                                    query)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   uint32_t offset = query * pool->stride;

   assert(pool->type == VK_QUERY_TYPE_TIMESTAMP);

   switch (pipelineStage) {
   case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
         srm.RegisterAddress = TIMESTAMP;
         srm.MemoryAddress = (struct anv_address) { &pool->bo, offset + 8 };
      }
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
         srm.RegisterAddress = TIMESTAMP + 4;
         srm.MemoryAddress = (struct anv_address) { &pool->bo, offset + 12 };
      }
      break;

   default:
      /* Everything else is bottom-of-pipe */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.DestinationAddressType = DAT_PPGTT;
         pc.PostSyncOperation = WriteTimestamp;
         pc.Address = (struct anv_address) { &pool->bo, offset + 8 };

         if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
            pc.CommandStreamerStallEnable = true;
      }
      break;
   }

   emit_query_availability(cmd_buffer, &pool->bo, offset);

   /* When multiview is active the spec requires that N consecutive query
    * indices are used, where N is the number of active views in the subpass.
    * The spec allows that we only write the results to one of the queries
    * but we still need to manage result availability for all the query
    * indices.  Since we only emit a single query for all active views in the
    * first index, mark the other query indices as being already available
    * with result 0.
    */
   if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
      const uint32_t num_queries =
         _mesa_bitcount(cmd_buffer->state.subpass->view_mask);
      if (num_queries > 1)
         emit_zero_queries(cmd_buffer, pool, query + 1, num_queries - 1);
   }
}

#if GEN_GEN > 7 || GEN_IS_HASWELL

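/* Pack a single MI_MATH ALU instruction into its 32-bit encoding. */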
static uint32_t
mi_alu(uint32_t opcode, uint32_t operand1, uint32_t operand2)
{
   struct GENX(MI_MATH_ALU_INSTRUCTION) instr = {
      .ALUOpcode = opcode,
      .Operand1 = operand1,
      .Operand2 = operand2,
   };

   uint32_t dw;
   GENX(MI_MATH_ALU_INSTRUCTION_pack)(NULL, &dw, &instr);

   return dw;
}

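/* MMIO offset of command streamer general-purpose register n; each GPR is
 * 64 bits wide (8 bytes).
 */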
#define CS_GPR(n) (0x2600 + (n) * 8)

static void
emit_load_alu_reg_u64(struct anv_batch *batch, uint32_t reg,
                      struct anv_bo *bo, uint32_t offset)
{
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
      lrm.RegisterAddress = reg;
      lrm.MemoryAddress = (struct anv_address) { bo, offset };
   }
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
      lrm.RegisterAddress = reg + 4;
      lrm.MemoryAddress = (struct anv_address) { bo, offset + 4 };
   }
}

static void
emit_load_alu_reg_imm32(struct anv_batch *batch, uint32_t reg, uint32_t imm)
{
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
      lri.RegisterOffset = reg;
      lri.DataDWord = imm;
   }
}

static void
emit_load_alu_reg_imm64(struct anv_batch *batch, uint32_t reg, uint64_t imm)
{
   emit_load_alu_reg_imm32(batch, reg, (uint32_t)imm);
   emit_load_alu_reg_imm32(batch, reg + 4, (uint32_t)(imm >> 32));
}

static void
emit_load_alu_reg_reg32(struct anv_batch *batch, uint32_t src, uint32_t dst)
{
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_REG), lrr) {
      lrr.SourceRegisterAddress = src;
      lrr.DestinationRegisterAddress = dst;
   }
}

/*
 * GPR0 = GPR0 & ((1ull << n) - 1);
 */
static void
keep_gpr0_lower_n_bits(struct anv_batch *batch, uint32_t n)
{
   assert(n < 64);
   emit_load_alu_reg_imm64(batch, CS_GPR(1), (1ull << n) - 1);

   uint32_t *dw = anv_batch_emitn(batch, 5, GENX(MI_MATH));
   if (!dw) {
      anv_batch_set_error(batch, VK_ERROR_OUT_OF_HOST_MEMORY);
      return;
   }

   dw[1] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCA, MI_ALU_REG0);
   dw[2] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCB, MI_ALU_REG1);
   dw[3] = mi_alu(MI_ALU_AND, 0, 0);
   dw[4] = mi_alu(MI_ALU_STORE, MI_ALU_REG0, MI_ALU_ACCU);
}

/*
 * GPR0 = GPR0 << 30;
 */
static void
shl_gpr0_by_30_bits(struct anv_batch *batch)
{
   /* First we mask 34 bits of GPR0 to prevent overflow */
   keep_gpr0_lower_n_bits(batch, 34);

   const uint32_t outer_count = 5;
   const uint32_t inner_count = 6;
   STATIC_ASSERT(outer_count * inner_count == 30);
   const uint32_t cmd_len = 1 + inner_count * 4;

   /* We'll emit 5 commands, each shifting GPR0 left by 6 bits, for a total of
    * 30 left shifts.
    */
   for (int o = 0; o < outer_count; o++) {
      /* Submit one MI_MATH to shift left by 6 bits */
      uint32_t *dw = anv_batch_emitn(batch, cmd_len, GENX(MI_MATH));
      if (!dw) {
         anv_batch_set_error(batch, VK_ERROR_OUT_OF_HOST_MEMORY);
         return;
      }

      dw++;
      for (int i = 0; i < inner_count; i++, dw += 4) {
         dw[0] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCA, MI_ALU_REG0);
         dw[1] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCB, MI_ALU_REG0);
         dw[2] = mi_alu(MI_ALU_ADD, 0, 0);
         dw[3] = mi_alu(MI_ALU_STORE, MI_ALU_REG0, MI_ALU_ACCU);
      }
   }
}

/*
 * GPR0 = GPR0 >> 2;
 *
 * Note that the upper 30 bits of GPR are lost!
 */
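/* Implemented as a left shift by 30 (which leaves bits [33:2] in the high
 * dword), followed by moving the high dword into the low dword and zeroing
 * the high dword.
 */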
static void
shr_gpr0_by_2_bits(struct anv_batch *batch)
{
   shl_gpr0_by_30_bits(batch);
   emit_load_alu_reg_reg32(batch, CS_GPR(0) + 4, CS_GPR(0));
   emit_load_alu_reg_imm32(batch, CS_GPR(0) + 4, 0);
}

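/* Store the query result held in a CS GPR to the destination buffer at the
 * given value index, as a 32-bit or 64-bit value depending on flags.
 */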
static void
gpu_write_query_result(struct anv_batch *batch,
                       struct anv_buffer *dst_buffer, uint32_t dst_offset,
                       VkQueryResultFlags flags,
                       uint32_t value_index, uint32_t reg)
{
   if (flags & VK_QUERY_RESULT_64_BIT)
      dst_offset += value_index * 8;
   else
      dst_offset += value_index * 4;

   anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
      srm.RegisterAddress = reg;
      srm.MemoryAddress = anv_address_add(dst_buffer->address, dst_offset);
   }

   if (flags & VK_QUERY_RESULT_64_BIT) {
      anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
         srm.RegisterAddress = reg + 4;
         srm.MemoryAddress = anv_address_add(dst_buffer->address,
                                             dst_offset + 4);
      }
   }
}

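/* Load the (begin, end) pair at the given BO offset into GPR0/GPR1 and use
 * MI_MATH to leave end - begin in dst_reg.
 */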
static void
compute_query_result(struct anv_batch *batch, uint32_t dst_reg,
                     struct anv_bo *bo, uint32_t offset)
{
   emit_load_alu_reg_u64(batch, CS_GPR(0), bo, offset);
   emit_load_alu_reg_u64(batch, CS_GPR(1), bo, offset + 8);

   /* FIXME: We need to clamp the result for 32 bit. */

   uint32_t *dw = anv_batch_emitn(batch, 5, GENX(MI_MATH));
   if (!dw) {
      anv_batch_set_error(batch, VK_ERROR_OUT_OF_HOST_MEMORY);
      return;
   }

   dw[1] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCA, MI_ALU_REG1);
   dw[2] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCB, MI_ALU_REG0);
   dw[3] = mi_alu(MI_ALU_SUB, 0, 0);
   dw[4] = mi_alu(MI_ALU_STORE, dst_reg, MI_ALU_ACCU);
}

void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    VkBuffer                                    destBuffer,
    VkDeviceSize                                destOffset,
    VkDeviceSize                                destStride,
    VkQueryResultFlags                          flags)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);
   uint32_t slot_offset;

   if (flags & VK_QUERY_RESULT_WAIT_BIT) {
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
   }

   for (uint32_t i = 0; i < queryCount; i++) {
      slot_offset = (firstQuery + i) * pool->stride;
      switch (pool->type) {
      case VK_QUERY_TYPE_OCCLUSION:
         compute_query_result(&cmd_buffer->batch, MI_ALU_REG2,
                              &pool->bo, slot_offset + 8);
         gpu_write_query_result(&cmd_buffer->batch, buffer, destOffset,
                                flags, 0, CS_GPR(2));
         break;

      case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
         uint32_t statistics = pool->pipeline_statistics;
         uint32_t idx = 0;
         while (statistics) {
            uint32_t stat = u_bit_scan(&statistics);

            compute_query_result(&cmd_buffer->batch, MI_ALU_REG0,
                                 &pool->bo, slot_offset + idx * 16 + 8);

            /* WaDividePSInvocationCountBy4:HSW,BDW */
            if ((cmd_buffer->device->info.gen == 8 ||
                 cmd_buffer->device->info.is_haswell) &&
                (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) {
               shr_gpr0_by_2_bits(&cmd_buffer->batch);
            }

            gpu_write_query_result(&cmd_buffer->batch, buffer, destOffset,
                                   flags, idx, CS_GPR(0));

            idx++;
         }
         assert(idx == _mesa_bitcount(pool->pipeline_statistics));
         break;
      }

      case VK_QUERY_TYPE_TIMESTAMP:
         emit_load_alu_reg_u64(&cmd_buffer->batch,
                               CS_GPR(2), &pool->bo, slot_offset + 8);
         gpu_write_query_result(&cmd_buffer->batch, buffer, destOffset,
                                flags, 0, CS_GPR(2));
         break;

      default:
         unreachable("unhandled query type");
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         uint32_t idx = (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) ?
                        _mesa_bitcount(pool->pipeline_statistics) : 1;

         emit_load_alu_reg_u64(&cmd_buffer->batch, CS_GPR(0),
                               &pool->bo, slot_offset);
         gpu_write_query_result(&cmd_buffer->batch, buffer, destOffset,
                                flags, idx, CS_GPR(0));
      }

      destOffset += destStride;
   }
}

#else
void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    VkBuffer                                    destBuffer,
    VkDeviceSize                                destOffset,
    VkDeviceSize                                destStride,
    VkQueryResultFlags                          flags)
{
   anv_finishme("Queries not yet supported on Ivy Bridge");
}
#endif