anv/query: Rework store_query_result
[mesa.git] src/intel/vulkan/genX_query.c
/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <assert.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>

#include "anv_private.h"

#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"

VkResult genX(CreateQueryPool)(
    VkDevice _device,
    const VkQueryPoolCreateInfo* pCreateInfo,
    const VkAllocationCallbacks* pAllocator,
    VkQueryPool* pQueryPool)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   struct anv_query_pool *pool;
   VkResult result;

   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);

   /* Query pool slots are made up of some number of 64-bit values packed
    * tightly together. The first 64-bit value is always the "available" bit
    * which is 0 when the query is unavailable and 1 when it is available.
    * The 64-bit values that follow are determined by the type of query.
    */
   uint32_t uint64s_per_slot = 1;

   switch (pCreateInfo->queryType) {
   case VK_QUERY_TYPE_OCCLUSION:
      /* Occlusion queries have two values: begin and end. */
      uint64s_per_slot += 2;
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
      /* Timestamps just have the one timestamp value */
      uint64s_per_slot += 1;
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      return VK_ERROR_INCOMPATIBLE_DRIVER;
   default:
      assert(!"Invalid query type");
   }
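   /* Resulting slot layouts (all values 64-bit, tightly packed):
    *
    *    occlusion: { available, begin PS_DEPTH_COUNT, end PS_DEPTH_COUNT }
    *    timestamp: { available, timestamp }
    */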

   pool = vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
                    VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (pool == NULL)
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

   pool->type = pCreateInfo->queryType;
   pool->stride = uint64s_per_slot * sizeof(uint64_t);
   pool->slots = pCreateInfo->queryCount;

   uint64_t size = pool->slots * pool->stride;
   result = anv_bo_init_new(&pool->bo, device, size);
   if (result != VK_SUCCESS)
      goto fail;

   pool->bo.map = anv_gem_mmap(device, pool->bo.gem_handle, 0, size, 0);

   *pQueryPool = anv_query_pool_to_handle(pool);

   return VK_SUCCESS;

 fail:
   vk_free2(&device->alloc, pAllocator, pool);

   return result;
}

void genX(DestroyQueryPool)(
    VkDevice _device,
    VkQueryPool _pool,
    const VkAllocationCallbacks* pAllocator)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_query_pool, pool, _pool);

   if (!pool)
      return;

   anv_gem_munmap(pool->bo.map, pool->bo.size);
   anv_gem_close(device, pool->bo.gem_handle);
   vk_free2(&device->alloc, pAllocator, pool);
}

static void
cpu_write_query_result(void *dst_slot, VkQueryResultFlags flags,
                       uint32_t value_index, uint64_t result)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      uint64_t *dst64 = dst_slot;
      dst64[value_index] = result;
   } else {
      uint32_t *dst32 = dst_slot;
      dst32[value_index] = result;
   }
}
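
/* GetQueryPoolResults writes each query's values consecutively at
 * pData + i * stride. value_index 0 is the query result and, when
 * VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set, value_index 1 is the
 * availability value. For example, a 64-bit occlusion query ends up as
 * (names here are illustrative only):
 *
 *    uint64_t *dst = pData + i * stride;
 *    dst[0] = end_depth_count - begin_depth_count;
 *    dst[1] = available;
 */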

VkResult genX(GetQueryPoolResults)(
    VkDevice _device,
    VkQueryPool queryPool,
    uint32_t firstQuery,
    uint32_t queryCount,
    size_t dataSize,
    void* pData,
    VkDeviceSize stride,
    VkQueryResultFlags flags)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   int64_t timeout = INT64_MAX;
   int ret;

   assert(pool->type == VK_QUERY_TYPE_OCCLUSION ||
          pool->type == VK_QUERY_TYPE_TIMESTAMP);

   if (pData == NULL)
      return VK_SUCCESS;

   if (flags & VK_QUERY_RESULT_WAIT_BIT) {
      ret = anv_gem_wait(device, pool->bo.gem_handle, &timeout);
      if (ret == -1) {
         /* We don't know the real error. */
         return vk_errorf(VK_ERROR_OUT_OF_DEVICE_MEMORY,
                          "gem_wait failed %m");
      }
   }

   void *data_end = pData + dataSize;

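   /* On platforms without LLC the CPU mapping of the query BO is not
    * coherent with GPU writes, so invalidate the CPU cache lines covering
    * the slots we are about to read.
    */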
   if (!device->info.has_llc) {
      uint64_t offset = firstQuery * pool->stride;
      uint64_t size = queryCount * pool->stride;
      anv_invalidate_range(pool->bo.map + offset,
                           MIN2(size, pool->bo.size - offset));
   }

   VkResult status = VK_SUCCESS;
   for (uint32_t i = 0; i < queryCount; i++) {
      uint64_t *slot = pool->bo.map + (firstQuery + i) * pool->stride;

      /* Availability is always at the start of the slot */
      bool available = slot[0];

      /* From the Vulkan 1.0.42 spec:
       *
       *    "If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
       *    both not set then no result values are written to pData for
       *    queries that are in the unavailable state at the time of the call,
       *    and vkGetQueryPoolResults returns VK_NOT_READY. However,
       *    availability state is still written to pData for those queries if
       *    VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set."
       */
      bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT);

      if (write_results) {
         switch (pool->type) {
         case VK_QUERY_TYPE_OCCLUSION: {
            cpu_write_query_result(pData, flags, 0, slot[2] - slot[1]);
            break;
         }
         case VK_QUERY_TYPE_PIPELINE_STATISTICS:
            unreachable("pipeline stats not supported");
         case VK_QUERY_TYPE_TIMESTAMP: {
            cpu_write_query_result(pData, flags, 0, slot[1]);
            break;
         }
         default:
            unreachable("invalid pool type");
         }
      } else {
         status = VK_NOT_READY;
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
         cpu_write_query_result(pData, flags, 1, available);

      pData += stride;
      if (pData >= data_end)
         break;
   }

   return status;
}

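/* Emit a PIPE_CONTROL that snapshots the pipeline's PS_DEPTH_COUNT statistic
 * into the query pool BO. CmdBeginQuery stores the "begin" snapshot at slot
 * offset 8 and CmdEndQuery the "end" snapshot at offset 16; the occlusion
 * result is then end - begin.
 */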
static void
emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer,
                    struct anv_bo *bo, uint32_t offset)
{
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType = DAT_PPGTT;
      pc.PostSyncOperation = WritePSDepthCount;
      pc.DepthStallEnable = true;
      pc.Address = (struct anv_address) { bo, offset };

      if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
         pc.CommandStreamerStallEnable = true;
   }
}

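/* Mark a query slot as available with a PIPE_CONTROL post-sync write of the
 * immediate value 1 to the 64-bit availability value at the start of the
 * slot.
 */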
static void
emit_query_availability(struct anv_cmd_buffer *cmd_buffer,
                        struct anv_bo *bo, uint32_t offset)
{
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType = DAT_PPGTT;
      pc.PostSyncOperation = WriteImmediateData;
      pc.Address = (struct anv_address) { bo, offset };
      pc.ImmediateData = 1;
   }
}

void genX(CmdResetQueryPool)(
    VkCommandBuffer commandBuffer,
    VkQueryPool queryPool,
    uint32_t firstQuery,
    uint32_t queryCount)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   for (uint32_t i = 0; i < queryCount; i++) {
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdm) {
         sdm.Address = (struct anv_address) {
            .bo = &pool->bo,
            .offset = (firstQuery + i) * pool->stride,
         };
         sdm.DataDWord0 = 0;
         sdm.DataDWord1 = 0;
      }
   }
}

void genX(CmdBeginQuery)(
    VkCommandBuffer commandBuffer,
    VkQueryPool queryPool,
    uint32_t query,
    VkQueryControlFlags flags)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   /* Workaround: When meta uses the pipeline with the VS disabled, it seems
    * that the pipelining of the depth write breaks. What we see is that
    * samples from the render pass clear leak into the first query
    * immediately after the clear. Doing a PIPE_CONTROL with a post-sync
    * operation and DepthStallEnable seems to work around the issue.
    */
   if (cmd_buffer->state.need_query_wa) {
      cmd_buffer->state.need_query_wa = false;
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.DepthCacheFlushEnable = true;
         pc.DepthStallEnable = true;
      }
   }

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_ps_depth_count(cmd_buffer, &pool->bo, query * pool->stride + 8);
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   default:
      unreachable("");
   }
}

void genX(CmdEndQuery)(
    VkCommandBuffer commandBuffer,
    VkQueryPool queryPool,
    uint32_t query)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_ps_depth_count(cmd_buffer, &pool->bo, query * pool->stride + 16);
      emit_query_availability(cmd_buffer, &pool->bo, query * pool->stride);
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   default:
      unreachable("");
   }
}

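/* MMIO address of the render command streamer's TIMESTAMP register. */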
#define TIMESTAMP 0x2358

void genX(CmdWriteTimestamp)(
    VkCommandBuffer commandBuffer,
    VkPipelineStageFlagBits pipelineStage,
    VkQueryPool queryPool,
    uint32_t query)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   uint32_t offset = query * pool->stride;

   assert(pool->type == VK_QUERY_TYPE_TIMESTAMP);

   switch (pipelineStage) {
   case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
         srm.RegisterAddress = TIMESTAMP;
         srm.MemoryAddress = (struct anv_address) { &pool->bo, offset + 8 };
      }
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
         srm.RegisterAddress = TIMESTAMP + 4;
         srm.MemoryAddress = (struct anv_address) { &pool->bo, offset + 12 };
      }
      break;

   default:
      /* Everything else is bottom-of-pipe */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.DestinationAddressType = DAT_PPGTT;
         pc.PostSyncOperation = WriteTimestamp;
         pc.Address = (struct anv_address) { &pool->bo, offset + 8 };

         if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
            pc.CommandStreamerStallEnable = true;
      }
      break;
   }

   emit_query_availability(cmd_buffer, &pool->bo, offset);
}

#if GEN_GEN > 7 || GEN_IS_HASWELL

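/* Helpers for building MI_MATH ALU instructions. Each ALU instruction is a
 * single DWord with the opcode in bits 31:20, operand1 in bits 19:10 and
 * operand2 in bits 9:0.
 */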
#define alu_opcode(v)   __gen_uint((v), 20, 31)
#define alu_operand1(v) __gen_uint((v), 10, 19)
#define alu_operand2(v) __gen_uint((v), 0, 9)
#define alu(opcode, operand1, operand2) \
   alu_opcode(opcode) | alu_operand1(operand1) | alu_operand2(operand2)

#define OPCODE_NOOP     0x000
#define OPCODE_LOAD     0x080
#define OPCODE_LOADINV  0x480
#define OPCODE_LOAD0    0x081
#define OPCODE_LOAD1    0x481
#define OPCODE_ADD      0x100
#define OPCODE_SUB      0x101
#define OPCODE_AND      0x102
#define OPCODE_OR       0x103
#define OPCODE_XOR      0x104
#define OPCODE_STORE    0x180
#define OPCODE_STOREINV 0x580

#define OPERAND_R0   0x00
#define OPERAND_R1   0x01
#define OPERAND_R2   0x02
#define OPERAND_R3   0x03
#define OPERAND_R4   0x04
#define OPERAND_SRCA 0x20
#define OPERAND_SRCB 0x21
#define OPERAND_ACCU 0x31
#define OPERAND_ZF   0x32
#define OPERAND_CF   0x33

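/* The command streamer provides sixteen 64-bit general purpose registers;
 * CS_GPR(n) is the MMIO address of register n.
 */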
#define CS_GPR(n) (0x2600 + (n) * 8)

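/* Load a 64-bit value from the BO into CS GPR `reg` using two 32-bit
 * MI_LOAD_REGISTER_MEMs, one for each half of the register.
 */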
static void
emit_load_alu_reg_u64(struct anv_batch *batch, uint32_t reg,
                      struct anv_bo *bo, uint32_t offset)
{
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
      lrm.RegisterAddress = reg;
      lrm.MemoryAddress = (struct anv_address) { bo, offset };
   }
   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
      lrm.RegisterAddress = reg + 4;
      lrm.MemoryAddress = (struct anv_address) { bo, offset + 4 };
   }
}

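/* Store the 32- or 64-bit value currently in CS GPR `reg` to the given
 * result slot in the destination buffer, honoring VK_QUERY_RESULT_64_BIT.
 */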
static void
gpu_write_query_result(struct anv_batch *batch,
                       struct anv_buffer *dst_buffer, uint32_t dst_offset,
                       VkQueryResultFlags flags,
                       uint32_t value_index, uint32_t reg)
{
   if (flags & VK_QUERY_RESULT_64_BIT)
      dst_offset += value_index * 8;
   else
      dst_offset += value_index * 4;

   anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
      srm.RegisterAddress = reg;
      srm.MemoryAddress = (struct anv_address) {
         .bo = dst_buffer->bo,
         .offset = dst_buffer->offset + dst_offset,
      };
   }

   if (flags & VK_QUERY_RESULT_64_BIT) {
      anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
         srm.RegisterAddress = reg + 4;
         srm.MemoryAddress = (struct anv_address) {
            .bo = dst_buffer->bo,
            .offset = dst_buffer->offset + dst_offset + 4,
         };
      }
   }
}

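/* Compute an occlusion query result (end - begin) on the GPU with MI_MATH:
 * load begin into R0 and end into R1, subtract them through the ALU
 * accumulator, and leave the result in dst_reg.
 */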
static void
compute_query_result(struct anv_batch *batch, uint32_t dst_reg,
                     struct anv_bo *bo, uint32_t offset)
{
   emit_load_alu_reg_u64(batch, CS_GPR(0), bo, offset);
   emit_load_alu_reg_u64(batch, CS_GPR(1), bo, offset + 8);

   /* FIXME: We need to clamp the result for 32 bit. */

   uint32_t *dw = anv_batch_emitn(batch, 5, GENX(MI_MATH));
   dw[1] = alu(OPCODE_LOAD, OPERAND_SRCA, OPERAND_R1);
   dw[2] = alu(OPCODE_LOAD, OPERAND_SRCB, OPERAND_R0);
   dw[3] = alu(OPCODE_SUB, 0, 0);
   dw[4] = alu(OPCODE_STORE, dst_reg, OPERAND_ACCU);
}

void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer commandBuffer,
    VkQueryPool queryPool,
    uint32_t firstQuery,
    uint32_t queryCount,
    VkBuffer destBuffer,
    VkDeviceSize destOffset,
    VkDeviceSize destStride,
    VkQueryResultFlags flags)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);
   uint32_t slot_offset;

   if (flags & VK_QUERY_RESULT_WAIT_BIT) {
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
   }

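   /* For each query, compute or load the result into a CS GPR and store it
    * (and, if requested, the availability value) into the destination
    * buffer with the command streamer.
    */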
   for (uint32_t i = 0; i < queryCount; i++) {
      slot_offset = (firstQuery + i) * pool->stride;
      switch (pool->type) {
      case VK_QUERY_TYPE_OCCLUSION:
         compute_query_result(&cmd_buffer->batch, OPERAND_R2,
                              &pool->bo, slot_offset + 8);
         gpu_write_query_result(&cmd_buffer->batch, buffer, destOffset,
                                flags, 0, CS_GPR(2));
         break;

      case VK_QUERY_TYPE_TIMESTAMP:
         emit_load_alu_reg_u64(&cmd_buffer->batch,
                               CS_GPR(2), &pool->bo, slot_offset + 8);
         gpu_write_query_result(&cmd_buffer->batch, buffer, destOffset,
                                flags, 0, CS_GPR(2));
         break;

      default:
         unreachable("unhandled query type");
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         emit_load_alu_reg_u64(&cmd_buffer->batch, CS_GPR(0),
                               &pool->bo, slot_offset);
         gpu_write_query_result(&cmd_buffer->batch, buffer, destOffset,
                                flags, 1, CS_GPR(0));
      }

      destOffset += destStride;
   }
}

#else
void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer commandBuffer,
    VkQueryPool queryPool,
    uint32_t firstQuery,
    uint32_t queryCount,
    VkBuffer destBuffer,
    VkDeviceSize destOffset,
    VkDeviceSize destStride,
    VkQueryResultFlags flags)
{
   anv_finishme("Queries not yet supported on Ivy Bridge");
}
#endif