radv: Add occlusion query shader.
[mesa.git] / src / amd / vulkan / radv_query.c
1 /*
2 * Copyright 2016 Red Hat Inc.
3 * Based on anv:
4 * Copyright © 2015 Intel Corporation
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice (including the next
14 * paragraph) shall be included in all copies or substantial portions of the
15 * Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
23 * IN THE SOFTWARE.
24 */
25
26 #include <assert.h>
27 #include <stdbool.h>
28 #include <string.h>
29 #include <unistd.h>
30 #include <fcntl.h>
31
32 #include "nir/nir_builder.h"
33 #include "radv_meta.h"
34 #include "radv_private.h"
35 #include "radv_cs.h"
36 #include "sid.h"
37
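/* Each render backend (DB) writes its own begin/end Z-pass counter pair, so
 * occlusion query slots are laid out as 16 bytes per DB; this helper returns
 * how many DB pairs a query slot has to hold.
 */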
38 static unsigned get_max_db(struct radv_device *device)
39 {
40 unsigned num_db = device->physical_device->rad_info.num_render_backends;
41 MAYBE_UNUSED unsigned rb_mask = device->physical_device->rad_info.enabled_rb_mask;
42
43 if (device->physical_device->rad_info.chip_class == SI)
44 num_db = 8;
45 else
46 num_db = MAX2(8, num_db);
47
48 /* Otherwise we need to change the query reset procedure */
49 assert(rb_mask == ((1ull << num_db) - 1));
50
51 return num_db;
52 }
53
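/* Helper for building bounded NIR loops: breaks out of the innermost loop once
 * 'var' reaches 'count', and increments 'var' by one otherwise.
 */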
54 static void radv_break_on_count(nir_builder *b, nir_variable *var, int count)
55 {
56 nir_ssa_def *counter = nir_load_var(b, var);
57
58 nir_if *if_stmt = nir_if_create(b->shader);
59 if_stmt->condition = nir_src_for_ssa(nir_uge(b, counter, nir_imm_int(b, count)));
60 nir_cf_node_insert(b->cursor, &if_stmt->cf_node);
61
62 b->cursor = nir_after_cf_list(&if_stmt->then_list);
63
64 nir_jump_instr *instr = nir_jump_instr_create(b->shader, nir_jump_break);
65 nir_builder_instr_insert(b, &instr->instr);
66
67 b->cursor = nir_after_cf_node(&if_stmt->cf_node);
68 counter = nir_iadd(b, counter, nir_imm_int(b, 1));
69 nir_store_var(b, var, counter, 0x1);
70 }
71
72 static struct nir_ssa_def *
73 radv_load_push_int(nir_builder *b, unsigned offset, const char *name)
74 {
75 nir_intrinsic_instr *flags = nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_push_constant);
76 flags->src[0] = nir_src_for_ssa(nir_imm_int(b, offset));
77 flags->num_components = 1;
78 nir_ssa_dest_init(&flags->instr, &flags->dest, 1, 32, name);
79 nir_builder_instr_insert(b, &flags->instr);
80 return &flags->dest.ssa;
81 }
82
83 static nir_shader *
84 build_occlusion_query_shader(struct radv_device *device) {
85 /* the shader this builds is roughly
86 *
87 * push constants {
88 * uint32_t flags;
89 * uint32_t dst_stride;
90 * };
91 *
92 * uint32_t src_stride = 16 * db_count;
93 *
94 * location(binding = 0) buffer dst_buf;
95 * location(binding = 1) buffer src_buf;
96 *
97 * void main() {
98 * uint64_t result = 0;
99 * uint64_t src_offset = src_stride * global_id.x;
100 * uint64_t dst_offset = dst_stride * global_id.x;
101 * bool available = true;
102 * for (int i = 0; i < db_count; ++i) {
103 * uint64_t start = src_buf[src_offset + 16 * i];
104 * uint64_t end = src_buf[src_offset + 16 * i + 8];
105 * if ((start & (1ull << 63)) && (end & (1ull << 63)))
106 * result += end - start;
107 * else
108 * available = false;
109 * }
110 * uint32_t elem_size = flags & VK_QUERY_RESULT_64_BIT ? 8 : 4;
111 * if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) {
112 * if (flags & VK_QUERY_RESULT_64_BIT)
113 * dst_buf[dst_offset] = result;
114 * else
115 * dst_buf[dst_offset] = (uint32_t)result;
116 * }
117 * if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
118 * dst_buf[dst_offset + elem_size] = available;
119 * }
120 * }
121 */
122 nir_builder b;
123 nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_COMPUTE, NULL);
124 b.shader->info->name = ralloc_strdup(b.shader, "occlusion_query");
125 b.shader->info->cs.local_size[0] = 64;
126 b.shader->info->cs.local_size[1] = 1;
127 b.shader->info->cs.local_size[2] = 1;
128
129 nir_variable *result = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "result");
130 nir_variable *outer_counter = nir_local_variable_create(b.impl, glsl_int_type(), "outer_counter");
131 nir_variable *start = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "start");
132 nir_variable *end = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "end");
133 nir_variable *available = nir_local_variable_create(b.impl, glsl_int_type(), "available");
134 unsigned db_count = get_max_db(device);
135
136 nir_ssa_def *flags = radv_load_push_int(&b, 0, "flags");
137
138 nir_intrinsic_instr *dst_buf = nir_intrinsic_instr_create(b.shader,
139 nir_intrinsic_vulkan_resource_index);
140 dst_buf->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
141 nir_intrinsic_set_desc_set(dst_buf, 0);
142 nir_intrinsic_set_binding(dst_buf, 0);
143 nir_ssa_dest_init(&dst_buf->instr, &dst_buf->dest, 1, 32, NULL);
144 nir_builder_instr_insert(&b, &dst_buf->instr);
145
146 nir_intrinsic_instr *src_buf = nir_intrinsic_instr_create(b.shader,
147 nir_intrinsic_vulkan_resource_index);
148 src_buf->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
149 nir_intrinsic_set_desc_set(src_buf, 0);
150 nir_intrinsic_set_binding(src_buf, 1);
151 nir_ssa_dest_init(&src_buf->instr, &src_buf->dest, 1, 32, NULL);
152 nir_builder_instr_insert(&b, &src_buf->instr);
153
154 nir_ssa_def *invoc_id = nir_load_system_value(&b, nir_intrinsic_load_local_invocation_id, 0);
155 nir_ssa_def *wg_id = nir_load_system_value(&b, nir_intrinsic_load_work_group_id, 0);
156 nir_ssa_def *block_size = nir_imm_ivec4(&b,
157 b.shader->info->cs.local_size[0],
158 b.shader->info->cs.local_size[1],
159 b.shader->info->cs.local_size[2], 0);
160 nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id);
161 global_id = nir_channel(&b, global_id, 0); // We only care about x here.
162
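/* One invocation handles one query: global_id.x selects which slot of db_count
 * begin/end pairs to read and where in the destination buffer to write.
 */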
163 nir_ssa_def *input_stride = nir_imm_int(&b, db_count * 16);
164 nir_ssa_def *input_base = nir_imul(&b, input_stride, global_id);
165 nir_ssa_def *output_stride = radv_load_push_int(&b, 4, "output_stride");
166 nir_ssa_def *output_base = nir_imul(&b, output_stride, global_id);
167
168
169 nir_store_var(&b, result, nir_imm_int64(&b, 0), 0x1);
170 nir_store_var(&b, outer_counter, nir_imm_int(&b, 0), 0x1);
171 nir_store_var(&b, available, nir_imm_int(&b, 1), 0x1);
172
173 nir_loop *outer_loop = nir_loop_create(b.shader);
174 nir_builder_cf_insert(&b, &outer_loop->cf_node);
175 b.cursor = nir_after_cf_list(&outer_loop->body);
176
177 nir_ssa_def *current_outer_count = nir_load_var(&b, outer_counter);
178 radv_break_on_count(&b, outer_counter, db_count);
179
180 nir_ssa_def *load_offset = nir_imul(&b, current_outer_count, nir_imm_int(&b, 16));
181 load_offset = nir_iadd(&b, input_base, load_offset);
182
183 nir_intrinsic_instr *load = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_ssbo);
184 load->src[0] = nir_src_for_ssa(&src_buf->dest.ssa);
185 load->src[1] = nir_src_for_ssa(load_offset);
186 nir_ssa_dest_init(&load->instr, &load->dest, 2, 64, NULL);
187 load->num_components = 2;
188 nir_builder_instr_insert(&b, &load->instr);
189
190 const unsigned swizzle0[] = {0,0,0,0};
191 const unsigned swizzle1[] = {1,1,1,1};
192 nir_store_var(&b, start, nir_swizzle(&b, &load->dest.ssa, swizzle0, 1, false), 0x1);
193 nir_store_var(&b, end, nir_swizzle(&b, &load->dest.ssa, swizzle1, 1, false), 0x1);
194
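/* Bit 63 of each counter marks it as written by the DB (the same convention the
 * CPU readback path below relies on); a signed compare against 0 tests that bit.
 */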
195 nir_ssa_def *start_done = nir_ilt(&b, nir_load_var(&b, start), nir_imm_int64(&b, 0));
196 nir_ssa_def *end_done = nir_ilt(&b, nir_load_var(&b, end), nir_imm_int64(&b, 0));
197
198 nir_if *update_if = nir_if_create(b.shader);
199 update_if->condition = nir_src_for_ssa(nir_iand(&b, start_done, end_done));
200 nir_cf_node_insert(b.cursor, &update_if->cf_node);
201
202 b.cursor = nir_after_cf_list(&update_if->then_list);
203
204 nir_store_var(&b, result,
205 nir_iadd(&b, nir_load_var(&b, result),
206 nir_isub(&b, nir_load_var(&b, end),
207 nir_load_var(&b, start))), 0x1);
208
209 b.cursor = nir_after_cf_list(&update_if->else_list);
210
211 nir_store_var(&b, available, nir_imm_int(&b, 0), 0x1);
212
213 b.cursor = nir_after_cf_node(&outer_loop->cf_node);
214
215 /* Store the result if complete or if partial results have been requested. */
216
217 nir_ssa_def *result_is_64bit = nir_iand(&b, flags,
218 nir_imm_int(&b, VK_QUERY_RESULT_64_BIT));
219 nir_ssa_def *result_size = nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8), nir_imm_int(&b, 4));
220
221 nir_if *store_if = nir_if_create(b.shader);
222 store_if->condition = nir_src_for_ssa(nir_ior(&b, nir_iand(&b, flags, nir_imm_int(&b, VK_QUERY_RESULT_PARTIAL_BIT)), nir_load_var(&b, available)));
223 nir_cf_node_insert(b.cursor, &store_if->cf_node);
224
225 b.cursor = nir_after_cf_list(&store_if->then_list);
226
227 nir_if *store_64bit_if = nir_if_create(b.shader);
228 store_64bit_if->condition = nir_src_for_ssa(result_is_64bit);
229 nir_cf_node_insert(b.cursor, &store_64bit_if->cf_node);
230
231 b.cursor = nir_after_cf_list(&store_64bit_if->then_list);
232
233 nir_intrinsic_instr *store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
234 store->src[0] = nir_src_for_ssa(nir_load_var(&b, result));
235 store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa);
236 store->src[2] = nir_src_for_ssa(output_base);
237 nir_intrinsic_set_write_mask(store, 0x1);
238 store->num_components = 1;
239 nir_builder_instr_insert(&b, &store->instr);
240
241 b.cursor = nir_after_cf_list(&store_64bit_if->else_list);
242
243 store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
244 store->src[0] = nir_src_for_ssa(nir_u2u32(&b, nir_load_var(&b, result)));
245 store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa);
246 store->src[2] = nir_src_for_ssa(output_base);
247 nir_intrinsic_set_write_mask(store, 0x1);
248 store->num_components = 1;
249 nir_builder_instr_insert(&b, &store->instr);
250
251 b.cursor = nir_after_cf_node(&store_if->cf_node);
252
253 /* Store the availability bit if requested. */
254
255 nir_if *availability_if = nir_if_create(b.shader);
256 availability_if->condition = nir_src_for_ssa(nir_iand(&b, flags, nir_imm_int(&b, VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)));
257 nir_cf_node_insert(b.cursor, &availability_if->cf_node);
258
259 b.cursor = nir_after_cf_list(&availability_if->then_list);
260
261 store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
262 store->src[0] = nir_src_for_ssa(nir_load_var(&b, available));
263 store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa);
264 store->src[2] = nir_src_for_ssa(nir_iadd(&b, result_size, output_base));
265 nir_intrinsic_set_write_mask(store, 0x1);
266 store->num_components = 1;
267 nir_builder_instr_insert(&b, &store->instr);
268
269 return b.shader;
270 }
271
272 VkResult radv_device_init_meta_query_state(struct radv_device *device)
273 {
274 VkResult result;
275 struct radv_shader_module occlusion_cs = { .nir = NULL };
276
277 zero(device->meta_state.query);
278
279 occlusion_cs.nir = build_occlusion_query_shader(device);
280
281 VkDescriptorSetLayoutCreateInfo occlusion_ds_create_info = {
282 .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
283 .bindingCount = 2,
284 .pBindings = (VkDescriptorSetLayoutBinding[]) {
285 {
286 .binding = 0,
287 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
288 .descriptorCount = 1,
289 .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
290 .pImmutableSamplers = NULL
291 },
292 {
293 .binding = 1,
294 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
295 .descriptorCount = 1,
296 .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
297 .pImmutableSamplers = NULL
298 },
299 }
300 };
301
302 result = radv_CreateDescriptorSetLayout(radv_device_to_handle(device),
303 &occlusion_ds_create_info,
304 &device->meta_state.alloc,
305 &device->meta_state.query.occlusion_query_ds_layout);
306 if (result != VK_SUCCESS)
307 goto fail;
308
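/* The shader consumes two storage buffers (dst at binding 0, src at binding 1)
 * and 8 bytes of push constants: flags at offset 0 and dst_stride at offset 4.
 */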
309 VkPipelineLayoutCreateInfo occlusion_pl_create_info = {
310 .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
311 .setLayoutCount = 1,
312 .pSetLayouts = &device->meta_state.query.occlusion_query_ds_layout,
313 .pushConstantRangeCount = 1,
314 .pPushConstantRanges = &(VkPushConstantRange){VK_SHADER_STAGE_COMPUTE_BIT, 0, 8},
315 };
316
317 result = radv_CreatePipelineLayout(radv_device_to_handle(device),
318 &occlusion_pl_create_info,
319 &device->meta_state.alloc,
320 &device->meta_state.query.occlusion_query_p_layout);
321 if (result != VK_SUCCESS)
322 goto fail;
323
324 VkPipelineShaderStageCreateInfo occlusion_pipeline_shader_stage = {
325 .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
326 .stage = VK_SHADER_STAGE_COMPUTE_BIT,
327 .module = radv_shader_module_to_handle(&occlusion_cs),
328 .pName = "main",
329 .pSpecializationInfo = NULL,
330 };
331
332 VkComputePipelineCreateInfo occlusion_vk_pipeline_info = {
333 .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
334 .stage = occlusion_pipeline_shader_stage,
335 .flags = 0,
336 .layout = device->meta_state.query.occlusion_query_p_layout,
337 };
338
339 result = radv_CreateComputePipelines(radv_device_to_handle(device),
340 radv_pipeline_cache_to_handle(&device->meta_state.cache),
341 1, &occlusion_vk_pipeline_info, NULL,
342 &device->meta_state.query.occlusion_query_pipeline);
343 if (result != VK_SUCCESS)
344 goto fail;
345
346 return VK_SUCCESS;
347 fail:
348 radv_device_finish_meta_query_state(device);
349 ralloc_free(occlusion_cs.nir);
350 return result;
351 }
352
353 void radv_device_finish_meta_query_state(struct radv_device *device)
354 {
355 if (device->meta_state.query.occlusion_query_pipeline)
356 radv_DestroyPipeline(radv_device_to_handle(device),
357 device->meta_state.query.occlusion_query_pipeline,
358 &device->meta_state.alloc);
359
360 if (device->meta_state.query.occlusion_query_p_layout)
361 radv_DestroyPipelineLayout(radv_device_to_handle(device),
362 device->meta_state.query.occlusion_query_p_layout,
363 &device->meta_state.alloc);
364
365 if (device->meta_state.query.occlusion_query_ds_layout)
366 radv_DestroyDescriptorSetLayout(radv_device_to_handle(device),
367 device->meta_state.query.occlusion_query_ds_layout,
368 &device->meta_state.alloc);
369 }
370
371 static void occlusion_query_shader(struct radv_cmd_buffer *cmd_buffer,
372 struct radeon_winsys_bo *src_bo,
373 struct radeon_winsys_bo *dst_bo,
374 uint64_t src_offset, uint64_t dst_offset,
375 uint32_t dst_stride, uint32_t count,
376 uint32_t flags)
377 {
378 struct radv_device *device = cmd_buffer->device;
379 struct radv_meta_saved_compute_state saved_state;
380 unsigned stride = get_max_db(device) * 16;
381 VkDescriptorSet ds;
382
383 radv_meta_save_compute(&saved_state, cmd_buffer, 4);
384
385 radv_temp_descriptor_set_create(device, cmd_buffer,
386 device->meta_state.query.occlusion_query_ds_layout,
387 &ds);
388
389 struct radv_buffer dst_buffer = {
390 .bo = dst_bo,
391 .offset = dst_offset,
392 .size = dst_stride * count
393 };
394
395 struct radv_buffer src_buffer = {
396 .bo = src_bo,
397 .offset = src_offset,
398 .size = stride * count
399 };
400
401 radv_UpdateDescriptorSets(radv_device_to_handle(device),
402 2, /* writeCount */
403 (VkWriteDescriptorSet[]) {
404 {
405 .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
406 .dstSet = ds,
407 .dstBinding = 0,
408 .dstArrayElement = 0,
409 .descriptorCount = 1,
410 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
411 .pBufferInfo = &(VkDescriptorBufferInfo) {
412 .buffer = radv_buffer_to_handle(&dst_buffer),
413 .offset = 0,
414 .range = dst_stride * count
415 }
416 },
417 {
418 .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
419 .dstSet = ds,
420 .dstBinding = 1,
421 .dstArrayElement = 0,
422 .descriptorCount = 1,
423 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
424 .pBufferInfo = &(VkDescriptorBufferInfo) {
425 .buffer = radv_buffer_to_handle(&src_buffer),
426 .offset = 0,
427 .range = stride * count
428 }
429 }
430 }, 0, NULL);
431
432 radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer),
433 VK_PIPELINE_BIND_POINT_COMPUTE,
434 device->meta_state.query.occlusion_query_pipeline);
435
436 radv_CmdBindDescriptorSets(radv_cmd_buffer_to_handle(cmd_buffer),
437 VK_PIPELINE_BIND_POINT_COMPUTE,
438 device->meta_state.query.occlusion_query_p_layout, 0, 1,
439 &ds, 0, NULL);
440
441 struct {
442 uint32_t flags;
443 uint32_t dst_stride;
444 } push_constants = {
445 flags,
446 dst_stride
447 };
448
449 radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer),
450 device->meta_state.query.occlusion_query_p_layout,
451 VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_constants),
452 &push_constants);
453
454 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_INV_GLOBAL_L2 |
455 RADV_CMD_FLAG_INV_VMEM_L1;
456
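/* If the caller asked to wait for results, also flush the framebuffer (DB)
 * caches so that pending ZPASS writes reach memory before the shader reads them.
 */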
457 if (flags & VK_QUERY_RESULT_WAIT_BIT)
458 cmd_buffer->state.flush_bits |= RADV_CMD_FLUSH_AND_INV_FRAMEBUFFER;
459
460 radv_unaligned_dispatch(cmd_buffer, count, 1, 1);
461
462 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_INV_GLOBAL_L2 |
463 RADV_CMD_FLAG_INV_VMEM_L1 |
464 RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
465
466 radv_temp_descriptor_set_destroy(device, ds);
467
468 radv_meta_restore_compute(&saved_state, cmd_buffer, 4);
469 }
470
471 VkResult radv_CreateQueryPool(
472 VkDevice _device,
473 const VkQueryPoolCreateInfo* pCreateInfo,
474 const VkAllocationCallbacks* pAllocator,
475 VkQueryPool* pQueryPool)
476 {
477 RADV_FROM_HANDLE(radv_device, device, _device);
478 uint64_t size;
479 struct radv_query_pool *pool = vk_alloc2(&device->alloc, pAllocator,
480 sizeof(*pool), 8,
481 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
482
483 if (!pool)
484 return VK_ERROR_OUT_OF_HOST_MEMORY;
485
486
487 switch(pCreateInfo->queryType) {
488 case VK_QUERY_TYPE_OCCLUSION:
489 /* Reserve 16 bytes of scratch space at the end of each slot: the packet that
490 * sums the results writes 64 bits, but the application may only have 32 bits of space. */
491 pool->stride = 16 * get_max_db(device) + 16;
492 break;
493 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
494 pool->stride = 16 * 11;
495 break;
496 case VK_QUERY_TYPE_TIMESTAMP:
497 pool->stride = 8;
498 break;
499 default:
500 unreachable("creating unhandled query type");
501 }
502
503 pool->type = pCreateInfo->queryType;
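/* The per-query result slots are followed by a packed array of 32-bit
 * availability flags, one per query.
 */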
504 pool->availability_offset = pool->stride * pCreateInfo->queryCount;
505 size = pool->availability_offset + 4 * pCreateInfo->queryCount;
506
507 pool->bo = device->ws->buffer_create(device->ws, size,
508 64, RADEON_DOMAIN_GTT, 0);
509
510 if (!pool->bo) {
511 vk_free2(&device->alloc, pAllocator, pool);
512 return VK_ERROR_OUT_OF_DEVICE_MEMORY;
513 }
514
515 pool->ptr = device->ws->buffer_map(pool->bo);
516
517 if (!pool->ptr) {
518 device->ws->buffer_destroy(pool->bo);
519 vk_free2(&device->alloc, pAllocator, pool);
520 return VK_ERROR_OUT_OF_DEVICE_MEMORY;
521 }
522 memset(pool->ptr, 0, size);
523
524 *pQueryPool = radv_query_pool_to_handle(pool);
525 return VK_SUCCESS;
526 }
527
528 void radv_DestroyQueryPool(
529 VkDevice _device,
530 VkQueryPool _pool,
531 const VkAllocationCallbacks* pAllocator)
532 {
533 RADV_FROM_HANDLE(radv_device, device, _device);
534 RADV_FROM_HANDLE(radv_query_pool, pool, _pool);
535
536 if (!pool)
537 return;
538
539 device->ws->buffer_destroy(pool->bo);
540 vk_free2(&device->alloc, pAllocator, pool);
541 }
542
543 VkResult radv_GetQueryPoolResults(
544 VkDevice _device,
545 VkQueryPool queryPool,
546 uint32_t firstQuery,
547 uint32_t queryCount,
548 size_t dataSize,
549 void* pData,
550 VkDeviceSize stride,
551 VkQueryResultFlags flags)
552 {
553 RADV_FROM_HANDLE(radv_device, device, _device);
554 RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
555 char *data = pData;
556 VkResult result = VK_SUCCESS;
557
558 for(unsigned i = 0; i < queryCount; ++i, data += stride) {
559 char *dest = data;
560 unsigned query = firstQuery + i;
561 char *src = pool->ptr + query * pool->stride;
562 uint32_t available;
563
564 switch (pool->type) {
565 case VK_QUERY_TYPE_TIMESTAMP: {
566 if (flags & VK_QUERY_RESULT_WAIT_BIT) {
567 while(!*(volatile uint32_t*)(pool->ptr + pool->availability_offset + 4 * query))
568 ;
569 }
570
571 available = *(uint32_t*)(pool->ptr + pool->availability_offset + 4 * query);
572 if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT)) {
573 result = VK_NOT_READY;
574 break;
575
576 }
577
578 if (flags & VK_QUERY_RESULT_64_BIT) {
579 *(uint64_t*)dest = *(uint64_t*)src;
580 dest += 8;
581 } else {
582 *(uint32_t*)dest = *(uint32_t*)src;
583 dest += 4;
584 }
585 break;
586 }
587 case VK_QUERY_TYPE_OCCLUSION: {
588 volatile uint64_t const *src64 = (volatile uint64_t const *)src;
589 uint64_t sample_count = 0;
590 int db_count = get_max_db(device);
591 available = 1;
592
593 for (int i = 0; i < db_count; ++i) {
594 uint64_t start, end;
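/* If the caller asked to wait, busy-wait until both the begin and end counters
 * have bit 63 set, i.e. until the DB has written both halves of the pair.
 */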
595 do {
596 start = src64[2 * i];
597 end = src64[2 * i + 1];
598 } while ((!(start & (1ull << 63)) || !(end & (1ull << 63))) && (flags & VK_QUERY_RESULT_WAIT_BIT));
599
600 if (!(start & (1ull << 63)) || !(end & (1ull << 63)))
601 available = 0;
602 else {
603 sample_count += end - start;
604 }
605 }
606
607 if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT)) {
608 result = VK_NOT_READY;
609 break;
610 }
611 
612 if (flags & VK_QUERY_RESULT_64_BIT) {
613 *(uint64_t*)dest = sample_count;
614 dest += 8;
615 } else {
616 *(uint32_t*)dest = sample_count;
617 dest += 4;
618 }
619 break;
620 }
621 default:
622 unreachable("trying to get results of unhandled query type");
623 }
624 }
625
626 if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
627 if (flags & VK_QUERY_RESULT_64_BIT) {
628 *(uint64_t*)dest = available;
629 } else {
630 *(uint32_t*)dest = available;
631 }
632 }
633 }
634
635 return result;
636 }
637
638 void radv_CmdCopyQueryPoolResults(
639 VkCommandBuffer commandBuffer,
640 VkQueryPool queryPool,
641 uint32_t firstQuery,
642 uint32_t queryCount,
643 VkBuffer dstBuffer,
644 VkDeviceSize dstOffset,
645 VkDeviceSize stride,
646 VkQueryResultFlags flags)
647 {
648 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
649 RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
650 RADV_FROM_HANDLE(radv_buffer, dst_buffer, dstBuffer);
651 struct radeon_winsys_cs *cs = cmd_buffer->cs;
652 uint64_t va = cmd_buffer->device->ws->buffer_get_va(pool->bo);
653 uint64_t dest_va = cmd_buffer->device->ws->buffer_get_va(dst_buffer->bo);
654 dest_va += dst_buffer->offset + dstOffset;
655
656 cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, pool->bo, 8);
657 cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, dst_buffer->bo, 8);
658
659 for(unsigned i = 0; i < queryCount; ++i, dest_va += stride) {
660 unsigned query = firstQuery + i;
661 uint64_t local_src_va = va + query * pool->stride;
662 unsigned elem_size = (flags & VK_QUERY_RESULT_64_BIT) ? 8 : 4;
663
664 MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 26);
665
666 if (flags & VK_QUERY_RESULT_WAIT_BIT) {
667 /* TODO: it is unclear whether there is any case where the result would not already be available here. */
668 uint64_t avail_va = va + pool->availability_offset + 4 * query;
669
670
671 /* This waits on the ME. All copies below are done on the ME */
672 radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
673 radeon_emit(cs, WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_MEM_SPACE(1));
674 radeon_emit(cs, avail_va);
675 radeon_emit(cs, avail_va >> 32);
676 radeon_emit(cs, 1); /* reference value */
677 radeon_emit(cs, 0xffffffff); /* mask */
678 radeon_emit(cs, 4); /* poll interval */
679 }
680
681 switch (pool->type) {
682 case VK_QUERY_TYPE_OCCLUSION:
683 local_src_va += pool->stride - 16;
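/* The summed 64-bit result was written by the OCCLUSION_QUERY packet into the
 * 16-byte scratch area at the end of the slot; fall through to the common copy.
 */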
684
685 case VK_QUERY_TYPE_TIMESTAMP:
686 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
687 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_MEM) |
688 COPY_DATA_DST_SEL(COPY_DATA_MEM) |
689 ((flags & VK_QUERY_RESULT_64_BIT) ? COPY_DATA_COUNT_SEL : 0));
690 radeon_emit(cs, local_src_va);
691 radeon_emit(cs, local_src_va >> 32);
692 radeon_emit(cs, dest_va);
693 radeon_emit(cs, dest_va >> 32);
694 break;
695 default:
696 unreachable("trying to get results of unhandled query type");
697 }
698
699 /* The availability flag could still change while the data copy is in flight,
700 * which could leave us with stale data but a set ready flag. However, the
701 * availability writes also happen on the ME, so they should be ordered with
702 * the copy. This might need revisiting once multiple queues are involved.
703 */
704 if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
705 uint64_t avail_va = va + pool->availability_offset + 4 * query;
706 uint64_t avail_dest_va = dest_va;
707 if (pool->type != VK_QUERY_TYPE_PIPELINE_STATISTICS)
708 avail_dest_va += elem_size;
709 else
710 abort();
711
712 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
713 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_MEM) |
714 COPY_DATA_DST_SEL(COPY_DATA_MEM));
715 radeon_emit(cs, avail_va);
716 radeon_emit(cs, avail_va >> 32);
717 radeon_emit(cs, avail_dest_va);
718 radeon_emit(cs, avail_dest_va >> 32);
719 }
720
721 assert(cs->cdw <= cdw_max);
722 }
723
724 }
725
726 void radv_CmdResetQueryPool(
727 VkCommandBuffer commandBuffer,
728 VkQueryPool queryPool,
729 uint32_t firstQuery,
730 uint32_t queryCount)
731 {
732 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
733 RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
734 uint64_t va = cmd_buffer->device->ws->buffer_get_va(pool->bo);
735
736 cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs, pool->bo, 8);
737
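/* Clearing the pool to zero also clears bit 63 of every begin/end pair, which
 * marks the occlusion results as not yet written; the availability flags are
 * cleared separately below.
 */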
738 si_cp_dma_clear_buffer(cmd_buffer, va + firstQuery * pool->stride,
739 queryCount * pool->stride, 0);
740 si_cp_dma_clear_buffer(cmd_buffer, va + pool->availability_offset + firstQuery * 4,
741 queryCount * 4, 0);
742 }
743
744 void radv_CmdBeginQuery(
745 VkCommandBuffer commandBuffer,
746 VkQueryPool queryPool,
747 uint32_t query,
748 VkQueryControlFlags flags)
749 {
750 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
751 RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
752 struct radeon_winsys_cs *cs = cmd_buffer->cs;
753 uint64_t va = cmd_buffer->device->ws->buffer_get_va(pool->bo);
754 va += pool->stride * query;
755
756 cmd_buffer->device->ws->cs_add_buffer(cs, pool->bo, 8);
757
758 switch (pool->type) {
759 case VK_QUERY_TYPE_OCCLUSION:
760 radeon_check_space(cmd_buffer->device->ws, cs, 7);
761
762 ++cmd_buffer->state.active_occlusion_queries;
763 if (cmd_buffer->state.active_occlusion_queries == 1)
764 radv_set_db_count_control(cmd_buffer);
765
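/* ZPASS_DONE makes every DB write its current Z-pass counter to the given
 * address: the 'begin' snapshots land here, and CmdEndQuery writes the 'end'
 * snapshots at va + 8.
 */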
766 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
767 radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
768 radeon_emit(cs, va);
769 radeon_emit(cs, va >> 32);
770 break;
771 default:
772 unreachable("beginning unhandled query type");
773 }
774 }
775
776
777 void radv_CmdEndQuery(
778 VkCommandBuffer commandBuffer,
779 VkQueryPool queryPool,
780 uint32_t query)
781 {
782 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
783 RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
784 struct radeon_winsys_cs *cs = cmd_buffer->cs;
785 uint64_t va = cmd_buffer->device->ws->buffer_get_va(pool->bo);
786 uint64_t avail_va = va + pool->availability_offset + 4 * query;
787 va += pool->stride * query;
788
789 cmd_buffer->device->ws->cs_add_buffer(cs, pool->bo, 8);
790
791 switch (pool->type) {
792 case VK_QUERY_TYPE_OCCLUSION:
793 radeon_check_space(cmd_buffer->device->ws, cs, 14);
794
795 cmd_buffer->state.active_occlusion_queries--;
796 if (cmd_buffer->state.active_occlusion_queries == 0)
797 radv_set_db_count_control(cmd_buffer);
798
799 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
800 radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
801 radeon_emit(cs, va + 8);
802 radeon_emit(cs, (va + 8) >> 32);
803
804 /* The OCCLUSION_QUERY packet hangs for VK_COMMAND_BUFFER_LEVEL_SECONDARY, so only sum the results on primary command buffers. */
805 if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
806 radeon_emit(cs, PKT3(PKT3_OCCLUSION_QUERY, 3, 0));
807 radeon_emit(cs, va);
808 radeon_emit(cs, va >> 32);
809 radeon_emit(cs, va + pool->stride - 16);
810 radeon_emit(cs, (va + pool->stride - 16) >> 32);
811 }
812
813 break;
814 default:
815 unreachable("ending unhandled query type");
816 }
817
818 radeon_check_space(cmd_buffer->device->ws, cs, 5);
819
820 radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
821 radeon_emit(cs, S_370_DST_SEL(V_370_MEMORY_SYNC) |
822 S_370_WR_CONFIRM(1) |
823 S_370_ENGINE_SEL(V_370_ME));
824 radeon_emit(cs, avail_va);
825 radeon_emit(cs, avail_va >> 32);
826 radeon_emit(cs, 1);
827 }
828
829 void radv_CmdWriteTimestamp(
830 VkCommandBuffer commandBuffer,
831 VkPipelineStageFlagBits pipelineStage,
832 VkQueryPool queryPool,
833 uint32_t query)
834 {
835 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
836 RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
837 bool mec = radv_cmd_buffer_uses_mec(cmd_buffer);
838 struct radeon_winsys_cs *cs = cmd_buffer->cs;
839 uint64_t va = cmd_buffer->device->ws->buffer_get_va(pool->bo);
840 uint64_t avail_va = va + pool->availability_offset + 4 * query;
841 uint64_t query_va = va + pool->stride * query;
842
843 cmd_buffer->device->ws->cs_add_buffer(cs, pool->bo, 5);
844
845 MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 12);
846
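/* Write a bottom-of-pipe timestamp into the query slot: the '3 << 29' below is
 * the data-select field asking the CP to write the 64-bit GPU clock value.
 * Compute queues use RELEASE_MEM, graphics queues EVENT_WRITE_EOP.
 */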
847 if (mec) {
848 radeon_emit(cs, PKT3(PKT3_RELEASE_MEM, 5, 0));
849 radeon_emit(cs, EVENT_TYPE(V_028A90_BOTTOM_OF_PIPE_TS) | EVENT_INDEX(5));
850 radeon_emit(cs, 3 << 29);
851 radeon_emit(cs, query_va);
852 radeon_emit(cs, query_va >> 32);
853 radeon_emit(cs, 0);
854 radeon_emit(cs, 0);
855 } else {
856 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
857 radeon_emit(cs, EVENT_TYPE(V_028A90_BOTTOM_OF_PIPE_TS) | EVENT_INDEX(5));
858 radeon_emit(cs, query_va);
859 radeon_emit(cs, (3 << 29) | ((query_va >> 32) & 0xFFFF));
860 radeon_emit(cs, 0);
861 radeon_emit(cs, 0);
862 }
863
864 radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
865 radeon_emit(cs, S_370_DST_SEL(mec ? V_370_MEM_ASYNC : V_370_MEMORY_SYNC) |
866 S_370_WR_CONFIRM(1) |
867 S_370_ENGINE_SEL(V_370_ME));
868 radeon_emit(cs, avail_va);
869 radeon_emit(cs, avail_va >> 32);
870 radeon_emit(cs, 1);
871
872 assert(cmd_buffer->cs->cdw <= cdw_max);
873 }