1 /*
 2 * Copyright 2016 Red Hat Inc.
3 * Based on anv:
4 * Copyright © 2015 Intel Corporation
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice (including the next
14 * paragraph) shall be included in all copies or substantial portions of the
15 * Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
23 * IN THE SOFTWARE.
24 */
25
26 #include <assert.h>
27 #include <stdbool.h>
28 #include <string.h>
29 #include <unistd.h>
30 #include <fcntl.h>
31
32 #include "nir/nir_builder.h"
33 #include "radv_meta.h"
34 #include "radv_private.h"
35 #include "radv_cs.h"
36 #include "sid.h"
37
38
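/* A SAMPLE_PIPELINESTAT block is 11 back-to-back 64-bit counters; each query
 * slot holds a begin block followed by an end block. The indices array maps
 * each VkQueryPipelineStatisticFlagBits bit (by bit index) to the counter
 * slot the hardware uses for that statistic.
 */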
39 static const int pipelinestat_block_size = 11 * 8;
40 static const unsigned pipeline_statistics_indices[] = {7, 6, 3, 4, 5, 2, 1, 0, 8, 9, 10};
41
42 static unsigned get_max_db(struct radv_device *device)
43 {
44 unsigned num_db = device->physical_device->rad_info.num_render_backends;
45 MAYBE_UNUSED unsigned rb_mask = device->physical_device->rad_info.enabled_rb_mask;
46
47 /* Otherwise we need to change the query reset procedure */
48 assert(rb_mask == ((1ull << num_db) - 1));
49
50 return num_db;
51 }
52
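/* Emit NIR equivalent to "if (var >= count) break; var++;". Used as the
 * loop bound check in the query resolve shaders below.
 */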
53 static void radv_break_on_count(nir_builder *b, nir_variable *var, nir_ssa_def *count)
54 {
55 nir_ssa_def *counter = nir_load_var(b, var);
56
57 nir_if *if_stmt = nir_if_create(b->shader);
58 if_stmt->condition = nir_src_for_ssa(nir_uge(b, counter, count));
59 nir_cf_node_insert(b->cursor, &if_stmt->cf_node);
60
61 b->cursor = nir_after_cf_list(&if_stmt->then_list);
62
63 nir_jump_instr *instr = nir_jump_instr_create(b->shader, nir_jump_break);
64 nir_builder_instr_insert(b, &instr->instr);
65
66 b->cursor = nir_after_cf_node(&if_stmt->cf_node);
67 counter = nir_iadd(b, counter, nir_imm_int(b, 1));
68 nir_store_var(b, var, counter, 0x1);
69 }
70
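/* Load a 32-bit value from the push constant block at the given byte offset.
 * The query resolve pipelines declare a 16-byte push constant range.
 */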
71 static struct nir_ssa_def *
72 radv_load_push_int(nir_builder *b, unsigned offset, const char *name)
73 {
74 nir_intrinsic_instr *flags = nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_push_constant);
75 nir_intrinsic_set_base(flags, 0);
76 nir_intrinsic_set_range(flags, 16);
77 flags->src[0] = nir_src_for_ssa(nir_imm_int(b, offset));
78 flags->num_components = 1;
79 nir_ssa_dest_init(&flags->instr, &flags->dest, 1, 32, name);
80 nir_builder_instr_insert(b, &flags->instr);
81 return &flags->dest.ssa;
82 }
83
84 static nir_shader *
85 build_occlusion_query_shader(struct radv_device *device) {
86 /* the shader this builds is roughly
87 *
88 * push constants {
89 * uint32_t flags;
90 * uint32_t dst_stride;
91 * };
92 *
93 * uint32_t src_stride = 16 * db_count;
94 *
95 * location(binding = 0) buffer dst_buf;
96 * location(binding = 1) buffer src_buf;
97 *
98 * void main() {
99 * uint64_t result = 0;
100 * uint64_t src_offset = src_stride * global_id.x;
101 * uint64_t dst_offset = dst_stride * global_id.x;
102 * bool available = true;
103 * for (int i = 0; i < db_count; ++i) {
104 * uint64_t start = src_buf[src_offset + 16 * i];
105 * uint64_t end = src_buf[src_offset + 16 * i + 8];
106 * if ((start & (1ull << 63)) && (end & (1ull << 63)))
107 * result += end - start;
108 * else
109 * available = false;
110 * }
111 * uint32_t elem_size = flags & VK_QUERY_RESULT_64_BIT ? 8 : 4;
112 * if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) {
113 * if (flags & VK_QUERY_RESULT_64_BIT)
114 * dst_buf[dst_offset] = result;
115 * else
 116 * dst_buf[dst_offset] = (uint32_t)result;
117 * }
118 * if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
119 * dst_buf[dst_offset + elem_size] = available;
120 * }
121 * }
122 */
123 nir_builder b;
124 nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_COMPUTE, NULL);
125 b.shader->info.name = ralloc_strdup(b.shader, "occlusion_query");
126 b.shader->info.cs.local_size[0] = 64;
127 b.shader->info.cs.local_size[1] = 1;
128 b.shader->info.cs.local_size[2] = 1;
129
130 nir_variable *result = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "result");
131 nir_variable *outer_counter = nir_local_variable_create(b.impl, glsl_int_type(), "outer_counter");
132 nir_variable *start = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "start");
133 nir_variable *end = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "end");
134 nir_variable *available = nir_local_variable_create(b.impl, glsl_int_type(), "available");
135 unsigned db_count = get_max_db(device);
136
137 nir_ssa_def *flags = radv_load_push_int(&b, 0, "flags");
138
139 nir_intrinsic_instr *dst_buf = nir_intrinsic_instr_create(b.shader,
140 nir_intrinsic_vulkan_resource_index);
141 dst_buf->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
142 nir_intrinsic_set_desc_set(dst_buf, 0);
143 nir_intrinsic_set_binding(dst_buf, 0);
144 nir_ssa_dest_init(&dst_buf->instr, &dst_buf->dest, 1, 32, NULL);
145 nir_builder_instr_insert(&b, &dst_buf->instr);
146
147 nir_intrinsic_instr *src_buf = nir_intrinsic_instr_create(b.shader,
148 nir_intrinsic_vulkan_resource_index);
149 src_buf->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
150 nir_intrinsic_set_desc_set(src_buf, 0);
151 nir_intrinsic_set_binding(src_buf, 1);
152 nir_ssa_dest_init(&src_buf->instr, &src_buf->dest, 1, 32, NULL);
153 nir_builder_instr_insert(&b, &src_buf->instr);
154
155 nir_ssa_def *invoc_id = nir_load_system_value(&b, nir_intrinsic_load_local_invocation_id, 0);
156 nir_ssa_def *wg_id = nir_load_system_value(&b, nir_intrinsic_load_work_group_id, 0);
157 nir_ssa_def *block_size = nir_imm_ivec4(&b,
158 b.shader->info.cs.local_size[0],
159 b.shader->info.cs.local_size[1],
160 b.shader->info.cs.local_size[2], 0);
161 nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id);
162 global_id = nir_channel(&b, global_id, 0); // We only care about x here.
163
164 nir_ssa_def *input_stride = nir_imm_int(&b, db_count * 16);
165 nir_ssa_def *input_base = nir_imul(&b, input_stride, global_id);
166 nir_ssa_def *output_stride = radv_load_push_int(&b, 4, "output_stride");
167 nir_ssa_def *output_base = nir_imul(&b, output_stride, global_id);
168
169
170 nir_store_var(&b, result, nir_imm_int64(&b, 0), 0x1);
171 nir_store_var(&b, outer_counter, nir_imm_int(&b, 0), 0x1);
172 nir_store_var(&b, available, nir_imm_int(&b, 1), 0x1);
173
174 nir_loop *outer_loop = nir_loop_create(b.shader);
175 nir_builder_cf_insert(&b, &outer_loop->cf_node);
176 b.cursor = nir_after_cf_list(&outer_loop->body);
177
178 nir_ssa_def *current_outer_count = nir_load_var(&b, outer_counter);
179 radv_break_on_count(&b, outer_counter, nir_imm_int(&b, db_count));
180
181 nir_ssa_def *load_offset = nir_imul(&b, current_outer_count, nir_imm_int(&b, 16));
182 load_offset = nir_iadd(&b, input_base, load_offset);
183
184 nir_intrinsic_instr *load = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_ssbo);
185 load->src[0] = nir_src_for_ssa(&src_buf->dest.ssa);
186 load->src[1] = nir_src_for_ssa(load_offset);
187 nir_ssa_dest_init(&load->instr, &load->dest, 2, 64, NULL);
188 load->num_components = 2;
189 nir_builder_instr_insert(&b, &load->instr);
190
191 nir_store_var(&b, start, nir_channel(&b, &load->dest.ssa, 0), 0x1);
192 nir_store_var(&b, end, nir_channel(&b, &load->dest.ssa, 1), 0x1);
193
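	/* The DB sets bit 63 of each 64-bit value once it has been written, so
	 * a signed "less than zero" compare tests the availability bit.
	 */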
194 nir_ssa_def *start_done = nir_ilt(&b, nir_load_var(&b, start), nir_imm_int64(&b, 0));
195 nir_ssa_def *end_done = nir_ilt(&b, nir_load_var(&b, end), nir_imm_int64(&b, 0));
196
197 nir_if *update_if = nir_if_create(b.shader);
198 update_if->condition = nir_src_for_ssa(nir_iand(&b, start_done, end_done));
199 nir_cf_node_insert(b.cursor, &update_if->cf_node);
200
201 b.cursor = nir_after_cf_list(&update_if->then_list);
202
203 nir_store_var(&b, result,
204 nir_iadd(&b, nir_load_var(&b, result),
205 nir_isub(&b, nir_load_var(&b, end),
206 nir_load_var(&b, start))), 0x1);
207
208 b.cursor = nir_after_cf_list(&update_if->else_list);
209
210 nir_store_var(&b, available, nir_imm_int(&b, 0), 0x1);
211
212 b.cursor = nir_after_cf_node(&outer_loop->cf_node);
213
214 /* Store the result if complete or if partial results have been requested. */
215
216 nir_ssa_def *result_is_64bit = nir_iand(&b, flags,
217 nir_imm_int(&b, VK_QUERY_RESULT_64_BIT));
218 nir_ssa_def *result_size = nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8), nir_imm_int(&b, 4));
219
220 nir_if *store_if = nir_if_create(b.shader);
221 store_if->condition = nir_src_for_ssa(nir_ior(&b, nir_iand(&b, flags, nir_imm_int(&b, VK_QUERY_RESULT_PARTIAL_BIT)), nir_load_var(&b, available)));
222 nir_cf_node_insert(b.cursor, &store_if->cf_node);
223
224 b.cursor = nir_after_cf_list(&store_if->then_list);
225
226 nir_if *store_64bit_if = nir_if_create(b.shader);
227 store_64bit_if->condition = nir_src_for_ssa(result_is_64bit);
228 nir_cf_node_insert(b.cursor, &store_64bit_if->cf_node);
229
230 b.cursor = nir_after_cf_list(&store_64bit_if->then_list);
231
232 nir_intrinsic_instr *store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
233 store->src[0] = nir_src_for_ssa(nir_load_var(&b, result));
234 store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa);
235 store->src[2] = nir_src_for_ssa(output_base);
236 nir_intrinsic_set_write_mask(store, 0x1);
237 store->num_components = 1;
238 nir_builder_instr_insert(&b, &store->instr);
239
240 b.cursor = nir_after_cf_list(&store_64bit_if->else_list);
241
242 store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
243 store->src[0] = nir_src_for_ssa(nir_u2u32(&b, nir_load_var(&b, result)));
244 store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa);
245 store->src[2] = nir_src_for_ssa(output_base);
246 nir_intrinsic_set_write_mask(store, 0x1);
247 store->num_components = 1;
248 nir_builder_instr_insert(&b, &store->instr);
249
250 b.cursor = nir_after_cf_node(&store_if->cf_node);
251
252 /* Store the availability bit if requested. */
253
254 nir_if *availability_if = nir_if_create(b.shader);
255 availability_if->condition = nir_src_for_ssa(nir_iand(&b, flags, nir_imm_int(&b, VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)));
256 nir_cf_node_insert(b.cursor, &availability_if->cf_node);
257
258 b.cursor = nir_after_cf_list(&availability_if->then_list);
259
260 store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
261 store->src[0] = nir_src_for_ssa(nir_load_var(&b, available));
262 store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa);
263 store->src[2] = nir_src_for_ssa(nir_iadd(&b, result_size, output_base));
264 nir_intrinsic_set_write_mask(store, 0x1);
265 store->num_components = 1;
266 nir_builder_instr_insert(&b, &store->instr);
267
268 return b.shader;
269 }
270
271 static nir_shader *
272 build_pipeline_statistics_query_shader(struct radv_device *device) {
273 /* the shader this builds is roughly
274 *
275 * push constants {
276 * uint32_t flags;
277 * uint32_t dst_stride;
278 * uint32_t stats_mask;
279 * uint32_t avail_offset;
280 * };
281 *
282 * uint32_t src_stride = pipelinestat_block_size * 2;
283 *
284 * location(binding = 0) buffer dst_buf;
285 * location(binding = 1) buffer src_buf;
286 *
287 * void main() {
288 * uint64_t src_offset = src_stride * global_id.x;
289 * uint64_t dst_base = dst_stride * global_id.x;
290 * uint64_t dst_offset = dst_base;
291 * uint32_t elem_size = flags & VK_QUERY_RESULT_64_BIT ? 8 : 4;
292 * uint32_t elem_count = stats_mask >> 16;
293 * uint32_t available = src_buf[avail_offset + 4 * global_id.x];
294 * if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
295 * dst_buf[dst_offset + elem_count * elem_size] = available;
296 * }
297 * if (available) {
298 * // repeat 11 times:
299 * if (stats_mask & (1 << 0)) {
300 * uint64_t start = src_buf[src_offset + 8 * indices[0]];
301 * uint64_t end = src_buf[src_offset + 8 * indices[0] + pipelinestat_block_size];
302 * uint64_t result = end - start;
303 * if (flags & VK_QUERY_RESULT_64_BIT)
304 * dst_buf[dst_offset] = result;
305 * else
 306 * dst_buf[dst_offset] = (uint32_t)result;
307 * dst_offset += elem_size;
308 * }
309 * } else if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
310 * // Set everything to 0 as we don't know what is valid.
311 * for (int i = 0; i < elem_count; ++i)
312 * dst_buf[dst_base + elem_size * i] = 0;
313 * }
314 * }
315 */
316 nir_builder b;
317 nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_COMPUTE, NULL);
318 b.shader->info.name = ralloc_strdup(b.shader, "pipeline_statistics_query");
319 b.shader->info.cs.local_size[0] = 64;
320 b.shader->info.cs.local_size[1] = 1;
321 b.shader->info.cs.local_size[2] = 1;
322
323 nir_variable *output_offset = nir_local_variable_create(b.impl, glsl_int_type(), "output_offset");
324
325 nir_ssa_def *flags = radv_load_push_int(&b, 0, "flags");
326 nir_ssa_def *stats_mask = radv_load_push_int(&b, 8, "stats_mask");
327 nir_ssa_def *avail_offset = radv_load_push_int(&b, 12, "avail_offset");
328
329 nir_intrinsic_instr *dst_buf = nir_intrinsic_instr_create(b.shader,
330 nir_intrinsic_vulkan_resource_index);
331 dst_buf->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
332 nir_intrinsic_set_desc_set(dst_buf, 0);
333 nir_intrinsic_set_binding(dst_buf, 0);
334 nir_ssa_dest_init(&dst_buf->instr, &dst_buf->dest, 1, 32, NULL);
335 nir_builder_instr_insert(&b, &dst_buf->instr);
336
337 nir_intrinsic_instr *src_buf = nir_intrinsic_instr_create(b.shader,
338 nir_intrinsic_vulkan_resource_index);
339 src_buf->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
340 nir_intrinsic_set_desc_set(src_buf, 0);
341 nir_intrinsic_set_binding(src_buf, 1);
342 nir_ssa_dest_init(&src_buf->instr, &src_buf->dest, 1, 32, NULL);
343 nir_builder_instr_insert(&b, &src_buf->instr);
344
345 nir_ssa_def *invoc_id = nir_load_system_value(&b, nir_intrinsic_load_local_invocation_id, 0);
346 nir_ssa_def *wg_id = nir_load_system_value(&b, nir_intrinsic_load_work_group_id, 0);
347 nir_ssa_def *block_size = nir_imm_ivec4(&b,
348 b.shader->info.cs.local_size[0],
349 b.shader->info.cs.local_size[1],
350 b.shader->info.cs.local_size[2], 0);
351 nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id);
352 global_id = nir_channel(&b, global_id, 0); // We only care about x here.
353
354 nir_ssa_def *input_stride = nir_imm_int(&b, pipelinestat_block_size * 2);
355 nir_ssa_def *input_base = nir_imul(&b, input_stride, global_id);
356 nir_ssa_def *output_stride = radv_load_push_int(&b, 4, "output_stride");
357 nir_ssa_def *output_base = nir_imul(&b, output_stride, global_id);
358
359
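	/* Each query has a 4-byte availability word at avail_offset + 4 * query. */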
360 avail_offset = nir_iadd(&b, avail_offset,
361 nir_imul(&b, global_id, nir_imm_int(&b, 4)));
362
363 nir_intrinsic_instr *load = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_ssbo);
364 load->src[0] = nir_src_for_ssa(&src_buf->dest.ssa);
365 load->src[1] = nir_src_for_ssa(avail_offset);
366 nir_ssa_dest_init(&load->instr, &load->dest, 1, 32, NULL);
367 load->num_components = 1;
368 nir_builder_instr_insert(&b, &load->instr);
369 nir_ssa_def *available = &load->dest.ssa;
370
371 nir_ssa_def *result_is_64bit = nir_iand(&b, flags,
372 nir_imm_int(&b, VK_QUERY_RESULT_64_BIT));
373 nir_ssa_def *elem_size = nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8), nir_imm_int(&b, 4));
374 nir_ssa_def *elem_count = nir_ushr(&b, stats_mask, nir_imm_int(&b, 16));
375
376 /* Store the availability bit if requested. */
377
378 nir_if *availability_if = nir_if_create(b.shader);
379 availability_if->condition = nir_src_for_ssa(nir_iand(&b, flags, nir_imm_int(&b, VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)));
380 nir_cf_node_insert(b.cursor, &availability_if->cf_node);
381
382 b.cursor = nir_after_cf_list(&availability_if->then_list);
383
384 nir_intrinsic_instr *store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
385 store->src[0] = nir_src_for_ssa(available);
386 store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa);
387 store->src[2] = nir_src_for_ssa(nir_iadd(&b, output_base, nir_imul(&b, elem_count, elem_size)));
388 nir_intrinsic_set_write_mask(store, 0x1);
389 store->num_components = 1;
390 nir_builder_instr_insert(&b, &store->instr);
391
392 b.cursor = nir_after_cf_node(&availability_if->cf_node);
393
394 nir_if *available_if = nir_if_create(b.shader);
395 available_if->condition = nir_src_for_ssa(available);
396 nir_cf_node_insert(b.cursor, &available_if->cf_node);
397
398 b.cursor = nir_after_cf_list(&available_if->then_list);
399
400 nir_store_var(&b, output_offset, output_base, 0x1);
401 for (int i = 0; i < 11; ++i) {
402 nir_if *store_if = nir_if_create(b.shader);
403 store_if->condition = nir_src_for_ssa(nir_iand(&b, stats_mask, nir_imm_int(&b, 1u << i)));
404 nir_cf_node_insert(b.cursor, &store_if->cf_node);
405
406 b.cursor = nir_after_cf_list(&store_if->then_list);
407
408 load = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_ssbo);
409 load->src[0] = nir_src_for_ssa(&src_buf->dest.ssa);
410 load->src[1] = nir_src_for_ssa(nir_iadd(&b, input_base,
411 nir_imm_int(&b, pipeline_statistics_indices[i] * 8)));
412 nir_ssa_dest_init(&load->instr, &load->dest, 1, 64, NULL);
413 load->num_components = 1;
414 nir_builder_instr_insert(&b, &load->instr);
415 nir_ssa_def *start = &load->dest.ssa;
416
417 load = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_ssbo);
418 load->src[0] = nir_src_for_ssa(&src_buf->dest.ssa);
419 load->src[1] = nir_src_for_ssa(nir_iadd(&b, input_base,
420 nir_imm_int(&b, pipeline_statistics_indices[i] * 8 + pipelinestat_block_size)));
421 nir_ssa_dest_init(&load->instr, &load->dest, 1, 64, NULL);
422 load->num_components = 1;
423 nir_builder_instr_insert(&b, &load->instr);
424 nir_ssa_def *end = &load->dest.ssa;
425
426 nir_ssa_def *result = nir_isub(&b, end, start);
427
428 /* Store result */
429 nir_if *store_64bit_if = nir_if_create(b.shader);
430 store_64bit_if->condition = nir_src_for_ssa(result_is_64bit);
431 nir_cf_node_insert(b.cursor, &store_64bit_if->cf_node);
432
433 b.cursor = nir_after_cf_list(&store_64bit_if->then_list);
434
435 nir_intrinsic_instr *store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
436 store->src[0] = nir_src_for_ssa(result);
437 store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa);
438 store->src[2] = nir_src_for_ssa(nir_load_var(&b, output_offset));
439 nir_intrinsic_set_write_mask(store, 0x1);
440 store->num_components = 1;
441 nir_builder_instr_insert(&b, &store->instr);
442
443 b.cursor = nir_after_cf_list(&store_64bit_if->else_list);
444
445 store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
446 store->src[0] = nir_src_for_ssa(nir_u2u32(&b, result));
447 store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa);
448 store->src[2] = nir_src_for_ssa(nir_load_var(&b, output_offset));
449 nir_intrinsic_set_write_mask(store, 0x1);
450 store->num_components = 1;
451 nir_builder_instr_insert(&b, &store->instr);
452
453 b.cursor = nir_after_cf_node(&store_64bit_if->cf_node);
454
455 nir_store_var(&b, output_offset,
456 nir_iadd(&b, nir_load_var(&b, output_offset),
457 elem_size), 0x1);
458
459 b.cursor = nir_after_cf_node(&store_if->cf_node);
460 }
461
462 b.cursor = nir_after_cf_list(&available_if->else_list);
463
464 available_if = nir_if_create(b.shader);
465 available_if->condition = nir_src_for_ssa(nir_iand(&b, flags,
466 nir_imm_int(&b, VK_QUERY_RESULT_PARTIAL_BIT)));
467 nir_cf_node_insert(b.cursor, &available_if->cf_node);
468
469 b.cursor = nir_after_cf_list(&available_if->then_list);
470
 471 /* Store zeros in all outputs. */
472
473 nir_variable *counter = nir_local_variable_create(b.impl, glsl_int_type(), "counter");
474 nir_store_var(&b, counter, nir_imm_int(&b, 0), 0x1);
475
476 nir_loop *loop = nir_loop_create(b.shader);
477 nir_builder_cf_insert(&b, &loop->cf_node);
478 b.cursor = nir_after_cf_list(&loop->body);
479
480 nir_ssa_def *current_counter = nir_load_var(&b, counter);
481 radv_break_on_count(&b, counter, elem_count);
482
483 nir_ssa_def *output_elem = nir_iadd(&b, output_base,
484 nir_imul(&b, elem_size, current_counter));
485
486 nir_if *store_64bit_if = nir_if_create(b.shader);
487 store_64bit_if->condition = nir_src_for_ssa(result_is_64bit);
488 nir_cf_node_insert(b.cursor, &store_64bit_if->cf_node);
489
490 b.cursor = nir_after_cf_list(&store_64bit_if->then_list);
491
492 store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
493 store->src[0] = nir_src_for_ssa(nir_imm_int64(&b, 0));
494 store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa);
495 store->src[2] = nir_src_for_ssa(output_elem);
496 nir_intrinsic_set_write_mask(store, 0x1);
497 store->num_components = 1;
498 nir_builder_instr_insert(&b, &store->instr);
499
500 b.cursor = nir_after_cf_list(&store_64bit_if->else_list);
501
502 store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
503 store->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
504 store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa);
505 store->src[2] = nir_src_for_ssa(output_elem);
506 nir_intrinsic_set_write_mask(store, 0x1);
507 store->num_components = 1;
508 nir_builder_instr_insert(&b, &store->instr);
509
510 b.cursor = nir_after_cf_node(&loop->cf_node);
511 return b.shader;
512 }
513
514 VkResult radv_device_init_meta_query_state(struct radv_device *device)
515 {
516 VkResult result;
517 struct radv_shader_module occlusion_cs = { .nir = NULL };
518 struct radv_shader_module pipeline_statistics_cs = { .nir = NULL };
519
520 occlusion_cs.nir = build_occlusion_query_shader(device);
521 pipeline_statistics_cs.nir = build_pipeline_statistics_query_shader(device);
522
523 VkDescriptorSetLayoutCreateInfo occlusion_ds_create_info = {
524 .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
525 .flags = VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR,
526 .bindingCount = 2,
527 .pBindings = (VkDescriptorSetLayoutBinding[]) {
528 {
529 .binding = 0,
530 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
531 .descriptorCount = 1,
532 .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
533 .pImmutableSamplers = NULL
534 },
535 {
536 .binding = 1,
537 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
538 .descriptorCount = 1,
539 .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
540 .pImmutableSamplers = NULL
541 },
542 }
543 };
544
545 result = radv_CreateDescriptorSetLayout(radv_device_to_handle(device),
546 &occlusion_ds_create_info,
547 &device->meta_state.alloc,
548 &device->meta_state.query.ds_layout);
549 if (result != VK_SUCCESS)
550 goto fail;
551
552 VkPipelineLayoutCreateInfo occlusion_pl_create_info = {
553 .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
554 .setLayoutCount = 1,
555 .pSetLayouts = &device->meta_state.query.ds_layout,
556 .pushConstantRangeCount = 1,
557 .pPushConstantRanges = &(VkPushConstantRange){VK_SHADER_STAGE_COMPUTE_BIT, 0, 16},
558 };
559
560 result = radv_CreatePipelineLayout(radv_device_to_handle(device),
561 &occlusion_pl_create_info,
562 &device->meta_state.alloc,
563 &device->meta_state.query.p_layout);
564 if (result != VK_SUCCESS)
565 goto fail;
566
567 VkPipelineShaderStageCreateInfo occlusion_pipeline_shader_stage = {
568 .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
569 .stage = VK_SHADER_STAGE_COMPUTE_BIT,
570 .module = radv_shader_module_to_handle(&occlusion_cs),
571 .pName = "main",
572 .pSpecializationInfo = NULL,
573 };
574
575 VkComputePipelineCreateInfo occlusion_vk_pipeline_info = {
576 .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
577 .stage = occlusion_pipeline_shader_stage,
578 .flags = 0,
579 .layout = device->meta_state.query.p_layout,
580 };
581
582 result = radv_CreateComputePipelines(radv_device_to_handle(device),
583 radv_pipeline_cache_to_handle(&device->meta_state.cache),
584 1, &occlusion_vk_pipeline_info, NULL,
585 &device->meta_state.query.occlusion_query_pipeline);
586 if (result != VK_SUCCESS)
587 goto fail;
588
589 VkPipelineShaderStageCreateInfo pipeline_statistics_pipeline_shader_stage = {
590 .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
591 .stage = VK_SHADER_STAGE_COMPUTE_BIT,
592 .module = radv_shader_module_to_handle(&pipeline_statistics_cs),
593 .pName = "main",
594 .pSpecializationInfo = NULL,
595 };
596
597 VkComputePipelineCreateInfo pipeline_statistics_vk_pipeline_info = {
598 .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
599 .stage = pipeline_statistics_pipeline_shader_stage,
600 .flags = 0,
601 .layout = device->meta_state.query.p_layout,
602 };
603
604 result = radv_CreateComputePipelines(radv_device_to_handle(device),
605 radv_pipeline_cache_to_handle(&device->meta_state.cache),
606 1, &pipeline_statistics_vk_pipeline_info, NULL,
607 &device->meta_state.query.pipeline_statistics_query_pipeline);
608
609 fail:
610 if (result != VK_SUCCESS)
611 radv_device_finish_meta_query_state(device);
612 ralloc_free(occlusion_cs.nir);
613 ralloc_free(pipeline_statistics_cs.nir);
614 return result;
615 }
616
617 void radv_device_finish_meta_query_state(struct radv_device *device)
618 {
619 if (device->meta_state.query.pipeline_statistics_query_pipeline)
620 radv_DestroyPipeline(radv_device_to_handle(device),
621 device->meta_state.query.pipeline_statistics_query_pipeline,
622 &device->meta_state.alloc);
623
624 if (device->meta_state.query.occlusion_query_pipeline)
625 radv_DestroyPipeline(radv_device_to_handle(device),
626 device->meta_state.query.occlusion_query_pipeline,
627 &device->meta_state.alloc);
628
629 if (device->meta_state.query.p_layout)
630 radv_DestroyPipelineLayout(radv_device_to_handle(device),
631 device->meta_state.query.p_layout,
632 &device->meta_state.alloc);
633
634 if (device->meta_state.query.ds_layout)
635 radv_DestroyDescriptorSetLayout(radv_device_to_handle(device),
636 device->meta_state.query.ds_layout,
637 &device->meta_state.alloc);
638 }
639
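/* Resolve query results into dst_bo using one of the compute shaders built
 * above. One shader invocation handles one query; the push constants match
 * the layout read via radv_load_push_int().
 */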
640 static void radv_query_shader(struct radv_cmd_buffer *cmd_buffer,
641 VkPipeline pipeline,
642 struct radeon_winsys_bo *src_bo,
643 struct radeon_winsys_bo *dst_bo,
644 uint64_t src_offset, uint64_t dst_offset,
645 uint32_t src_stride, uint32_t dst_stride,
646 uint32_t count, uint32_t flags,
647 uint32_t pipeline_stats_mask, uint32_t avail_offset)
648 {
649 struct radv_device *device = cmd_buffer->device;
650 struct radv_meta_saved_state saved_state;
651
652 radv_meta_save(&saved_state, cmd_buffer,
653 RADV_META_SAVE_COMPUTE_PIPELINE |
654 RADV_META_SAVE_CONSTANTS |
655 RADV_META_SAVE_DESCRIPTORS);
656
657 struct radv_buffer dst_buffer = {
658 .bo = dst_bo,
659 .offset = dst_offset,
660 .size = dst_stride * count
661 };
662
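	/* The source range has to cover both the per-query data and, for
	 * pipeline statistics queries, the availability words that follow it.
	 */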
663 struct radv_buffer src_buffer = {
664 .bo = src_bo,
665 .offset = src_offset,
666 .size = MAX2(src_stride * count, avail_offset + 4 * count - src_offset)
667 };
668
669 radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer),
670 VK_PIPELINE_BIND_POINT_COMPUTE, pipeline);
671
672 radv_meta_push_descriptor_set(cmd_buffer,
673 VK_PIPELINE_BIND_POINT_COMPUTE,
674 device->meta_state.query.p_layout,
675 0, /* set */
676 2, /* descriptorWriteCount */
677 (VkWriteDescriptorSet[]) {
678 {
679 .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
680 .dstBinding = 0,
681 .dstArrayElement = 0,
682 .descriptorCount = 1,
683 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
684 .pBufferInfo = &(VkDescriptorBufferInfo) {
685 .buffer = radv_buffer_to_handle(&dst_buffer),
686 .offset = 0,
687 .range = VK_WHOLE_SIZE
688 }
689 },
690 {
691 .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
692 .dstBinding = 1,
693 .dstArrayElement = 0,
694 .descriptorCount = 1,
695 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
696 .pBufferInfo = &(VkDescriptorBufferInfo) {
697 .buffer = radv_buffer_to_handle(&src_buffer),
698 .offset = 0,
699 .range = VK_WHOLE_SIZE
700 }
701 }
702 });
703
704 /* Encode the number of elements for easy access by the shader. */
705 pipeline_stats_mask &= 0x7ff;
706 pipeline_stats_mask |= util_bitcount(pipeline_stats_mask) << 16;
707
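	/* The shader addresses the availability words relative to the start of
	 * src_buffer, which is bound at src_offset into the pool BO.
	 */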
708 avail_offset -= src_offset;
709
710 struct {
711 uint32_t flags;
712 uint32_t dst_stride;
713 uint32_t pipeline_stats_mask;
714 uint32_t avail_offset;
715 } push_constants = {
716 flags,
717 dst_stride,
718 pipeline_stats_mask,
719 avail_offset
720 };
721
722 radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer),
723 device->meta_state.query.p_layout,
724 VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_constants),
725 &push_constants);
726
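	/* Invalidate caches so the compute shader sees the query data written
	 * by the GPU; when the caller asked to wait, also flush the DB so
	 * pending occlusion results reach memory first (assumed rationale).
	 */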
727 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_INV_GLOBAL_L2 |
728 RADV_CMD_FLAG_INV_VMEM_L1;
729
730 if (flags & VK_QUERY_RESULT_WAIT_BIT)
731 cmd_buffer->state.flush_bits |= RADV_CMD_FLUSH_AND_INV_FRAMEBUFFER;
732
733 radv_unaligned_dispatch(cmd_buffer, count, 1, 1);
734
735 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_INV_GLOBAL_L2 |
736 RADV_CMD_FLAG_INV_VMEM_L1 |
737 RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
738
739 radv_meta_restore(&saved_state, cmd_buffer);
740 }
741
742 VkResult radv_CreateQueryPool(
743 VkDevice _device,
744 const VkQueryPoolCreateInfo* pCreateInfo,
745 const VkAllocationCallbacks* pAllocator,
746 VkQueryPool* pQueryPool)
747 {
748 RADV_FROM_HANDLE(radv_device, device, _device);
749 struct radv_query_pool *pool = vk_alloc2(&device->alloc, pAllocator,
750 sizeof(*pool), 8,
751 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
752
753 if (!pool)
754 return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
755
756
757 switch(pCreateInfo->queryType) {
758 case VK_QUERY_TYPE_OCCLUSION:
759 pool->stride = 16 * get_max_db(device);
760 break;
761 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
762 pool->stride = pipelinestat_block_size * 2;
763 break;
764 case VK_QUERY_TYPE_TIMESTAMP:
765 pool->stride = 8;
766 break;
767 default:
768 unreachable("creating unhandled query type");
769 }
770
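	/* Pool layout: queryCount slots of pool->stride bytes, followed (for
	 * timestamp and pipeline statistics pools) by one 4-byte availability
	 * word per query.
	 */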
771 pool->type = pCreateInfo->queryType;
772 pool->pipeline_stats_mask = pCreateInfo->pipelineStatistics;
773 pool->availability_offset = pool->stride * pCreateInfo->queryCount;
774 pool->size = pool->availability_offset;
775 if (pCreateInfo->queryType == VK_QUERY_TYPE_TIMESTAMP ||
776 pCreateInfo->queryType == VK_QUERY_TYPE_PIPELINE_STATISTICS)
777 pool->size += 4 * pCreateInfo->queryCount;
778
779 pool->bo = device->ws->buffer_create(device->ws, pool->size,
780 64, RADEON_DOMAIN_GTT, RADEON_FLAG_NO_INTERPROCESS_SHARING);
781
782 if (!pool->bo) {
783 vk_free2(&device->alloc, pAllocator, pool);
784 return vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
785 }
786
787 pool->ptr = device->ws->buffer_map(pool->bo);
788
789 if (!pool->ptr) {
790 device->ws->buffer_destroy(pool->bo);
791 vk_free2(&device->alloc, pAllocator, pool);
792 return vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
793 }
794 memset(pool->ptr, 0, pool->size);
795
796 *pQueryPool = radv_query_pool_to_handle(pool);
797 return VK_SUCCESS;
798 }
799
800 void radv_DestroyQueryPool(
801 VkDevice _device,
802 VkQueryPool _pool,
803 const VkAllocationCallbacks* pAllocator)
804 {
805 RADV_FROM_HANDLE(radv_device, device, _device);
806 RADV_FROM_HANDLE(radv_query_pool, pool, _pool);
807
808 if (!pool)
809 return;
810
811 device->ws->buffer_destroy(pool->bo);
812 vk_free2(&device->alloc, pAllocator, pool);
813 }
814
815 VkResult radv_GetQueryPoolResults(
816 VkDevice _device,
817 VkQueryPool queryPool,
818 uint32_t firstQuery,
819 uint32_t queryCount,
820 size_t dataSize,
821 void* pData,
822 VkDeviceSize stride,
823 VkQueryResultFlags flags)
824 {
825 RADV_FROM_HANDLE(radv_device, device, _device);
826 RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
827 char *data = pData;
828 VkResult result = VK_SUCCESS;
829
830 for(unsigned i = 0; i < queryCount; ++i, data += stride) {
831 char *dest = data;
832 unsigned query = firstQuery + i;
833 char *src = pool->ptr + query * pool->stride;
834 uint32_t available;
835
836 if (pool->type != VK_QUERY_TYPE_OCCLUSION) {
837 if (flags & VK_QUERY_RESULT_WAIT_BIT)
838 while(!*(volatile uint32_t*)(pool->ptr + pool->availability_offset + 4 * query))
839 ;
840 available = *(uint32_t*)(pool->ptr + pool->availability_offset + 4 * query);
841 }
842
843 switch (pool->type) {
844 case VK_QUERY_TYPE_TIMESTAMP: {
845 if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT)) {
846 result = VK_NOT_READY;
847 break;
848
849 }
850
851 if (flags & VK_QUERY_RESULT_64_BIT) {
852 *(uint64_t*)dest = *(uint64_t*)src;
853 dest += 8;
854 } else {
855 *(uint32_t*)dest = *(uint32_t*)src;
856 dest += 4;
857 }
858 break;
859 }
860 case VK_QUERY_TYPE_OCCLUSION: {
861 volatile uint64_t const *src64 = (volatile uint64_t const *)src;
862 uint64_t sample_count = 0;
863 int db_count = get_max_db(device);
864 available = 1;
865
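		/* Sum the begin/end pairs of every render backend; bit 63 of
		 * each value is the per-RB valid bit. With WAIT_BIT, spin until
		 * both bits are set.
		 */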
866 for (int i = 0; i < db_count; ++i) {
867 uint64_t start, end;
868 do {
869 start = src64[2 * i];
870 end = src64[2 * i + 1];
871 } while ((!(start & (1ull << 63)) || !(end & (1ull << 63))) && (flags & VK_QUERY_RESULT_WAIT_BIT));
872
873 if (!(start & (1ull << 63)) || !(end & (1ull << 63)))
874 available = 0;
875 else {
876 sample_count += end - start;
877 }
878 }
879
880 if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT)) {
881 result = VK_NOT_READY;
882 break;
883
884 }
885
886 if (flags & VK_QUERY_RESULT_64_BIT) {
887 *(uint64_t*)dest = sample_count;
888 dest += 8;
889 } else {
890 *(uint32_t*)dest = sample_count;
891 dest += 4;
892 }
893 break;
894 }
895 case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
896 if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT)) {
897 result = VK_NOT_READY;
898 break;
899
900 }
901
902 const uint64_t *start = (uint64_t*)src;
903 const uint64_t *stop = (uint64_t*)(src + pipelinestat_block_size);
904 if (flags & VK_QUERY_RESULT_64_BIT) {
905 uint64_t *dst = (uint64_t*)dest;
906 dest += util_bitcount(pool->pipeline_stats_mask) * 8;
907 for(int i = 0; i < 11; ++i)
908 if(pool->pipeline_stats_mask & (1u << i))
909 *dst++ = stop[pipeline_statistics_indices[i]] -
910 start[pipeline_statistics_indices[i]];
911
912 } else {
913 uint32_t *dst = (uint32_t*)dest;
914 dest += util_bitcount(pool->pipeline_stats_mask) * 4;
915 for(int i = 0; i < 11; ++i)
916 if(pool->pipeline_stats_mask & (1u << i))
917 *dst++ = stop[pipeline_statistics_indices[i]] -
918 start[pipeline_statistics_indices[i]];
919 }
920 break;
921 }
922 default:
923 unreachable("trying to get results of unhandled query type");
924 }
925
926 if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
927 if (flags & VK_QUERY_RESULT_64_BIT) {
928 *(uint64_t*)dest = available;
929 } else {
930 *(uint32_t*)dest = available;
931 }
932 }
933 }
934
935 return result;
936 }
937
938 void radv_CmdCopyQueryPoolResults(
939 VkCommandBuffer commandBuffer,
940 VkQueryPool queryPool,
941 uint32_t firstQuery,
942 uint32_t queryCount,
943 VkBuffer dstBuffer,
944 VkDeviceSize dstOffset,
945 VkDeviceSize stride,
946 VkQueryResultFlags flags)
947 {
948 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
949 RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
950 RADV_FROM_HANDLE(radv_buffer, dst_buffer, dstBuffer);
951 struct radeon_cmdbuf *cs = cmd_buffer->cs;
952 unsigned elem_size = (flags & VK_QUERY_RESULT_64_BIT) ? 8 : 4;
953 uint64_t va = radv_buffer_get_va(pool->bo);
954 uint64_t dest_va = radv_buffer_get_va(dst_buffer->bo);
955 dest_va += dst_buffer->offset + dstOffset;
956
957 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pool->bo);
958 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, dst_buffer->bo);
959
960 switch (pool->type) {
961 case VK_QUERY_TYPE_OCCLUSION:
962 if (flags & VK_QUERY_RESULT_WAIT_BIT) {
963 for(unsigned i = 0; i < queryCount; ++i, dest_va += stride) {
964 unsigned query = firstQuery + i;
965 uint64_t src_va = va + query * pool->stride + pool->stride - 4;
966
967 /* Waits on the upper word of the last DB entry */
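			/* WAIT_REG_MEM in memory space with function ">=": poll the
			 * high dword of the last RB's end value until bit 31 (bit 63
			 * of the 64-bit result) is set. Encoding assumed from the
			 * PM4 WAIT_REG_MEM definition.
			 */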
968 radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
969 radeon_emit(cs, 5 | WAIT_REG_MEM_MEM_SPACE(1));
970 radeon_emit(cs, src_va);
971 radeon_emit(cs, src_va >> 32);
972 radeon_emit(cs, 0x80000000); /* reference value */
973 radeon_emit(cs, 0xffffffff); /* mask */
974 radeon_emit(cs, 4); /* poll interval */
975 }
976 }
977 radv_query_shader(cmd_buffer, cmd_buffer->device->meta_state.query.occlusion_query_pipeline,
978 pool->bo, dst_buffer->bo, firstQuery * pool->stride,
979 dst_buffer->offset + dstOffset,
980 get_max_db(cmd_buffer->device) * 16, stride,
981 queryCount, flags, 0, 0);
982 break;
983 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
984 if (flags & VK_QUERY_RESULT_WAIT_BIT) {
985 for(unsigned i = 0; i < queryCount; ++i, dest_va += stride) {
986 unsigned query = firstQuery + i;
987
988 radeon_check_space(cmd_buffer->device->ws, cs, 7);
989
990 uint64_t avail_va = va + pool->availability_offset + 4 * query;
991
992 /* This waits on the ME. All copies below are done on the ME */
993 si_emit_wait_fence(cs, avail_va, 1, 0xffffffff);
994 }
995 }
996 radv_query_shader(cmd_buffer, cmd_buffer->device->meta_state.query.pipeline_statistics_query_pipeline,
997 pool->bo, dst_buffer->bo, firstQuery * pool->stride,
998 dst_buffer->offset + dstOffset,
999 pipelinestat_block_size * 2, stride, queryCount, flags,
1000 pool->pipeline_stats_mask,
1001 pool->availability_offset + 4 * firstQuery);
1002 break;
1003 case VK_QUERY_TYPE_TIMESTAMP:
1004 for(unsigned i = 0; i < queryCount; ++i, dest_va += stride) {
1005 unsigned query = firstQuery + i;
1006 uint64_t local_src_va = va + query * pool->stride;
1007
1008 MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 19);
1009
1010
1011 if (flags & VK_QUERY_RESULT_WAIT_BIT) {
 1012 /* TODO: not sure there is any case where the timestamp would not already be available here. */
1013 uint64_t avail_va = va + pool->availability_offset + 4 * query;
1014
1015 /* This waits on the ME. All copies below are done on the ME */
1016 si_emit_wait_fence(cs, avail_va, 1, 0xffffffff);
1017 }
1018 if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
1019 uint64_t avail_va = va + pool->availability_offset + 4 * query;
1020 uint64_t avail_dest_va = dest_va + elem_size;
1021
1022 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
1023 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_MEM) |
1024 COPY_DATA_DST_SEL(COPY_DATA_MEM));
1025 radeon_emit(cs, avail_va);
1026 radeon_emit(cs, avail_va >> 32);
1027 radeon_emit(cs, avail_dest_va);
1028 radeon_emit(cs, avail_dest_va >> 32);
1029 }
1030
1031 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
1032 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_MEM) |
1033 COPY_DATA_DST_SEL(COPY_DATA_MEM) |
1034 ((flags & VK_QUERY_RESULT_64_BIT) ? COPY_DATA_COUNT_SEL : 0));
1035 radeon_emit(cs, local_src_va);
1036 radeon_emit(cs, local_src_va >> 32);
1037 radeon_emit(cs, dest_va);
1038 radeon_emit(cs, dest_va >> 32);
1039
1040
1041 assert(cs->cdw <= cdw_max);
1042 }
1043 break;
1044 default:
1045 unreachable("trying to get results of unhandled query type");
1046 }
1047
1048 }
1049
1050 void radv_CmdResetQueryPool(
1051 VkCommandBuffer commandBuffer,
1052 VkQueryPool queryPool,
1053 uint32_t firstQuery,
1054 uint32_t queryCount)
1055 {
1056 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
1057 RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
1058 uint32_t flush_bits = 0;
1059
1060 flush_bits |= radv_fill_buffer(cmd_buffer, pool->bo,
1061 firstQuery * pool->stride,
1062 queryCount * pool->stride, 0);
1063
1064 if (pool->type == VK_QUERY_TYPE_TIMESTAMP ||
1065 pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
1066 flush_bits |= radv_fill_buffer(cmd_buffer, pool->bo,
1067 pool->availability_offset + firstQuery * 4,
1068 queryCount * 4, 0);
1069 }
1070
1071 if (flush_bits) {
1072 /* Only need to flush caches for the compute shader path. */
1073 cmd_buffer->pending_reset_query = true;
1074 cmd_buffer->state.flush_bits |= flush_bits;
1075 }
1076 }
1077
1078 static void emit_begin_query(struct radv_cmd_buffer *cmd_buffer,
1079 uint64_t va,
1080 VkQueryType query_type,
1081 VkQueryControlFlags flags)
1082 {
1083 struct radeon_cmdbuf *cs = cmd_buffer->cs;
1084 switch (query_type) {
1085 case VK_QUERY_TYPE_OCCLUSION:
1086 radeon_check_space(cmd_buffer->device->ws, cs, 7);
1087
1088 ++cmd_buffer->state.active_occlusion_queries;
1089 if (cmd_buffer->state.active_occlusion_queries == 1) {
1090 if (flags & VK_QUERY_CONTROL_PRECISE_BIT) {
 1091 /* This is the first occlusion query; enable
 1092 * the hint if the precision bit is set.
1093 */
1094 cmd_buffer->state.perfect_occlusion_queries_enabled = true;
1095 }
1096
1097 radv_set_db_count_control(cmd_buffer);
1098 } else {
1099 if ((flags & VK_QUERY_CONTROL_PRECISE_BIT) &&
1100 !cmd_buffer->state.perfect_occlusion_queries_enabled) {
 1101 /* This is not the first query, but this one
 1102 * needs precise results, so DB_COUNT_CONTROL
 1103 * has to be updated accordingly.
1104 */
1105 cmd_buffer->state.perfect_occlusion_queries_enabled = true;
1106
1107 radv_set_db_count_control(cmd_buffer);
1108 }
1109 }
1110
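		/* ZPASS_DONE snapshots the occlusion counter of every render
		 * backend into the begin half of the query slot at va; the end
		 * values are written at va + 8 by emit_end_query().
		 */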
1111 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
1112 radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
1113 radeon_emit(cs, va);
1114 radeon_emit(cs, va >> 32);
1115 break;
1116 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1117 radeon_check_space(cmd_buffer->device->ws, cs, 4);
1118
1119 ++cmd_buffer->state.active_pipeline_queries;
1120 if (cmd_buffer->state.active_pipeline_queries == 1) {
1121 cmd_buffer->state.flush_bits &= ~RADV_CMD_FLAG_STOP_PIPELINE_STATS;
1122 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_START_PIPELINE_STATS;
1123 }
1124
1125 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
1126 radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
1127 radeon_emit(cs, va);
1128 radeon_emit(cs, va >> 32);
1129 break;
1130 default:
1131 unreachable("beginning unhandled query type");
1132 }
1133
1134 }
1135
1136 static void emit_end_query(struct radv_cmd_buffer *cmd_buffer,
1137 uint64_t va, uint64_t avail_va,
1138 VkQueryType query_type)
1139 {
1140 struct radeon_cmdbuf *cs = cmd_buffer->cs;
1141 switch (query_type) {
1142 case VK_QUERY_TYPE_OCCLUSION:
1143 radeon_check_space(cmd_buffer->device->ws, cs, 14);
1144
1145 cmd_buffer->state.active_occlusion_queries--;
1146 if (cmd_buffer->state.active_occlusion_queries == 0) {
1147 radv_set_db_count_control(cmd_buffer);
1148
1149 /* Reset the perfect occlusion queries hint now that no
1150 * queries are active.
1151 */
1152 cmd_buffer->state.perfect_occlusion_queries_enabled = false;
1153 }
1154
1155 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
1156 radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
1157 radeon_emit(cs, va + 8);
1158 radeon_emit(cs, (va + 8) >> 32);
1159
1160 break;
1161 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1162 radeon_check_space(cmd_buffer->device->ws, cs, 16);
1163
1164 cmd_buffer->state.active_pipeline_queries--;
1165 if (cmd_buffer->state.active_pipeline_queries == 0) {
1166 cmd_buffer->state.flush_bits &= ~RADV_CMD_FLAG_START_PIPELINE_STATS;
1167 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_STOP_PIPELINE_STATS;
1168 }
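		/* The end sample goes into the second half of the query slot,
		 * one pipelinestat block past the begin sample.
		 */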
1169 va += pipelinestat_block_size;
1170
1171 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
1172 radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
1173 radeon_emit(cs, va);
1174 radeon_emit(cs, va >> 32);
1175
1176 si_cs_emit_write_event_eop(cs,
1177 cmd_buffer->device->physical_device->rad_info.chip_class,
1178 radv_cmd_buffer_uses_mec(cmd_buffer),
1179 V_028A90_BOTTOM_OF_PIPE_TS, 0,
1180 EOP_DATA_SEL_VALUE_32BIT,
1181 avail_va, 0, 1,
1182 cmd_buffer->gfx9_eop_bug_va);
1183 break;
1184 default:
1185 unreachable("ending unhandled query type");
1186 }
1187 }
1188
1189 void radv_CmdBeginQuery(
1190 VkCommandBuffer commandBuffer,
1191 VkQueryPool queryPool,
1192 uint32_t query,
1193 VkQueryControlFlags flags)
1194 {
1195 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
1196 RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
1197 struct radeon_cmdbuf *cs = cmd_buffer->cs;
1198 uint64_t va = radv_buffer_get_va(pool->bo);
1199
1200 radv_cs_add_buffer(cmd_buffer->device->ws, cs, pool->bo);
1201
1202 if (cmd_buffer->pending_reset_query) {
1203 if (pool->size >= RADV_BUFFER_OPS_CS_THRESHOLD) {
1204 /* Only need to flush caches if the query pool size is
 1205 * large enough to be reset using the compute shader
1206 * path. Small pools don't need any cache flushes
1207 * because we use a CP dma clear.
1208 */
1209 si_emit_cache_flush(cmd_buffer);
1210 cmd_buffer->pending_reset_query = false;
1211 }
1212 }
1213
1214 va += pool->stride * query;
1215
1216 emit_begin_query(cmd_buffer, va, pool->type, flags);
1217 }
1218
1219
1220 void radv_CmdEndQuery(
1221 VkCommandBuffer commandBuffer,
1222 VkQueryPool queryPool,
1223 uint32_t query)
1224 {
1225 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
1226 RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
1227 uint64_t va = radv_buffer_get_va(pool->bo);
1228 uint64_t avail_va = va + pool->availability_offset + 4 * query;
1229 va += pool->stride * query;
1230
 1231 /* We do not need to add the pool BO to the list because the query must
1232 * currently be active, which means the BO is already in the list.
1233 */
1234 emit_end_query(cmd_buffer, va, avail_va, pool->type);
1235
1236 /*
 1237 * For multiview we have to emit a query for each bit in the mask;
 1238 * however, the first query we emit will get the totals for all the
1239 * operations, so we don't want to get a real value in the other
1240 * queries. This emits a fake begin/end sequence so the waiting
1241 * code gets a completed query value and doesn't hang, but the
1242 * query returns 0.
1243 */
1244 if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
1245 uint64_t avail_va = va + pool->availability_offset + 4 * query;
1246
1247
1248 for (unsigned i = 1; i < util_bitcount(cmd_buffer->state.subpass->view_mask); i++) {
1249 va += pool->stride;
1250 avail_va += 4;
1251 emit_begin_query(cmd_buffer, va, pool->type, 0);
1252 emit_end_query(cmd_buffer, va, avail_va, pool->type);
1253 }
1254 }
1255 }
1256
1257 void radv_CmdWriteTimestamp(
1258 VkCommandBuffer commandBuffer,
1259 VkPipelineStageFlagBits pipelineStage,
1260 VkQueryPool queryPool,
1261 uint32_t query)
1262 {
1263 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
1264 RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
1265 bool mec = radv_cmd_buffer_uses_mec(cmd_buffer);
1266 struct radeon_cmdbuf *cs = cmd_buffer->cs;
1267 uint64_t va = radv_buffer_get_va(pool->bo);
1268 uint64_t avail_va = va + pool->availability_offset + 4 * query;
1269 uint64_t query_va = va + pool->stride * query;
1270
1271 radv_cs_add_buffer(cmd_buffer->device->ws, cs, pool->bo);
1272
1273 int num_queries = 1;
1274 if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask)
1275 num_queries = util_bitcount(cmd_buffer->state.subpass->view_mask);
1276
1277 MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 28 * num_queries);
1278
1279 for (unsigned i = 0; i < num_queries; i++) {
1280 switch(pipelineStage) {
1281 case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
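			/* Top-of-pipe: copy the GPU clock right away with COPY_DATA
			 * on the ME and mark the query available with WRITE_DATA;
			 * every other stage uses a bottom-of-pipe EOP timestamp.
			 */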
1282 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
1283 radeon_emit(cs, COPY_DATA_COUNT_SEL | COPY_DATA_WR_CONFIRM |
1284 COPY_DATA_SRC_SEL(COPY_DATA_TIMESTAMP) |
1285 COPY_DATA_DST_SEL(V_370_MEM_ASYNC));
1286 radeon_emit(cs, 0);
1287 radeon_emit(cs, 0);
1288 radeon_emit(cs, query_va);
1289 radeon_emit(cs, query_va >> 32);
1290
1291 radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
1292 radeon_emit(cs, S_370_DST_SEL(V_370_MEM_ASYNC) |
1293 S_370_WR_CONFIRM(1) |
1294 S_370_ENGINE_SEL(V_370_ME));
1295 radeon_emit(cs, avail_va);
1296 radeon_emit(cs, avail_va >> 32);
1297 radeon_emit(cs, 1);
1298 break;
1299 default:
1300 si_cs_emit_write_event_eop(cs,
1301 cmd_buffer->device->physical_device->rad_info.chip_class,
1302 mec,
1303 V_028A90_BOTTOM_OF_PIPE_TS, 0,
1304 EOP_DATA_SEL_TIMESTAMP,
1305 query_va, 0, 0,
1306 cmd_buffer->gfx9_eop_bug_va);
1307 si_cs_emit_write_event_eop(cs,
1308 cmd_buffer->device->physical_device->rad_info.chip_class,
1309 mec,
1310 V_028A90_BOTTOM_OF_PIPE_TS, 0,
1311 EOP_DATA_SEL_VALUE_32BIT,
1312 avail_va, 0, 1,
1313 cmd_buffer->gfx9_eop_bug_va);
1314 break;
1315 }
1316 query_va += pool->stride;
1317 avail_va += 4;
1318 }
1319 assert(cmd_buffer->cs->cdw <= cdw_max);
1320 }