radv/gfx10: implement NGG GS queries
[mesa.git] / src / amd / vulkan / radv_query.c
1 /*
2 * Copyright 2016 Red Hat Inc.
3 * Based on anv:
4 * Copyright © 2015 Intel Corporation
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice (including the next
14 * paragraph) shall be included in all copies or substantial portions of the
15 * Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
23 * IN THE SOFTWARE.
24 */
25
26 #include <assert.h>
27 #include <stdbool.h>
28 #include <string.h>
29 #include <unistd.h>
30 #include <fcntl.h>
31
32 #include "nir/nir_builder.h"
33 #include "radv_meta.h"
34 #include "radv_private.h"
35 #include "radv_cs.h"
36 #include "sid.h"
37
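/* Sentinel value written into timestamp slots when a pool is reset; the GPU
 * later overwrites it with the real clock value, so anything else means the
 * result is available (see build_timestamp_query_shader() and
 * radv_GetQueryPoolResults() below).
 */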
38 #define TIMESTAMP_NOT_READY UINT64_MAX
39
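/* A pipeline-statistics query slot holds two back-to-back blocks (begin and
 * end) of eleven 64-bit counters, hence pipelinestat_block_size = 11 * 8.
 * pipeline_statistics_indices[] maps each VK_QUERY_PIPELINE_STATISTIC_* bit,
 * in ffs() order, to the counter's position inside such a block.
 */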
40 static const int pipelinestat_block_size = 11 * 8;
41 static const unsigned pipeline_statistics_indices[] = {7, 6, 3, 4, 5, 2, 1, 0, 8, 9, 10};
42
43 static unsigned
44 radv_get_pipeline_statistics_index(const VkQueryPipelineStatisticFlagBits flag)
45 {
46 int offset = ffs(flag) - 1;
47 assert(offset < ARRAY_SIZE(pipeline_statistics_indices));
48 return pipeline_statistics_indices[offset];
49 }
50
51 static nir_ssa_def *nir_test_flag(nir_builder *b, nir_ssa_def *flags, uint32_t flag)
52 {
53 return nir_i2b(b, nir_iand(b, flags, nir_imm_int(b, flag)));
54 }
55
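/* Emits NIR that is roughly:
 *
 *    if (*var >= count)
 *       break;
 *    *var = *var + 1;
 *
 * which the copy shaders below use as their loop-exit test.
 */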
56 static void radv_break_on_count(nir_builder *b, nir_variable *var, nir_ssa_def *count)
57 {
58 nir_ssa_def *counter = nir_load_var(b, var);
59
60 nir_if *if_stmt = nir_if_create(b->shader);
61 if_stmt->condition = nir_src_for_ssa(nir_uge(b, counter, count));
62 nir_cf_node_insert(b->cursor, &if_stmt->cf_node);
63
64 b->cursor = nir_after_cf_list(&if_stmt->then_list);
65
66 nir_jump_instr *instr = nir_jump_instr_create(b->shader, nir_jump_break);
67 nir_builder_instr_insert(b, &instr->instr);
68
69 b->cursor = nir_after_cf_node(&if_stmt->cf_node);
70 counter = nir_iadd(b, counter, nir_imm_int(b, 1));
71 nir_store_var(b, var, counter, 0x1);
72 }
73
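/* Loads one 32-bit dword at byte 'offset' from the 16-byte push-constant
 * block shared by the query copy shaders (flags, dst_stride, stats_mask,
 * avail_offset -- see the push_constants struct in radv_query_shader()).
 */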
74 static struct nir_ssa_def *
75 radv_load_push_int(nir_builder *b, unsigned offset, const char *name)
76 {
77 nir_intrinsic_instr *flags = nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_push_constant);
78 nir_intrinsic_set_base(flags, 0);
79 nir_intrinsic_set_range(flags, 16);
80 flags->src[0] = nir_src_for_ssa(nir_imm_int(b, offset));
81 flags->num_components = 1;
82 nir_ssa_dest_init(&flags->instr, &flags->dest, 1, 32, name);
83 nir_builder_instr_insert(b, &flags->instr);
84 return &flags->dest.ssa;
85 }
86
87 static nir_shader *
88 build_occlusion_query_shader(struct radv_device *device) {
89 /* the shader this builds is roughly
90 *
91 * push constants {
92 * uint32_t flags;
93 * uint32_t dst_stride;
94 * };
95 *
96 * uint32_t src_stride = 16 * db_count;
97 *
98 * layout(binding = 0) buffer dst_buf;
99 * layout(binding = 1) buffer src_buf;
100 *
101 * void main() {
102 * uint64_t result = 0;
103 * uint64_t src_offset = src_stride * global_id.x;
104 * uint64_t dst_offset = dst_stride * global_id.x;
105 * bool available = true;
106 * for (int i = 0; i < db_count; ++i) {
107 * if (enabled_rb_mask & (1 << i)) {
108 * uint64_t start = src_buf[src_offset + 16 * i];
109 * uint64_t end = src_buf[src_offset + 16 * i + 8];
110 * if ((start & (1ull << 63)) && (end & (1ull << 63)))
111 * result += end - start;
112 * else
113 * available = false;
114 * }
115 * }
116 * uint32_t elem_size = flags & VK_QUERY_RESULT_64_BIT ? 8 : 4;
117 * if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) {
118 * if (flags & VK_QUERY_RESULT_64_BIT)
119 * dst_buf[dst_offset] = result;
120 * else
121 * dst_buf[dst_offset] = (uint32_t)result;
122 * }
123 * if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
124 * dst_buf[dst_offset + elem_size] = available;
125 * }
126 * }
127 */
128 nir_builder b;
129 nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_COMPUTE, NULL);
130 b.shader->info.name = ralloc_strdup(b.shader, "occlusion_query");
131 b.shader->info.cs.local_size[0] = 64;
132 b.shader->info.cs.local_size[1] = 1;
133 b.shader->info.cs.local_size[2] = 1;
134
135 nir_variable *result = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "result");
136 nir_variable *outer_counter = nir_local_variable_create(b.impl, glsl_int_type(), "outer_counter");
137 nir_variable *start = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "start");
138 nir_variable *end = nir_local_variable_create(b.impl, glsl_uint64_t_type(), "end");
139 nir_variable *available = nir_local_variable_create(b.impl, glsl_bool_type(), "available");
140 unsigned enabled_rb_mask = device->physical_device->rad_info.enabled_rb_mask;
141 unsigned db_count = device->physical_device->rad_info.num_render_backends;
142
143 nir_ssa_def *flags = radv_load_push_int(&b, 0, "flags");
144
145 nir_intrinsic_instr *dst_buf = nir_intrinsic_instr_create(b.shader,
146 nir_intrinsic_vulkan_resource_index);
147 dst_buf->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
148 dst_buf->num_components = 1;
149 nir_intrinsic_set_desc_set(dst_buf, 0);
150 nir_intrinsic_set_binding(dst_buf, 0);
151 nir_ssa_dest_init(&dst_buf->instr, &dst_buf->dest, dst_buf->num_components, 32, NULL);
152 nir_builder_instr_insert(&b, &dst_buf->instr);
153
154 nir_intrinsic_instr *src_buf = nir_intrinsic_instr_create(b.shader,
155 nir_intrinsic_vulkan_resource_index);
156 src_buf->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
157 src_buf->num_components = 1;
158 nir_intrinsic_set_desc_set(src_buf, 0);
159 nir_intrinsic_set_binding(src_buf, 1);
160 nir_ssa_dest_init(&src_buf->instr, &src_buf->dest, src_buf->num_components, 32, NULL);
161 nir_builder_instr_insert(&b, &src_buf->instr);
162
163 nir_ssa_def *invoc_id = nir_load_local_invocation_id(&b);
164 nir_ssa_def *wg_id = nir_load_work_group_id(&b);
165 nir_ssa_def *block_size = nir_imm_ivec4(&b,
166 b.shader->info.cs.local_size[0],
167 b.shader->info.cs.local_size[1],
168 b.shader->info.cs.local_size[2], 0);
169 nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id);
170 global_id = nir_channel(&b, global_id, 0); // We only care about x here.
171
172 nir_ssa_def *input_stride = nir_imm_int(&b, db_count * 16);
173 nir_ssa_def *input_base = nir_imul(&b, input_stride, global_id);
174 nir_ssa_def *output_stride = radv_load_push_int(&b, 4, "output_stride");
175 nir_ssa_def *output_base = nir_imul(&b, output_stride, global_id);
176
177
178 nir_store_var(&b, result, nir_imm_int64(&b, 0), 0x1);
179 nir_store_var(&b, outer_counter, nir_imm_int(&b, 0), 0x1);
180 nir_store_var(&b, available, nir_imm_true(&b), 0x1);
181
182 nir_loop *outer_loop = nir_loop_create(b.shader);
183 nir_builder_cf_insert(&b, &outer_loop->cf_node);
184 b.cursor = nir_after_cf_list(&outer_loop->body);
185
186 nir_ssa_def *current_outer_count = nir_load_var(&b, outer_counter);
187 radv_break_on_count(&b, outer_counter, nir_imm_int(&b, db_count));
188
189 nir_ssa_def *enabled_cond =
190 nir_iand(&b, nir_imm_int(&b, enabled_rb_mask),
191 nir_ishl(&b, nir_imm_int(&b, 1), current_outer_count));
192
193 nir_if *enabled_if = nir_if_create(b.shader);
194 enabled_if->condition = nir_src_for_ssa(nir_i2b(&b, enabled_cond));
195 nir_cf_node_insert(b.cursor, &enabled_if->cf_node);
196
197 b.cursor = nir_after_cf_list(&enabled_if->then_list);
198
199 nir_ssa_def *load_offset = nir_imul(&b, current_outer_count, nir_imm_int(&b, 16));
200 load_offset = nir_iadd(&b, input_base, load_offset);
201
202 nir_intrinsic_instr *load = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_ssbo);
203 load->src[0] = nir_src_for_ssa(&src_buf->dest.ssa);
204 load->src[1] = nir_src_for_ssa(load_offset);
205 nir_ssa_dest_init(&load->instr, &load->dest, 2, 64, NULL);
206 load->num_components = 2;
207 nir_intrinsic_set_align(load, 16, 0);
208 nir_builder_instr_insert(&b, &load->instr);
209
210 nir_store_var(&b, start, nir_channel(&b, &load->dest.ssa, 0), 0x1);
211 nir_store_var(&b, end, nir_channel(&b, &load->dest.ssa, 1), 0x1);
212
213 nir_ssa_def *start_done = nir_ilt(&b, nir_load_var(&b, start), nir_imm_int64(&b, 0));
214 nir_ssa_def *end_done = nir_ilt(&b, nir_load_var(&b, end), nir_imm_int64(&b, 0));
215
216 nir_if *update_if = nir_if_create(b.shader);
217 update_if->condition = nir_src_for_ssa(nir_iand(&b, start_done, end_done));
218 nir_cf_node_insert(b.cursor, &update_if->cf_node);
219
220 b.cursor = nir_after_cf_list(&update_if->then_list);
221
222 nir_store_var(&b, result,
223 nir_iadd(&b, nir_load_var(&b, result),
224 nir_isub(&b, nir_load_var(&b, end),
225 nir_load_var(&b, start))), 0x1);
226
227 b.cursor = nir_after_cf_list(&update_if->else_list);
228
229 nir_store_var(&b, available, nir_imm_false(&b), 0x1);
230
231 b.cursor = nir_after_cf_node(&outer_loop->cf_node);
232
233 /* Store the result if complete or if partial results have been requested. */
234
235 nir_ssa_def *result_is_64bit = nir_test_flag(&b, flags, VK_QUERY_RESULT_64_BIT);
236 nir_ssa_def *result_size = nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8), nir_imm_int(&b, 4));
237
238 nir_if *store_if = nir_if_create(b.shader);
239 store_if->condition = nir_src_for_ssa(nir_ior(&b, nir_test_flag(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT), nir_load_var(&b, available)));
240 nir_cf_node_insert(b.cursor, &store_if->cf_node);
241
242 b.cursor = nir_after_cf_list(&store_if->then_list);
243
244 nir_if *store_64bit_if = nir_if_create(b.shader);
245 store_64bit_if->condition = nir_src_for_ssa(result_is_64bit);
246 nir_cf_node_insert(b.cursor, &store_64bit_if->cf_node);
247
248 b.cursor = nir_after_cf_list(&store_64bit_if->then_list);
249
250 nir_intrinsic_instr *store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
251 store->src[0] = nir_src_for_ssa(nir_load_var(&b, result));
252 store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa);
253 store->src[2] = nir_src_for_ssa(output_base);
254 nir_intrinsic_set_write_mask(store, 0x1);
255 nir_intrinsic_set_align(store, 8, 0);
256 store->num_components = 1;
257 nir_builder_instr_insert(&b, &store->instr);
258
259 b.cursor = nir_after_cf_list(&store_64bit_if->else_list);
260
261 store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
262 store->src[0] = nir_src_for_ssa(nir_u2u32(&b, nir_load_var(&b, result)));
263 store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa);
264 store->src[2] = nir_src_for_ssa(output_base);
265 nir_intrinsic_set_write_mask(store, 0x1);
266 nir_intrinsic_set_align(store, 4, 0);
267 store->num_components = 1;
268 nir_builder_instr_insert(&b, &store->instr);
269
270 b.cursor = nir_after_cf_node(&store_if->cf_node);
271
272 /* Store the availability bit if requested. */
273
274 nir_if *availability_if = nir_if_create(b.shader);
275 availability_if->condition = nir_src_for_ssa(nir_test_flag(&b, flags, VK_QUERY_RESULT_WITH_AVAILABILITY_BIT));
276 nir_cf_node_insert(b.cursor, &availability_if->cf_node);
277
278 b.cursor = nir_after_cf_list(&availability_if->then_list);
279
280 store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
281 store->src[0] = nir_src_for_ssa(nir_b2i32(&b, nir_load_var(&b, available)));
282 store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa);
283 store->src[2] = nir_src_for_ssa(nir_iadd(&b, result_size, output_base));
284 nir_intrinsic_set_write_mask(store, 0x1);
285 nir_intrinsic_set_align(store, 4, 0);
286 store->num_components = 1;
287 nir_builder_instr_insert(&b, &store->instr);
288
289 return b.shader;
290 }
291
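/* Copy shader for VK_QUERY_TYPE_PIPELINE_STATISTICS pools. Each slot holds a
 * begin and an end block of eleven 64-bit counters (2 * pipelinestat_block_size
 * bytes), and a separate array of 32-bit availability words lives at
 * avail_offset. The shader computes end - start for every counter selected in
 * stats_mask and writes the results in Vulkan bit order.
 */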
292 static nir_shader *
293 build_pipeline_statistics_query_shader(struct radv_device *device) {
294 /* the shader this builds is roughly
295 *
296 * push constants {
297 * uint32_t flags;
298 * uint32_t dst_stride;
299 * uint32_t stats_mask;
300 * uint32_t avail_offset;
301 * };
302 *
303 * uint32_t src_stride = pipelinestat_block_size * 2;
304 *
305 * layout(binding = 0) buffer dst_buf;
306 * layout(binding = 1) buffer src_buf;
307 *
308 * void main() {
309 * uint64_t src_offset = src_stride * global_id.x;
310 * uint64_t dst_base = dst_stride * global_id.x;
311 * uint64_t dst_offset = dst_base;
312 * uint32_t elem_size = flags & VK_QUERY_RESULT_64_BIT ? 8 : 4;
313 * uint32_t elem_count = stats_mask >> 16;
314 * uint32_t available32 = src_buf[avail_offset + 4 * global_id.x];
315 * if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
316 * dst_buf[dst_offset + elem_count * elem_size] = available32;
317 * }
318 * if ((bool)available32) {
319 * // repeat 11 times:
320 * if (stats_mask & (1 << 0)) {
321 * uint64_t start = src_buf[src_offset + 8 * indices[0]];
322 * uint64_t end = src_buf[src_offset + 8 * indices[0] + pipelinestat_block_size];
323 * uint64_t result = end - start;
324 * if (flags & VK_QUERY_RESULT_64_BIT)
325 * dst_buf[dst_offset] = result;
326 * else
327 * dst_buf[dst_offset] = (uint32_t)result;
328 * dst_offset += elem_size;
329 * }
330 * } else if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
331 * // Set everything to 0 as we don't know what is valid.
332 * for (int i = 0; i < elem_count; ++i)
333 * dst_buf[dst_base + elem_size * i] = 0;
334 * }
335 * }
336 */
337 nir_builder b;
338 nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_COMPUTE, NULL);
339 b.shader->info.name = ralloc_strdup(b.shader, "pipeline_statistics_query");
340 b.shader->info.cs.local_size[0] = 64;
341 b.shader->info.cs.local_size[1] = 1;
342 b.shader->info.cs.local_size[2] = 1;
343
344 nir_variable *output_offset = nir_local_variable_create(b.impl, glsl_int_type(), "output_offset");
345
346 nir_ssa_def *flags = radv_load_push_int(&b, 0, "flags");
347 nir_ssa_def *stats_mask = radv_load_push_int(&b, 8, "stats_mask");
348 nir_ssa_def *avail_offset = radv_load_push_int(&b, 12, "avail_offset");
349
350 nir_intrinsic_instr *dst_buf = nir_intrinsic_instr_create(b.shader,
351 nir_intrinsic_vulkan_resource_index);
352 dst_buf->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
353 dst_buf->num_components = 1;
354 nir_intrinsic_set_desc_set(dst_buf, 0);
355 nir_intrinsic_set_binding(dst_buf, 0);
356 nir_ssa_dest_init(&dst_buf->instr, &dst_buf->dest, dst_buf->num_components, 32, NULL);
357 nir_builder_instr_insert(&b, &dst_buf->instr);
358
359 nir_intrinsic_instr *src_buf = nir_intrinsic_instr_create(b.shader,
360 nir_intrinsic_vulkan_resource_index);
361 src_buf->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
362 src_buf->num_components = 1;
363 nir_intrinsic_set_desc_set(src_buf, 0);
364 nir_intrinsic_set_binding(src_buf, 1);
365 nir_ssa_dest_init(&src_buf->instr, &src_buf->dest, src_buf->num_components, 32, NULL);
366 nir_builder_instr_insert(&b, &src_buf->instr);
367
368 nir_ssa_def *invoc_id = nir_load_local_invocation_id(&b);
369 nir_ssa_def *wg_id = nir_load_work_group_id(&b);
370 nir_ssa_def *block_size = nir_imm_ivec4(&b,
371 b.shader->info.cs.local_size[0],
372 b.shader->info.cs.local_size[1],
373 b.shader->info.cs.local_size[2], 0);
374 nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id);
375 global_id = nir_channel(&b, global_id, 0); // We only care about x here.
376
377 nir_ssa_def *input_stride = nir_imm_int(&b, pipelinestat_block_size * 2);
378 nir_ssa_def *input_base = nir_imul(&b, input_stride, global_id);
379 nir_ssa_def *output_stride = radv_load_push_int(&b, 4, "output_stride");
380 nir_ssa_def *output_base = nir_imul(&b, output_stride, global_id);
381
382
383 avail_offset = nir_iadd(&b, avail_offset,
384 nir_imul(&b, global_id, nir_imm_int(&b, 4)));
385
386 nir_intrinsic_instr *load = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_ssbo);
387 load->src[0] = nir_src_for_ssa(&src_buf->dest.ssa);
388 load->src[1] = nir_src_for_ssa(avail_offset);
389 nir_ssa_dest_init(&load->instr, &load->dest, 1, 32, NULL);
390 load->num_components = 1;
391 nir_intrinsic_set_align(load, 4, 0);
392 nir_builder_instr_insert(&b, &load->instr);
393 nir_ssa_def *available32 = &load->dest.ssa;
394
395 nir_ssa_def *result_is_64bit = nir_test_flag(&b, flags, VK_QUERY_RESULT_64_BIT);
396 nir_ssa_def *elem_size = nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8), nir_imm_int(&b, 4));
397 nir_ssa_def *elem_count = nir_ushr(&b, stats_mask, nir_imm_int(&b, 16));
398
399 /* Store the availability bit if requested. */
400
401 nir_if *availability_if = nir_if_create(b.shader);
402 availability_if->condition = nir_src_for_ssa(nir_test_flag(&b, flags, VK_QUERY_RESULT_WITH_AVAILABILITY_BIT));
403 nir_cf_node_insert(b.cursor, &availability_if->cf_node);
404
405 b.cursor = nir_after_cf_list(&availability_if->then_list);
406
407 nir_intrinsic_instr *store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
408 store->src[0] = nir_src_for_ssa(available32);
409 store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa);
410 store->src[2] = nir_src_for_ssa(nir_iadd(&b, output_base, nir_imul(&b, elem_count, elem_size)));
411 nir_intrinsic_set_write_mask(store, 0x1);
412 nir_intrinsic_set_align(store, 4, 0);
413 store->num_components = 1;
414 nir_builder_instr_insert(&b, &store->instr);
415
416 b.cursor = nir_after_cf_node(&availability_if->cf_node);
417
418 nir_if *available_if = nir_if_create(b.shader);
419 available_if->condition = nir_src_for_ssa(nir_i2b(&b, available32));
420 nir_cf_node_insert(b.cursor, &available_if->cf_node);
421
422 b.cursor = nir_after_cf_list(&available_if->then_list);
423
424 nir_store_var(&b, output_offset, output_base, 0x1);
425 for (int i = 0; i < 11; ++i) {
426 nir_if *store_if = nir_if_create(b.shader);
427 store_if->condition = nir_src_for_ssa(nir_test_flag(&b, stats_mask, 1u << i));
428 nir_cf_node_insert(b.cursor, &store_if->cf_node);
429
430 b.cursor = nir_after_cf_list(&store_if->then_list);
431
432 load = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_ssbo);
433 load->src[0] = nir_src_for_ssa(&src_buf->dest.ssa);
434 load->src[1] = nir_src_for_ssa(nir_iadd(&b, input_base,
435 nir_imm_int(&b, pipeline_statistics_indices[i] * 8)));
436 nir_ssa_dest_init(&load->instr, &load->dest, 1, 64, NULL);
437 load->num_components = 1;
438 nir_intrinsic_set_align(load, 8, 0);
439 nir_builder_instr_insert(&b, &load->instr);
440 nir_ssa_def *start = &load->dest.ssa;
441
442 load = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_ssbo);
443 load->src[0] = nir_src_for_ssa(&src_buf->dest.ssa);
444 load->src[1] = nir_src_for_ssa(nir_iadd(&b, input_base,
445 nir_imm_int(&b, pipeline_statistics_indices[i] * 8 + pipelinestat_block_size)));
446 nir_ssa_dest_init(&load->instr, &load->dest, 1, 64, NULL);
447 load->num_components = 1;
448 nir_intrinsic_set_align(load, 8, 0);
449 nir_builder_instr_insert(&b, &load->instr);
450 nir_ssa_def *end = &load->dest.ssa;
451
452 nir_ssa_def *result = nir_isub(&b, end, start);
453
454 /* Store result */
455 nir_if *store_64bit_if = nir_if_create(b.shader);
456 store_64bit_if->condition = nir_src_for_ssa(result_is_64bit);
457 nir_cf_node_insert(b.cursor, &store_64bit_if->cf_node);
458
459 b.cursor = nir_after_cf_list(&store_64bit_if->then_list);
460
461 nir_intrinsic_instr *store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
462 store->src[0] = nir_src_for_ssa(result);
463 store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa);
464 store->src[2] = nir_src_for_ssa(nir_load_var(&b, output_offset));
465 nir_intrinsic_set_write_mask(store, 0x1);
466 nir_intrinsic_set_align(store, 8, 0);
467 store->num_components = 1;
468 nir_builder_instr_insert(&b, &store->instr);
469
470 b.cursor = nir_after_cf_list(&store_64bit_if->else_list);
471
472 store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
473 store->src[0] = nir_src_for_ssa(nir_u2u32(&b, result));
474 store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa);
475 store->src[2] = nir_src_for_ssa(nir_load_var(&b, output_offset));
476 nir_intrinsic_set_write_mask(store, 0x1);
477 nir_intrinsic_set_align(store, 4, 0);
478 store->num_components = 1;
479 nir_builder_instr_insert(&b, &store->instr);
480
481 b.cursor = nir_after_cf_node(&store_64bit_if->cf_node);
482
483 nir_store_var(&b, output_offset,
484 nir_iadd(&b, nir_load_var(&b, output_offset),
485 elem_size), 0x1);
486
487 b.cursor = nir_after_cf_node(&store_if->cf_node);
488 }
489
490 b.cursor = nir_after_cf_list(&available_if->else_list);
491
492 available_if = nir_if_create(b.shader);
493 available_if->condition = nir_src_for_ssa(nir_test_flag(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT));
494 nir_cf_node_insert(b.cursor, &available_if->cf_node);
495
496 b.cursor = nir_after_cf_list(&available_if->then_list);
497
498 /* Stores zeros in all outputs. */
499
500 nir_variable *counter = nir_local_variable_create(b.impl, glsl_int_type(), "counter");
501 nir_store_var(&b, counter, nir_imm_int(&b, 0), 0x1);
502
503 nir_loop *loop = nir_loop_create(b.shader);
504 nir_builder_cf_insert(&b, &loop->cf_node);
505 b.cursor = nir_after_cf_list(&loop->body);
506
507 nir_ssa_def *current_counter = nir_load_var(&b, counter);
508 radv_break_on_count(&b, counter, elem_count);
509
510 nir_ssa_def *output_elem = nir_iadd(&b, output_base,
511 nir_imul(&b, elem_size, current_counter));
512
513 nir_if *store_64bit_if = nir_if_create(b.shader);
514 store_64bit_if->condition = nir_src_for_ssa(result_is_64bit);
515 nir_cf_node_insert(b.cursor, &store_64bit_if->cf_node);
516
517 b.cursor = nir_after_cf_list(&store_64bit_if->then_list);
518
519 store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
520 store->src[0] = nir_src_for_ssa(nir_imm_int64(&b, 0));
521 store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa);
522 store->src[2] = nir_src_for_ssa(output_elem);
523 nir_intrinsic_set_write_mask(store, 0x1);
524 nir_intrinsic_set_align(store, 8, 0);
525 store->num_components = 1;
526 nir_builder_instr_insert(&b, &store->instr);
527
528 b.cursor = nir_after_cf_list(&store_64bit_if->else_list);
529
530 store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
531 store->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
532 store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa);
533 store->src[2] = nir_src_for_ssa(output_elem);
534 nir_intrinsic_set_write_mask(store, 0x1);
535 nir_intrinsic_set_align(store, 4, 0);
536 store->num_components = 1;
537 nir_builder_instr_insert(&b, &store->instr);
538
539 b.cursor = nir_after_cf_node(&loop->cf_node);
540 return b.shader;
541 }
542
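/* Copy shader for VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT pools. Each
 * 32-byte slot holds begin and end SAMPLE_STREAMOUTSTATS results (two 64-bit
 * counters each); bit 63 of every counter is set once the sample has landed,
 * which is what the availability test below checks.
 */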
543 static nir_shader *
544 build_tfb_query_shader(struct radv_device *device)
545 {
546 /* the shader this builds is roughly
547 *
548 * uint32_t src_stride = 32;
549 *
550 * layout(binding = 0) buffer dst_buf;
551 * layout(binding = 1) buffer src_buf;
552 *
553 * void main() {
554 * uint64_t result[2] = {};
555 * bool available = false;
556 * uint64_t src_offset = src_stride * global_id.x;
557 * uint64_t dst_offset = dst_stride * global_id.x;
558 * uint64_t *src_data = src_buf[src_offset];
559 * uint32_t avail = (src_data[0] >> 32) &
560 * (src_data[1] >> 32) &
561 * (src_data[2] >> 32) &
562 * (src_data[3] >> 32);
563 * if (avail & 0x80000000) {
564 * result[0] = src_data[3] - src_data[1];
565 * result[1] = src_data[2] - src_data[0];
566 * available = true;
567 * }
568 * uint32_t result_size = flags & VK_QUERY_RESULT_64_BIT ? 16 : 8;
569 * if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) {
570 * if (flags & VK_QUERY_RESULT_64_BIT) {
571 * dst_buf[dst_offset] = result;
572 * } else {
573 * dst_buf[dst_offset] = (uint32_t)result;
574 * }
575 * }
576 * if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
577 * dst_buf[dst_offset + result_size] = available;
578 * }
579 * }
580 */
581 nir_builder b;
582 nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_COMPUTE, NULL);
583 b.shader->info.name = ralloc_strdup(b.shader, "tfb_query");
584 b.shader->info.cs.local_size[0] = 64;
585 b.shader->info.cs.local_size[1] = 1;
586 b.shader->info.cs.local_size[2] = 1;
587
588 /* Create and initialize local variables. */
589 nir_variable *result =
590 nir_local_variable_create(b.impl,
591 glsl_vector_type(GLSL_TYPE_UINT64, 2),
592 "result");
593 nir_variable *available =
594 nir_local_variable_create(b.impl, glsl_bool_type(), "available");
595
596 nir_store_var(&b, result,
597 nir_vec2(&b, nir_imm_int64(&b, 0),
598 nir_imm_int64(&b, 0)), 0x3);
599 nir_store_var(&b, available, nir_imm_false(&b), 0x1);
600
601 nir_ssa_def *flags = radv_load_push_int(&b, 0, "flags");
602
603 /* Load resources. */
604 nir_intrinsic_instr *dst_buf = nir_intrinsic_instr_create(b.shader,
605 nir_intrinsic_vulkan_resource_index);
606 dst_buf->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
607 dst_buf->num_components = 1;
608 nir_intrinsic_set_desc_set(dst_buf, 0);
609 nir_intrinsic_set_binding(dst_buf, 0);
610 nir_ssa_dest_init(&dst_buf->instr, &dst_buf->dest, dst_buf->num_components, 32, NULL);
611 nir_builder_instr_insert(&b, &dst_buf->instr);
612
613 nir_intrinsic_instr *src_buf = nir_intrinsic_instr_create(b.shader,
614 nir_intrinsic_vulkan_resource_index);
615 src_buf->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
616 src_buf->num_components = 1;
617 nir_intrinsic_set_desc_set(src_buf, 0);
618 nir_intrinsic_set_binding(src_buf, 1);
619 nir_ssa_dest_init(&src_buf->instr, &src_buf->dest, src_buf->num_components, 32, NULL);
620 nir_builder_instr_insert(&b, &src_buf->instr);
621
622 /* Compute global ID. */
623 nir_ssa_def *invoc_id = nir_load_local_invocation_id(&b);
624 nir_ssa_def *wg_id = nir_load_work_group_id(&b);
625 nir_ssa_def *block_size = nir_imm_ivec4(&b,
626 b.shader->info.cs.local_size[0],
627 b.shader->info.cs.local_size[1],
628 b.shader->info.cs.local_size[2], 0);
629 nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id);
630 global_id = nir_channel(&b, global_id, 0); // We only care about x here.
631
632 /* Compute src/dst strides. */
633 nir_ssa_def *input_stride = nir_imm_int(&b, 32);
634 nir_ssa_def *input_base = nir_imul(&b, input_stride, global_id);
635 nir_ssa_def *output_stride = radv_load_push_int(&b, 4, "output_stride");
636 nir_ssa_def *output_base = nir_imul(&b, output_stride, global_id);
637
638 /* Load data from the query pool. */
639 nir_intrinsic_instr *load1 = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_ssbo);
640 load1->src[0] = nir_src_for_ssa(&src_buf->dest.ssa);
641 load1->src[1] = nir_src_for_ssa(input_base);
642 nir_ssa_dest_init(&load1->instr, &load1->dest, 4, 32, NULL);
643 load1->num_components = 4;
644 nir_intrinsic_set_align(load1, 32, 0);
645 nir_builder_instr_insert(&b, &load1->instr);
646
647 nir_intrinsic_instr *load2 = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_ssbo);
648 load2->src[0] = nir_src_for_ssa(&src_buf->dest.ssa);
649 load2->src[1] = nir_src_for_ssa(nir_iadd(&b, input_base, nir_imm_int(&b, 16)));
650 nir_ssa_dest_init(&load2->instr, &load2->dest, 4, 32, NULL);
651 load2->num_components = 4;
652 nir_intrinsic_set_align(load2, 16, 0);
653 nir_builder_instr_insert(&b, &load2->instr);
654
655 /* Check if result is available. */
656 nir_ssa_def *avails[2];
657 avails[0] = nir_iand(&b, nir_channel(&b, &load1->dest.ssa, 1),
658 nir_channel(&b, &load1->dest.ssa, 3));
659 avails[1] = nir_iand(&b, nir_channel(&b, &load2->dest.ssa, 1),
660 nir_channel(&b, &load2->dest.ssa, 3));
661 nir_ssa_def *result_is_available =
662 nir_i2b(&b, nir_iand(&b, nir_iand(&b, avails[0], avails[1]),
663 nir_imm_int(&b, 0x80000000)));
664
665 /* Only compute result if available. */
666 nir_if *available_if = nir_if_create(b.shader);
667 available_if->condition = nir_src_for_ssa(result_is_available);
668 nir_cf_node_insert(b.cursor, &available_if->cf_node);
669
670 b.cursor = nir_after_cf_list(&available_if->then_list);
671
672 /* Pack values. */
673 nir_ssa_def *packed64[4];
674 packed64[0] = nir_pack_64_2x32(&b, nir_vec2(&b,
675 nir_channel(&b, &load1->dest.ssa, 0),
676 nir_channel(&b, &load1->dest.ssa, 1)));
677 packed64[1] = nir_pack_64_2x32(&b, nir_vec2(&b,
678 nir_channel(&b, &load1->dest.ssa, 2),
679 nir_channel(&b, &load1->dest.ssa, 3)));
680 packed64[2] = nir_pack_64_2x32(&b, nir_vec2(&b,
681 nir_channel(&b, &load2->dest.ssa, 0),
682 nir_channel(&b, &load2->dest.ssa, 1)));
683 packed64[3] = nir_pack_64_2x32(&b, nir_vec2(&b,
684 nir_channel(&b, &load2->dest.ssa, 2),
685 nir_channel(&b, &load2->dest.ssa, 3)));
686
687 /* Compute result. */
688 nir_ssa_def *num_primitive_written =
689 nir_isub(&b, packed64[3], packed64[1]);
690 nir_ssa_def *primitive_storage_needed =
691 nir_isub(&b, packed64[2], packed64[0]);
692
693 nir_store_var(&b, result,
694 nir_vec2(&b, num_primitive_written,
695 primitive_storage_needed), 0x3);
696 nir_store_var(&b, available, nir_imm_true(&b), 0x1);
697
698 b.cursor = nir_after_cf_node(&available_if->cf_node);
699
700 /* Determine if result is 64 or 32 bit. */
701 nir_ssa_def *result_is_64bit =
702 nir_test_flag(&b, flags, VK_QUERY_RESULT_64_BIT);
703 nir_ssa_def *result_size =
704 nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 16),
705 nir_imm_int(&b, 8));
706
707 /* Store the result if complete or partial results have been requested. */
708 nir_if *store_if = nir_if_create(b.shader);
709 store_if->condition =
710 nir_src_for_ssa(nir_ior(&b, nir_test_flag(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT),
711 nir_load_var(&b, available)));
712 nir_cf_node_insert(b.cursor, &store_if->cf_node);
713
714 b.cursor = nir_after_cf_list(&store_if->then_list);
715
716 /* Store result. */
717 nir_if *store_64bit_if = nir_if_create(b.shader);
718 store_64bit_if->condition = nir_src_for_ssa(result_is_64bit);
719 nir_cf_node_insert(b.cursor, &store_64bit_if->cf_node);
720
721 b.cursor = nir_after_cf_list(&store_64bit_if->then_list);
722
723 nir_intrinsic_instr *store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
724 store->src[0] = nir_src_for_ssa(nir_load_var(&b, result));
725 store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa);
726 store->src[2] = nir_src_for_ssa(output_base);
727 nir_intrinsic_set_write_mask(store, 0x3);
728 nir_intrinsic_set_align(store, 8, 0);
729 store->num_components = 2;
730 nir_builder_instr_insert(&b, &store->instr);
731
732 b.cursor = nir_after_cf_list(&store_64bit_if->else_list);
733
734 store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
735 store->src[0] = nir_src_for_ssa(nir_u2u32(&b, nir_load_var(&b, result)));
736 store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa);
737 store->src[2] = nir_src_for_ssa(output_base);
738 nir_intrinsic_set_write_mask(store, 0x3);
739 nir_intrinsic_set_align(store, 4, 0);
740 store->num_components = 2;
741 nir_builder_instr_insert(&b, &store->instr);
742
743 b.cursor = nir_after_cf_node(&store_64bit_if->cf_node);
744
745 b.cursor = nir_after_cf_node(&store_if->cf_node);
746
747 /* Store the availability bit if requested. */
748 nir_if *availability_if = nir_if_create(b.shader);
749 availability_if->condition =
750 nir_src_for_ssa(nir_test_flag(&b, flags, VK_QUERY_RESULT_WITH_AVAILABILITY_BIT));
751 nir_cf_node_insert(b.cursor, &availability_if->cf_node);
752
753 b.cursor = nir_after_cf_list(&availability_if->then_list);
754
755 store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
756 store->src[0] = nir_src_for_ssa(nir_b2i32(&b, nir_load_var(&b, available)));
757 store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa);
758 store->src[2] = nir_src_for_ssa(nir_iadd(&b, result_size, output_base));
759 nir_intrinsic_set_write_mask(store, 0x1);
760 nir_intrinsic_set_align(store, 4, 0);
761 store->num_components = 1;
762 nir_builder_instr_insert(&b, &store->instr);
763
764 b.cursor = nir_after_cf_node(&availability_if->cf_node);
765
766 return b.shader;
767 }
768
769 static nir_shader *
770 build_timestamp_query_shader(struct radv_device *device)
771 {
772 /* the shader this builds is roughly
773 *
774 * uint32_t src_stride = 8;
775 *
776 * layout(binding = 0) buffer dst_buf;
777 * layout(binding = 1) buffer src_buf;
778 *
779 * void main() {
780 * uint64_t result = 0;
781 * bool available = false;
782 * uint64_t src_offset = src_stride * global_id.x;
783 * uint64_t dst_offset = dst_stride * global_id.x;
784 * uint64_t timestamp = src_buf[src_offset];
785 * if (timestamp != TIMESTAMP_NOT_READY) {
786 * result = timestamp;
787 * available = true;
788 * }
789 * uint32_t result_size = flags & VK_QUERY_RESULT_64_BIT ? 8 : 4;
790 * if ((flags & VK_QUERY_RESULT_PARTIAL_BIT) || available) {
791 * if (flags & VK_QUERY_RESULT_64_BIT) {
792 * dst_buf[dst_offset] = result;
793 * } else {
794 * dst_buf[dst_offset] = (uint32_t)result;
795 * }
796 * }
797 * if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
798 * dst_buf[dst_offset + result_size] = available;
799 * }
800 * }
801 */
802 nir_builder b;
803 nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_COMPUTE, NULL);
804 b.shader->info.name = ralloc_strdup(b.shader, "timestamp_query");
805 b.shader->info.cs.local_size[0] = 64;
806 b.shader->info.cs.local_size[1] = 1;
807 b.shader->info.cs.local_size[2] = 1;
808
809 /* Create and initialize local variables. */
810 nir_variable *result =
811 nir_local_variable_create(b.impl, glsl_uint64_t_type(), "result");
812 nir_variable *available =
813 nir_local_variable_create(b.impl, glsl_bool_type(), "available");
814
815 nir_store_var(&b, result, nir_imm_int64(&b, 0), 0x1);
816 nir_store_var(&b, available, nir_imm_false(&b), 0x1);
817
818 nir_ssa_def *flags = radv_load_push_int(&b, 0, "flags");
819
820 /* Load resources. */
821 nir_intrinsic_instr *dst_buf = nir_intrinsic_instr_create(b.shader,
822 nir_intrinsic_vulkan_resource_index);
823 dst_buf->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
824 dst_buf->num_components = 1;
825 nir_intrinsic_set_desc_set(dst_buf, 0);
826 nir_intrinsic_set_binding(dst_buf, 0);
827 nir_ssa_dest_init(&dst_buf->instr, &dst_buf->dest, dst_buf->num_components, 32, NULL);
828 nir_builder_instr_insert(&b, &dst_buf->instr);
829
830 nir_intrinsic_instr *src_buf = nir_intrinsic_instr_create(b.shader,
831 nir_intrinsic_vulkan_resource_index);
832 src_buf->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
833 src_buf->num_components = 1;
834 nir_intrinsic_set_desc_set(src_buf, 0);
835 nir_intrinsic_set_binding(src_buf, 1);
836 nir_ssa_dest_init(&src_buf->instr, &src_buf->dest, src_buf->num_components, 32, NULL);
837 nir_builder_instr_insert(&b, &src_buf->instr);
838
839 /* Compute global ID. */
840 nir_ssa_def *invoc_id = nir_load_local_invocation_id(&b);
841 nir_ssa_def *wg_id = nir_load_work_group_id(&b);
842 nir_ssa_def *block_size = nir_imm_ivec4(&b,
843 b.shader->info.cs.local_size[0],
844 b.shader->info.cs.local_size[1],
845 b.shader->info.cs.local_size[2], 0);
846 nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id);
847 global_id = nir_channel(&b, global_id, 0); // We only care about x here.
848
849 /* Compute src/dst strides. */
850 nir_ssa_def *input_stride = nir_imm_int(&b, 8);
851 nir_ssa_def *input_base = nir_imul(&b, input_stride, global_id);
852 nir_ssa_def *output_stride = radv_load_push_int(&b, 4, "output_stride");
853 nir_ssa_def *output_base = nir_imul(&b, output_stride, global_id);
854
855 /* Load data from the query pool. */
856 nir_intrinsic_instr *load = nir_intrinsic_instr_create(b.shader, nir_intrinsic_load_ssbo);
857 load->src[0] = nir_src_for_ssa(&src_buf->dest.ssa);
858 load->src[1] = nir_src_for_ssa(input_base);
859 nir_ssa_dest_init(&load->instr, &load->dest, 2, 32, NULL);
860 load->num_components = 2;
861 nir_intrinsic_set_align(load, 8, 0);
862 nir_builder_instr_insert(&b, &load->instr);
863
864 /* Pack the timestamp. */
865 nir_ssa_def *timestamp;
866 timestamp = nir_pack_64_2x32(&b, nir_vec2(&b,
867 nir_channel(&b, &load->dest.ssa, 0),
868 nir_channel(&b, &load->dest.ssa, 1)));
869
870 /* Check if result is available. */
871 nir_ssa_def *result_is_available =
872 nir_i2b(&b, nir_ine(&b, timestamp,
873 nir_imm_int64(&b, TIMESTAMP_NOT_READY)));
874
875 /* Only store result if available. */
876 nir_if *available_if = nir_if_create(b.shader);
877 available_if->condition = nir_src_for_ssa(result_is_available);
878 nir_cf_node_insert(b.cursor, &available_if->cf_node);
879
880 b.cursor = nir_after_cf_list(&available_if->then_list);
881
882 nir_store_var(&b, result, timestamp, 0x1);
883 nir_store_var(&b, available, nir_imm_true(&b), 0x1);
884
885 b.cursor = nir_after_cf_node(&available_if->cf_node);
886
887 /* Determine if result is 64 or 32 bit. */
888 nir_ssa_def *result_is_64bit =
889 nir_test_flag(&b, flags, VK_QUERY_RESULT_64_BIT);
890 nir_ssa_def *result_size =
891 nir_bcsel(&b, result_is_64bit, nir_imm_int(&b, 8),
892 nir_imm_int(&b, 4));
893
894 /* Store the result if complete or partial results have been requested. */
895 nir_if *store_if = nir_if_create(b.shader);
896 store_if->condition =
897 nir_src_for_ssa(nir_ior(&b, nir_test_flag(&b, flags, VK_QUERY_RESULT_PARTIAL_BIT),
898 nir_load_var(&b, available)));
899 nir_cf_node_insert(b.cursor, &store_if->cf_node);
900
901 b.cursor = nir_after_cf_list(&store_if->then_list);
902
903 /* Store result. */
904 nir_if *store_64bit_if = nir_if_create(b.shader);
905 store_64bit_if->condition = nir_src_for_ssa(result_is_64bit);
906 nir_cf_node_insert(b.cursor, &store_64bit_if->cf_node);
907
908 b.cursor = nir_after_cf_list(&store_64bit_if->then_list);
909
910 nir_intrinsic_instr *store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
911 store->src[0] = nir_src_for_ssa(nir_load_var(&b, result));
912 store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa);
913 store->src[2] = nir_src_for_ssa(output_base);
914 nir_intrinsic_set_write_mask(store, 0x1);
915 nir_intrinsic_set_align(store, 8, 0);
916 store->num_components = 1;
917 nir_builder_instr_insert(&b, &store->instr);
918
919 b.cursor = nir_after_cf_list(&store_64bit_if->else_list);
920
921 store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
922 store->src[0] = nir_src_for_ssa(nir_u2u32(&b, nir_load_var(&b, result)));
923 store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa);
924 store->src[2] = nir_src_for_ssa(output_base);
925 nir_intrinsic_set_write_mask(store, 0x1);
926 nir_intrinsic_set_align(store, 4, 0);
927 store->num_components = 1;
928 nir_builder_instr_insert(&b, &store->instr);
929
930 b.cursor = nir_after_cf_node(&store_64bit_if->cf_node);
931
932 b.cursor = nir_after_cf_node(&store_if->cf_node);
933
934 /* Store the availability bit if requested. */
935 nir_if *availability_if = nir_if_create(b.shader);
936 availability_if->condition =
937 nir_src_for_ssa(nir_test_flag(&b, flags, VK_QUERY_RESULT_WITH_AVAILABILITY_BIT));
938 nir_cf_node_insert(b.cursor, &availability_if->cf_node);
939
940 b.cursor = nir_after_cf_list(&availability_if->then_list);
941
942 store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_store_ssbo);
943 store->src[0] = nir_src_for_ssa(nir_b2i32(&b, nir_load_var(&b, available)));
944 store->src[1] = nir_src_for_ssa(&dst_buf->dest.ssa);
945 store->src[2] = nir_src_for_ssa(nir_iadd(&b, result_size, output_base));
946 nir_intrinsic_set_write_mask(store, 0x1);
947 nir_intrinsic_set_align(store, 4, 0);
948 store->num_components = 1;
949 nir_builder_instr_insert(&b, &store->instr);
950
951 b.cursor = nir_after_cf_node(&availability_if->cf_node);
952
953 return b.shader;
954 }
955
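/* Lazily creates the meta objects shared by all query copy paths: compute
 * pipelines for the four shaders above, one descriptor set layout with two
 * storage buffers (dst at binding 0, src at binding 1) and one pipeline
 * layout with a 16-byte push-constant range.
 */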
956 static VkResult radv_device_init_meta_query_state_internal(struct radv_device *device)
957 {
958 VkResult result;
959 struct radv_shader_module occlusion_cs = { .nir = NULL };
960 struct radv_shader_module pipeline_statistics_cs = { .nir = NULL };
961 struct radv_shader_module tfb_cs = { .nir = NULL };
962 struct radv_shader_module timestamp_cs = { .nir = NULL };
963
964 mtx_lock(&device->meta_state.mtx);
965 if (device->meta_state.query.pipeline_statistics_query_pipeline) {
966 mtx_unlock(&device->meta_state.mtx);
967 return VK_SUCCESS;
968 }
969 occlusion_cs.nir = build_occlusion_query_shader(device);
970 pipeline_statistics_cs.nir = build_pipeline_statistics_query_shader(device);
971 tfb_cs.nir = build_tfb_query_shader(device);
972 timestamp_cs.nir = build_timestamp_query_shader(device);
973
974 VkDescriptorSetLayoutCreateInfo occlusion_ds_create_info = {
975 .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
976 .flags = VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR,
977 .bindingCount = 2,
978 .pBindings = (VkDescriptorSetLayoutBinding[]) {
979 {
980 .binding = 0,
981 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
982 .descriptorCount = 1,
983 .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
984 .pImmutableSamplers = NULL
985 },
986 {
987 .binding = 1,
988 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
989 .descriptorCount = 1,
990 .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
991 .pImmutableSamplers = NULL
992 },
993 }
994 };
995
996 result = radv_CreateDescriptorSetLayout(radv_device_to_handle(device),
997 &occlusion_ds_create_info,
998 &device->meta_state.alloc,
999 &device->meta_state.query.ds_layout);
1000 if (result != VK_SUCCESS)
1001 goto fail;
1002
1003 VkPipelineLayoutCreateInfo occlusion_pl_create_info = {
1004 .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
1005 .setLayoutCount = 1,
1006 .pSetLayouts = &device->meta_state.query.ds_layout,
1007 .pushConstantRangeCount = 1,
1008 .pPushConstantRanges = &(VkPushConstantRange){VK_SHADER_STAGE_COMPUTE_BIT, 0, 16},
1009 };
1010
1011 result = radv_CreatePipelineLayout(radv_device_to_handle(device),
1012 &occlusion_pl_create_info,
1013 &device->meta_state.alloc,
1014 &device->meta_state.query.p_layout);
1015 if (result != VK_SUCCESS)
1016 goto fail;
1017
1018 VkPipelineShaderStageCreateInfo occlusion_pipeline_shader_stage = {
1019 .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
1020 .stage = VK_SHADER_STAGE_COMPUTE_BIT,
1021 .module = radv_shader_module_to_handle(&occlusion_cs),
1022 .pName = "main",
1023 .pSpecializationInfo = NULL,
1024 };
1025
1026 VkComputePipelineCreateInfo occlusion_vk_pipeline_info = {
1027 .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
1028 .stage = occlusion_pipeline_shader_stage,
1029 .flags = 0,
1030 .layout = device->meta_state.query.p_layout,
1031 };
1032
1033 result = radv_CreateComputePipelines(radv_device_to_handle(device),
1034 radv_pipeline_cache_to_handle(&device->meta_state.cache),
1035 1, &occlusion_vk_pipeline_info, NULL,
1036 &device->meta_state.query.occlusion_query_pipeline);
1037 if (result != VK_SUCCESS)
1038 goto fail;
1039
1040 VkPipelineShaderStageCreateInfo pipeline_statistics_pipeline_shader_stage = {
1041 .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
1042 .stage = VK_SHADER_STAGE_COMPUTE_BIT,
1043 .module = radv_shader_module_to_handle(&pipeline_statistics_cs),
1044 .pName = "main",
1045 .pSpecializationInfo = NULL,
1046 };
1047
1048 VkComputePipelineCreateInfo pipeline_statistics_vk_pipeline_info = {
1049 .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
1050 .stage = pipeline_statistics_pipeline_shader_stage,
1051 .flags = 0,
1052 .layout = device->meta_state.query.p_layout,
1053 };
1054
1055 result = radv_CreateComputePipelines(radv_device_to_handle(device),
1056 radv_pipeline_cache_to_handle(&device->meta_state.cache),
1057 1, &pipeline_statistics_vk_pipeline_info, NULL,
1058 &device->meta_state.query.pipeline_statistics_query_pipeline);
1059 if (result != VK_SUCCESS)
1060 goto fail;
1061
1062 VkPipelineShaderStageCreateInfo tfb_pipeline_shader_stage = {
1063 .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
1064 .stage = VK_SHADER_STAGE_COMPUTE_BIT,
1065 .module = radv_shader_module_to_handle(&tfb_cs),
1066 .pName = "main",
1067 .pSpecializationInfo = NULL,
1068 };
1069
1070 VkComputePipelineCreateInfo tfb_pipeline_info = {
1071 .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
1072 .stage = tfb_pipeline_shader_stage,
1073 .flags = 0,
1074 .layout = device->meta_state.query.p_layout,
1075 };
1076
1077 result = radv_CreateComputePipelines(radv_device_to_handle(device),
1078 radv_pipeline_cache_to_handle(&device->meta_state.cache),
1079 1, &tfb_pipeline_info, NULL,
1080 &device->meta_state.query.tfb_query_pipeline);
1081 if (result != VK_SUCCESS)
1082 goto fail;
1083
1084 VkPipelineShaderStageCreateInfo timestamp_pipeline_shader_stage = {
1085 .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
1086 .stage = VK_SHADER_STAGE_COMPUTE_BIT,
1087 .module = radv_shader_module_to_handle(&timestamp_cs),
1088 .pName = "main",
1089 .pSpecializationInfo = NULL,
1090 };
1091
1092 VkComputePipelineCreateInfo timestamp_pipeline_info = {
1093 .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
1094 .stage = timestamp_pipeline_shader_stage,
1095 .flags = 0,
1096 .layout = device->meta_state.query.p_layout,
1097 };
1098
1099 result = radv_CreateComputePipelines(radv_device_to_handle(device),
1100 radv_pipeline_cache_to_handle(&device->meta_state.cache),
1101 1, &timestamp_pipeline_info, NULL,
1102 &device->meta_state.query.timestamp_query_pipeline);
1103
1104 fail:
1105 if (result != VK_SUCCESS)
1106 radv_device_finish_meta_query_state(device);
1107 ralloc_free(occlusion_cs.nir);
1108 ralloc_free(pipeline_statistics_cs.nir);
1109 ralloc_free(tfb_cs.nir);
1110 ralloc_free(timestamp_cs.nir);
1111 mtx_unlock(&device->meta_state.mtx);
1112 return result;
1113 }
1114
1115 VkResult radv_device_init_meta_query_state(struct radv_device *device, bool on_demand)
1116 {
1117 if (on_demand)
1118 return VK_SUCCESS;
1119
1120 return radv_device_init_meta_query_state_internal(device);
1121 }
1122
1123 void radv_device_finish_meta_query_state(struct radv_device *device)
1124 {
1125 if (device->meta_state.query.tfb_query_pipeline)
1126 radv_DestroyPipeline(radv_device_to_handle(device),
1127 device->meta_state.query.tfb_query_pipeline,
1128 &device->meta_state.alloc);
1129
1130 if (device->meta_state.query.pipeline_statistics_query_pipeline)
1131 radv_DestroyPipeline(radv_device_to_handle(device),
1132 device->meta_state.query.pipeline_statistics_query_pipeline,
1133 &device->meta_state.alloc);
1134
1135 if (device->meta_state.query.occlusion_query_pipeline)
1136 radv_DestroyPipeline(radv_device_to_handle(device),
1137 device->meta_state.query.occlusion_query_pipeline,
1138 &device->meta_state.alloc);
1139
1140 if (device->meta_state.query.timestamp_query_pipeline)
1141 radv_DestroyPipeline(radv_device_to_handle(device),
1142 device->meta_state.query.timestamp_query_pipeline,
1143 &device->meta_state.alloc);
1144
1145 if (device->meta_state.query.p_layout)
1146 radv_DestroyPipelineLayout(radv_device_to_handle(device),
1147 device->meta_state.query.p_layout,
1148 &device->meta_state.alloc);
1149
1150 if (device->meta_state.query.ds_layout)
1151 radv_DestroyDescriptorSetLayout(radv_device_to_handle(device),
1152 device->meta_state.query.ds_layout,
1153 &device->meta_state.alloc);
1154 }
1155
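/* Dispatches one of the copy shaders above with one thread per query:
 * binding 0 is the destination buffer, binding 1 the query pool BO, and the
 * push constants carry the result flags, destination stride, pipeline-stats
 * mask and availability offset.
 */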
1156 static void radv_query_shader(struct radv_cmd_buffer *cmd_buffer,
1157 VkPipeline *pipeline,
1158 struct radeon_winsys_bo *src_bo,
1159 struct radeon_winsys_bo *dst_bo,
1160 uint64_t src_offset, uint64_t dst_offset,
1161 uint32_t src_stride, uint32_t dst_stride,
1162 uint32_t count, uint32_t flags,
1163 uint32_t pipeline_stats_mask, uint32_t avail_offset)
1164 {
1165 struct radv_device *device = cmd_buffer->device;
1166 struct radv_meta_saved_state saved_state;
1167 bool old_predicating;
1168
1169 if (!*pipeline) {
1170 VkResult ret = radv_device_init_meta_query_state_internal(device);
1171 if (ret != VK_SUCCESS) {
1172 cmd_buffer->record_result = ret;
1173 return;
1174 }
1175 }
1176
1177 radv_meta_save(&saved_state, cmd_buffer,
1178 RADV_META_SAVE_COMPUTE_PIPELINE |
1179 RADV_META_SAVE_CONSTANTS |
1180 RADV_META_SAVE_DESCRIPTORS);
1181
1182 /* VK_EXT_conditional_rendering says that copy commands should not be
1183 * affected by conditional rendering.
1184 */
1185 old_predicating = cmd_buffer->state.predicating;
1186 cmd_buffer->state.predicating = false;
1187
1188 struct radv_buffer dst_buffer = {
1189 .bo = dst_bo,
1190 .offset = dst_offset,
1191 .size = dst_stride * count
1192 };
1193
1194 struct radv_buffer src_buffer = {
1195 .bo = src_bo,
1196 .offset = src_offset,
1197 .size = MAX2(src_stride * count, avail_offset + 4 * count - src_offset)
1198 };
1199
1200 radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer),
1201 VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
1202
1203 radv_meta_push_descriptor_set(cmd_buffer,
1204 VK_PIPELINE_BIND_POINT_COMPUTE,
1205 device->meta_state.query.p_layout,
1206 0, /* set */
1207 2, /* descriptorWriteCount */
1208 (VkWriteDescriptorSet[]) {
1209 {
1210 .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
1211 .dstBinding = 0,
1212 .dstArrayElement = 0,
1213 .descriptorCount = 1,
1214 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
1215 .pBufferInfo = &(VkDescriptorBufferInfo) {
1216 .buffer = radv_buffer_to_handle(&dst_buffer),
1217 .offset = 0,
1218 .range = VK_WHOLE_SIZE
1219 }
1220 },
1221 {
1222 .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
1223 .dstBinding = 1,
1224 .dstArrayElement = 0,
1225 .descriptorCount = 1,
1226 .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
1227 .pBufferInfo = &(VkDescriptorBufferInfo) {
1228 .buffer = radv_buffer_to_handle(&src_buffer),
1229 .offset = 0,
1230 .range = VK_WHOLE_SIZE
1231 }
1232 }
1233 });
1234
1235 /* Encode the number of elements for easy access by the shader. */
1236 pipeline_stats_mask &= 0x7ff;
1237 pipeline_stats_mask |= util_bitcount(pipeline_stats_mask) << 16;
1238
1239 avail_offset -= src_offset;
1240
1241 struct {
1242 uint32_t flags;
1243 uint32_t dst_stride;
1244 uint32_t pipeline_stats_mask;
1245 uint32_t avail_offset;
1246 } push_constants = {
1247 flags,
1248 dst_stride,
1249 pipeline_stats_mask,
1250 avail_offset
1251 };
1252
1253 radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer),
1254 device->meta_state.query.p_layout,
1255 VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_constants),
1256 &push_constants);
1257
1258 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_INV_L2 |
1259 RADV_CMD_FLAG_INV_VCACHE;
1260
1261 if (flags & VK_QUERY_RESULT_WAIT_BIT)
1262 cmd_buffer->state.flush_bits |= RADV_CMD_FLUSH_AND_INV_FRAMEBUFFER;
1263
1264 radv_unaligned_dispatch(cmd_buffer, count, 1, 1);
1265
1266 /* Restore conditional rendering. */
1267 cmd_buffer->state.predicating = old_predicating;
1268
1269 radv_meta_restore(&saved_state, cmd_buffer);
1270 }
1271
1272 static bool
1273 radv_query_pool_needs_gds(struct radv_device *device,
1274 struct radv_query_pool *pool)
1275 {
1276 /* The number of primitives generated by geometry shader invocations is
1277 * only counted by the hardware if GS uses the legacy path. When NGG GS
1278 * is used, the hardware can't know the number of generated primitives
1279 * and we have to count it manually inside the shader. To achieve that, the
1280 * driver does a plain GDS atomic to accumulate that value.
1281 * TODO: fix use of NGG GS and non-NGG GS inside the same begin/end
1282 * query.
1283 */
1284 return device->physical_device->use_ngg &&
1285 (pool->pipeline_stats_mask & VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT);
1286 }
1287
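/* Per-query slot sizes used below: occlusion queries store a begin/end 64-bit
 * pair per render backend, pipeline-statistics queries two blocks of eleven
 * 64-bit counters plus a 4-byte availability word, timestamps a single 64-bit
 * value, and transform feedback queries a 32-byte begin/end pair of streamout
 * counters.
 */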
1288 VkResult radv_CreateQueryPool(
1289 VkDevice _device,
1290 const VkQueryPoolCreateInfo* pCreateInfo,
1291 const VkAllocationCallbacks* pAllocator,
1292 VkQueryPool* pQueryPool)
1293 {
1294 RADV_FROM_HANDLE(radv_device, device, _device);
1295 struct radv_query_pool *pool = vk_alloc2(&device->alloc, pAllocator,
1296 sizeof(*pool), 8,
1297 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
1298
1299 if (!pool)
1300 return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
1301
1302
1303 switch(pCreateInfo->queryType) {
1304 case VK_QUERY_TYPE_OCCLUSION:
1305 pool->stride = 16 * device->physical_device->rad_info.num_render_backends;
1306 break;
1307 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1308 pool->stride = pipelinestat_block_size * 2;
1309 break;
1310 case VK_QUERY_TYPE_TIMESTAMP:
1311 pool->stride = 8;
1312 break;
1313 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1314 pool->stride = 32;
1315 break;
1316 default:
1317 unreachable("creating unhandled query type");
1318 }
1319
1320 pool->type = pCreateInfo->queryType;
1321 pool->pipeline_stats_mask = pCreateInfo->pipelineStatistics;
1322 pool->availability_offset = pool->stride * pCreateInfo->queryCount;
1323 pool->size = pool->availability_offset;
1324 if (pCreateInfo->queryType == VK_QUERY_TYPE_PIPELINE_STATISTICS)
1325 pool->size += 4 * pCreateInfo->queryCount;
1326
1327 pool->bo = device->ws->buffer_create(device->ws, pool->size,
1328 64, RADEON_DOMAIN_GTT, RADEON_FLAG_NO_INTERPROCESS_SHARING,
1329 RADV_BO_PRIORITY_QUERY_POOL);
1330
1331 if (!pool->bo) {
1332 vk_free2(&device->alloc, pAllocator, pool);
1333 return vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
1334 }
1335
1336 pool->ptr = device->ws->buffer_map(pool->bo);
1337
1338 if (!pool->ptr) {
1339 device->ws->buffer_destroy(pool->bo);
1340 vk_free2(&device->alloc, pAllocator, pool);
1341 return vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
1342 }
1343
1344 *pQueryPool = radv_query_pool_to_handle(pool);
1345 return VK_SUCCESS;
1346 }
1347
1348 void radv_DestroyQueryPool(
1349 VkDevice _device,
1350 VkQueryPool _pool,
1351 const VkAllocationCallbacks* pAllocator)
1352 {
1353 RADV_FROM_HANDLE(radv_device, device, _device);
1354 RADV_FROM_HANDLE(radv_query_pool, pool, _pool);
1355
1356 if (!pool)
1357 return;
1358
1359 device->ws->buffer_destroy(pool->bo);
1360 vk_free2(&device->alloc, pAllocator, pool);
1361 }
1362
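/* CPU path for vkGetQueryPoolResults(): reads the raw values straight from
 * the mapped pool BO, busy-waiting on the availability bits when
 * VK_QUERY_RESULT_WAIT_BIT is set.
 */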
1363 VkResult radv_GetQueryPoolResults(
1364 VkDevice _device,
1365 VkQueryPool queryPool,
1366 uint32_t firstQuery,
1367 uint32_t queryCount,
1368 size_t dataSize,
1369 void* pData,
1370 VkDeviceSize stride,
1371 VkQueryResultFlags flags)
1372 {
1373 RADV_FROM_HANDLE(radv_device, device, _device);
1374 RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
1375 char *data = pData;
1376 VkResult result = VK_SUCCESS;
1377
1378 for(unsigned i = 0; i < queryCount; ++i, data += stride) {
1379 char *dest = data;
1380 unsigned query = firstQuery + i;
1381 char *src = pool->ptr + query * pool->stride;
1382 uint32_t available;
1383
1384 switch (pool->type) {
1385 case VK_QUERY_TYPE_TIMESTAMP: {
1386 volatile uint64_t const *src64 = (volatile uint64_t const *)src;
1387 available = *src64 != TIMESTAMP_NOT_READY;
1388
1389 if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1390 while (*src64 == TIMESTAMP_NOT_READY)
1391 ;
1392 available = true;
1393 }
1394
1395 if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
1396 result = VK_NOT_READY;
1397
1398 if (flags & VK_QUERY_RESULT_64_BIT) {
1399 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1400 *(uint64_t*)dest = *src64;
1401 dest += 8;
1402 } else {
1403 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1404 *(uint32_t*)dest = *(volatile uint32_t*)src;
1405 dest += 4;
1406 }
1407 break;
1408 }
1409 case VK_QUERY_TYPE_OCCLUSION: {
1410 volatile uint64_t const *src64 = (volatile uint64_t const *)src;
1411 uint32_t db_count = device->physical_device->rad_info.num_render_backends;
1412 uint32_t enabled_rb_mask = device->physical_device->rad_info.enabled_rb_mask;
1413 uint64_t sample_count = 0;
1414 available = 1;
1415
1416 for (int i = 0; i < db_count; ++i) {
1417 uint64_t start, end;
1418
1419 if (!(enabled_rb_mask & (1 << i)))
1420 continue;
1421
1422 do {
1423 start = src64[2 * i];
1424 end = src64[2 * i + 1];
1425 } while ((!(start & (1ull << 63)) || !(end & (1ull << 63))) && (flags & VK_QUERY_RESULT_WAIT_BIT));
1426
1427 if (!(start & (1ull << 63)) || !(end & (1ull << 63)))
1428 available = 0;
1429 else {
1430 sample_count += end - start;
1431 }
1432 }
1433
1434 if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
1435 result = VK_NOT_READY;
1436
1437 if (flags & VK_QUERY_RESULT_64_BIT) {
1438 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1439 *(uint64_t*)dest = sample_count;
1440 dest += 8;
1441 } else {
1442 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1443 *(uint32_t*)dest = sample_count;
1444 dest += 4;
1445 }
1446 break;
1447 }
1448 case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
1449 if (flags & VK_QUERY_RESULT_WAIT_BIT)
1450 while(!*(volatile uint32_t*)(pool->ptr + pool->availability_offset + 4 * query))
1451 ;
1452 available = *(volatile uint32_t*)(pool->ptr + pool->availability_offset + 4 * query);
1453
1454 if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
1455 result = VK_NOT_READY;
1456
1457 const volatile uint64_t *start = (uint64_t*)src;
1458 const volatile uint64_t *stop = (uint64_t*)(src + pipelinestat_block_size);
1459 if (flags & VK_QUERY_RESULT_64_BIT) {
1460 uint64_t *dst = (uint64_t*)dest;
1461 dest += util_bitcount(pool->pipeline_stats_mask) * 8;
1462 for(int i = 0; i < 11; ++i) {
1463 if(pool->pipeline_stats_mask & (1u << i)) {
1464 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1465 *dst = stop[pipeline_statistics_indices[i]] -
1466 start[pipeline_statistics_indices[i]];
1467 dst++;
1468 }
1469 }
1470
1471 } else {
1472 uint32_t *dst = (uint32_t*)dest;
1473 dest += util_bitcount(pool->pipeline_stats_mask) * 4;
1474 for(int i = 0; i < 11; ++i) {
1475 if(pool->pipeline_stats_mask & (1u << i)) {
1476 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1477 *dst = stop[pipeline_statistics_indices[i]] -
1478 start[pipeline_statistics_indices[i]];
1479 dst++;
1480 }
1481 }
1482 }
1483 break;
1484 }
1485 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
1486 volatile uint64_t const *src64 = (volatile uint64_t const *)src;
1487 uint64_t num_primitives_written;
1488 uint64_t primitive_storage_needed;
1489
1490 /* SAMPLE_STREAMOUTSTATS stores this structure:
1491 * {
1492 * u64 NumPrimitivesWritten;
1493 * u64 PrimitiveStorageNeeded;
1494 * }
1495 */
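/* The begin sample occupies src64[0..1] and the end sample src64[2..3]
 * (emit_begin_query writes at va, emit_end_query at va + 16); bit 63 of
 * each word is set once the corresponding write has landed.
 */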
1496 available = 1;
1497 for (int j = 0; j < 4; j++) {
1498 if (!(src64[j] & 0x8000000000000000ULL))
1499 available = 0;
1500 }
1501
1502 if (!available && !(flags & VK_QUERY_RESULT_PARTIAL_BIT))
1503 result = VK_NOT_READY;
1504
1505 num_primitives_written = src64[3] - src64[1];
1506 primitive_storage_needed = src64[2] - src64[0];
1507
1508 if (flags & VK_QUERY_RESULT_64_BIT) {
1509 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1510 *(uint64_t *)dest = num_primitives_written;
1511 dest += 8;
1512 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1513 *(uint64_t *)dest = primitive_storage_needed;
1514 dest += 8;
1515 } else {
1516 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1517 *(uint32_t *)dest = num_primitives_written;
1518 dest += 4;
1519 if (available || (flags & VK_QUERY_RESULT_PARTIAL_BIT))
1520 *(uint32_t *)dest = primitive_storage_needed;
1521 dest += 4;
1522 }
1523 break;
1524 }
1525 default:
1526 unreachable("trying to get results of unhandled query type");
1527 }
1528
1529 if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
1530 if (flags & VK_QUERY_RESULT_64_BIT) {
1531 *(uint64_t*)dest = available;
1532 } else {
1533 *(uint32_t*)dest = available;
1534 }
1535 }
1536 }
1537
1538 return result;
1539 }
1540
1541 static void emit_query_flush(struct radv_cmd_buffer *cmd_buffer,
1542 struct radv_query_pool *pool)
1543 {
1544 if (cmd_buffer->pending_reset_query) {
1545 if (pool->size >= RADV_BUFFER_OPS_CS_THRESHOLD) {
1546 /* Only need to flush caches if the query pool size is
1547 * large enough to be reset using the compute shader
1548 * path. Small pools don't need any cache flushes
1549 * because we use a CP dma clear.
1550 */
1551 si_emit_cache_flush(cmd_buffer);
1552 }
1553 }
1554 }
1555
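/* GPU copy path: unlike vkGetQueryPoolResults this never reads the pool on
 * the CPU. For WAIT_BIT it emits WAIT_REG_MEM packets on the availability
 * (or result) words, then dispatches one of the meta compute shaders
 * (radv_query_shader) to convert the raw per-query data into the layout
 * requested by the result flags, writing straight into the destination
 * buffer.
 */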
1556 void radv_CmdCopyQueryPoolResults(
1557 VkCommandBuffer commandBuffer,
1558 VkQueryPool queryPool,
1559 uint32_t firstQuery,
1560 uint32_t queryCount,
1561 VkBuffer dstBuffer,
1562 VkDeviceSize dstOffset,
1563 VkDeviceSize stride,
1564 VkQueryResultFlags flags)
1565 {
1566 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
1567 RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
1568 RADV_FROM_HANDLE(radv_buffer, dst_buffer, dstBuffer);
1569 struct radeon_cmdbuf *cs = cmd_buffer->cs;
1570 uint64_t va = radv_buffer_get_va(pool->bo);
1571 uint64_t dest_va = radv_buffer_get_va(dst_buffer->bo);
1572 dest_va += dst_buffer->offset + dstOffset;
1573
1574 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pool->bo);
1575 radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, dst_buffer->bo);
1576
1577 /* From the Vulkan spec 1.1.108:
1578 *
1579 * "vkCmdCopyQueryPoolResults is guaranteed to see the effect of
1580 * previous uses of vkCmdResetQueryPool in the same queue, without any
1581 * additional synchronization."
1582 *
1583 * So, we have to flush the caches if the compute shader path was used.
1584 */
1585 emit_query_flush(cmd_buffer, pool);
1586
1587 switch (pool->type) {
1588 case VK_QUERY_TYPE_OCCLUSION:
1589 if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1590 for(unsigned i = 0; i < queryCount; ++i, dest_va += stride) {
1591 unsigned query = firstQuery + i;
1592 uint64_t src_va = va + query * pool->stride + pool->stride - 4;
1593
1594 radeon_check_space(cmd_buffer->device->ws, cs, 7);
1595
1596 /* Waits on the upper word of the last DB entry */
1597 radv_cp_wait_mem(cs, WAIT_REG_MEM_GREATER_OR_EQUAL,
1598 src_va, 0x80000000, 0xffffffff);
1599 }
1600 }
1601 radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.occlusion_query_pipeline,
1602 pool->bo, dst_buffer->bo, firstQuery * pool->stride,
1603 dst_buffer->offset + dstOffset,
1604 pool->stride, stride,
1605 queryCount, flags, 0, 0);
1606 break;
1607 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1608 if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1609 for(unsigned i = 0; i < queryCount; ++i, dest_va += stride) {
1610 unsigned query = firstQuery + i;
1611
1612 radeon_check_space(cmd_buffer->device->ws, cs, 7);
1613
1614 uint64_t avail_va = va + pool->availability_offset + 4 * query;
1615
1616 /* This waits on the ME. All copies below are done on the ME */
1617 radv_cp_wait_mem(cs, WAIT_REG_MEM_EQUAL,
1618 avail_va, 1, 0xffffffff);
1619 }
1620 }
1621 radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.pipeline_statistics_query_pipeline,
1622 pool->bo, dst_buffer->bo, firstQuery * pool->stride,
1623 dst_buffer->offset + dstOffset,
1624 pool->stride, stride, queryCount, flags,
1625 pool->pipeline_stats_mask,
1626 pool->availability_offset + 4 * firstQuery);
1627 break;
1628 case VK_QUERY_TYPE_TIMESTAMP:
1629 if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1630 for(unsigned i = 0; i < queryCount; ++i, dest_va += stride) {
1631 unsigned query = firstQuery + i;
1632 uint64_t local_src_va = va + query * pool->stride;
1633
1634 radeon_check_space(cmd_buffer->device->ws, cs, 7);
1635
1636 /* Wait on the high 32 bits of the timestamp in
1637 * case the low part is 0xffffffff.
1638 */
1639 radv_cp_wait_mem(cs, WAIT_REG_MEM_NOT_EQUAL,
1640 local_src_va + 4,
1641 TIMESTAMP_NOT_READY >> 32,
1642 0xffffffff);
1643 }
1644 }
1645
1646 radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.timestamp_query_pipeline,
1647 pool->bo, dst_buffer->bo,
1648 firstQuery * pool->stride,
1649 dst_buffer->offset + dstOffset,
1650 pool->stride, stride,
1651 queryCount, flags, 0, 0);
1652 break;
1653 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1654 if (flags & VK_QUERY_RESULT_WAIT_BIT) {
1655 for(unsigned i = 0; i < queryCount; i++) {
1656 unsigned query = firstQuery + i;
1657 uint64_t src_va = va + query * pool->stride;
1658
1659 radeon_check_space(cmd_buffer->device->ws, cs, 7 * 4);
1660
1661 /* Wait on the upper word of all results. */
1662 for (unsigned j = 0; j < 4; j++, src_va += 8) {
1663 radv_cp_wait_mem(cs, WAIT_REG_MEM_GREATER_OR_EQUAL,
1664 src_va + 4, 0x80000000,
1665 0xffffffff);
1666 }
1667 }
1668 }
1669
1670 radv_query_shader(cmd_buffer, &cmd_buffer->device->meta_state.query.tfb_query_pipeline,
1671 pool->bo, dst_buffer->bo,
1672 firstQuery * pool->stride,
1673 dst_buffer->offset + dstOffset,
1674 pool->stride, stride,
1675 queryCount, flags, 0, 0);
1676 break;
1677 default:
1678 unreachable("trying to get results of unhandled query type");
1679 }
1680
1681 }
1682
1683 void radv_CmdResetQueryPool(
1684 VkCommandBuffer commandBuffer,
1685 VkQueryPool queryPool,
1686 uint32_t firstQuery,
1687 uint32_t queryCount)
1688 {
1689 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
1690 RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
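/* The pool is filled with a 32-bit pattern, so for timestamp pools
 * TIMESTAMP_NOT_READY (UINT64_MAX) is intentionally truncated to
 * 0xffffffff; replicating it across each slot reproduces the full 64-bit
 * sentinel.
 */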
1691 uint32_t value = pool->type == VK_QUERY_TYPE_TIMESTAMP
1692 ? TIMESTAMP_NOT_READY : 0;
1693 uint32_t flush_bits = 0;
1694
1695 /* Make sure to sync all previous work if the given command buffer has
1696 * pending active queries. Otherwise the GPU might write query data
1697 * after the reset operation.
1698 */
1699 cmd_buffer->state.flush_bits |= cmd_buffer->active_query_flush_bits;
1700
1701 flush_bits |= radv_fill_buffer(cmd_buffer, pool->bo,
1702 firstQuery * pool->stride,
1703 queryCount * pool->stride, value);
1704
1705 if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
1706 flush_bits |= radv_fill_buffer(cmd_buffer, pool->bo,
1707 pool->availability_offset + firstQuery * 4,
1708 queryCount * 4, 0);
1709 }
1710
1711 if (flush_bits) {
1712 /* Only need to flush caches for the compute shader path. */
1713 cmd_buffer->pending_reset_query = true;
1714 cmd_buffer->state.flush_bits |= flush_bits;
1715 }
1716 }
1717
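/* Host-side reset entry point (vkResetQueryPool / vkResetQueryPoolEXT from
 * VK_EXT_host_query_reset): same reset values as the command variant above,
 * but written directly through the CPU mapping, so no command buffer and no
 * cache flushing is involved.
 */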
1718 void radv_ResetQueryPool(
1719 VkDevice _device,
1720 VkQueryPool queryPool,
1721 uint32_t firstQuery,
1722 uint32_t queryCount)
1723 {
1724 RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
1725
1726 uint32_t value = pool->type == VK_QUERY_TYPE_TIMESTAMP
1727 ? TIMESTAMP_NOT_READY : 0;
1728 uint32_t *data = (uint32_t*)(pool->ptr + firstQuery * pool->stride);
1729 uint32_t *data_end = (uint32_t*)(pool->ptr + (firstQuery + queryCount) * pool->stride);
1730
1731 for(uint32_t *p = data; p != data_end; ++p)
1732 *p = value;
1733
1734 if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
1735 memset(pool->ptr + pool->availability_offset + firstQuery * 4,
1736 0, queryCount * 4);
1737 }
1738 }
1739
1740 static unsigned event_type_for_stream(unsigned stream)
1741 {
1742 switch (stream) {
1743 default:
1744 case 0: return V_028A90_SAMPLE_STREAMOUTSTATS;
1745 case 1: return V_028A90_SAMPLE_STREAMOUTSTATS1;
1746 case 2: return V_028A90_SAMPLE_STREAMOUTSTATS2;
1747 case 3: return V_028A90_SAMPLE_STREAMOUTSTATS3;
1748 }
1749 }
1750
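/* The begin/end helpers below emit the events that make the GPU sample
 * query data into the pool BO: ZPASS_DONE for occlusion queries,
 * SAMPLE_PIPELINESTAT for pipeline statistics, and
 * SAMPLE_STREAMOUTSTATS[0-3] for the per-stream transform feedback
 * queries.
 */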
1751 static void emit_begin_query(struct radv_cmd_buffer *cmd_buffer,
1752 struct radv_query_pool *pool,
1753 uint64_t va,
1754 VkQueryType query_type,
1755 VkQueryControlFlags flags,
1756 uint32_t index)
1757 {
1758 struct radeon_cmdbuf *cs = cmd_buffer->cs;
1759 switch (query_type) {
1760 case VK_QUERY_TYPE_OCCLUSION:
1761 radeon_check_space(cmd_buffer->device->ws, cs, 7);
1762
1763 ++cmd_buffer->state.active_occlusion_queries;
1764 if (cmd_buffer->state.active_occlusion_queries == 1) {
1765 if (flags & VK_QUERY_CONTROL_PRECISE_BIT) {
1766 /* This is the first occlusion query; enable
1767 * the hint if the precision bit is set.
1768 */
1769 cmd_buffer->state.perfect_occlusion_queries_enabled = true;
1770 }
1771
1772 radv_set_db_count_control(cmd_buffer);
1773 } else {
1774 if ((flags & VK_QUERY_CONTROL_PRECISE_BIT) &&
1775 !cmd_buffer->state.perfect_occlusion_queries_enabled) {
1776 /* This is not the first query, but this one
1777 * needs to enable precision, so DB_COUNT_CONTROL
1778 * has to be updated accordingly.
1779 */
1780 cmd_buffer->state.perfect_occlusion_queries_enabled = true;
1781
1782 radv_set_db_count_control(cmd_buffer);
1783 }
1784 }
1785
1786 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
1787 radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
1788 radeon_emit(cs, va);
1789 radeon_emit(cs, va >> 32);
1790 break;
1791 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1792 radeon_check_space(cmd_buffer->device->ws, cs, 4);
1793
1794 ++cmd_buffer->state.active_pipeline_queries;
1795 if (cmd_buffer->state.active_pipeline_queries == 1) {
1796 cmd_buffer->state.flush_bits &= ~RADV_CMD_FLAG_STOP_PIPELINE_STATS;
1797 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_START_PIPELINE_STATS;
1798 }
1799
1800 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
1801 radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
1802 radeon_emit(cs, va);
1803 radeon_emit(cs, va >> 32);
1804
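/* GFX10 NGG GS path: when the pool needs GDS (radv_query_pool_needs_gds),
 * the GS primitive count is maintained in a GDS counter by the NGG shader
 * rather than by the SAMPLE_PIPELINESTAT event, so its current value is
 * snapshotted into the GS_PRIMITIVES slot of the begin block here (the end
 * block gets the matching snapshot in emit_end_query).
 */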
1805 if (radv_query_pool_needs_gds(cmd_buffer->device, pool)) {
1806 int idx = radv_get_pipeline_statistics_index(VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT);
1807
1808 /* Make sure GDS is idle before copying the value. */
1809 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
1810 RADV_CMD_FLAG_INV_L2;
1811 si_emit_cache_flush(cmd_buffer);
1812
1813 va += 8 * idx;
1814
1815 si_cs_emit_write_event_eop(cs,
1816 cmd_buffer->device->physical_device->rad_info.chip_class,
1817 radv_cmd_buffer_uses_mec(cmd_buffer),
1818 V_028A90_PS_DONE, 0,
1819 EOP_DST_SEL_TC_L2,
1820 EOP_DATA_SEL_GDS,
1821 va, EOP_DATA_GDS(0, 1), 0);
1822
1823 /* Record that the command buffer needs GDS. */
1824 cmd_buffer->gds_needed = true;
1825
1826 cmd_buffer->state.active_pipeline_gds_queries++;
1827 }
1828 break;
1829 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1830 radeon_check_space(cmd_buffer->device->ws, cs, 4);
1831
1832 assert(index < MAX_SO_STREAMS);
1833
1834 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
1835 radeon_emit(cs, EVENT_TYPE(event_type_for_stream(index)) | EVENT_INDEX(3));
1836 radeon_emit(cs, va);
1837 radeon_emit(cs, va >> 32);
1838 break;
1839 default:
1840 unreachable("beginning unhandled query type");
1841 }
1842
1843 }
1844
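/* emit_end_query writes the second half of each begin/end pair (occlusion:
 * va + 8, pipeline statistics: va + pipelinestat_block_size, transform
 * feedback: va + 16), sets the availability word for pipeline statistics
 * via an end-of-pipe event, and records in active_query_flush_bits the
 * flushes a later reset of this pool will have to wait for.
 */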
1845 static void emit_end_query(struct radv_cmd_buffer *cmd_buffer,
1846 struct radv_query_pool *pool,
1847 uint64_t va, uint64_t avail_va,
1848 VkQueryType query_type, uint32_t index)
1849 {
1850 struct radeon_cmdbuf *cs = cmd_buffer->cs;
1851 switch (query_type) {
1852 case VK_QUERY_TYPE_OCCLUSION:
1853 radeon_check_space(cmd_buffer->device->ws, cs, 14);
1854
1855 cmd_buffer->state.active_occlusion_queries--;
1856 if (cmd_buffer->state.active_occlusion_queries == 0) {
1857 radv_set_db_count_control(cmd_buffer);
1858
1859 /* Reset the perfect occlusion queries hint now that no
1860 * queries are active.
1861 */
1862 cmd_buffer->state.perfect_occlusion_queries_enabled = false;
1863 }
1864
1865 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
1866 radeon_emit(cs, EVENT_TYPE(V_028A90_ZPASS_DONE) | EVENT_INDEX(1));
1867 radeon_emit(cs, va + 8);
1868 radeon_emit(cs, (va + 8) >> 32);
1869
1870 break;
1871 case VK_QUERY_TYPE_PIPELINE_STATISTICS:
1872 radeon_check_space(cmd_buffer->device->ws, cs, 16);
1873
1874 cmd_buffer->state.active_pipeline_queries--;
1875 if (cmd_buffer->state.active_pipeline_queries == 0) {
1876 cmd_buffer->state.flush_bits &= ~RADV_CMD_FLAG_START_PIPELINE_STATS;
1877 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_STOP_PIPELINE_STATS;
1878 }
1879 va += pipelinestat_block_size;
1880
1881 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
1882 radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
1883 radeon_emit(cs, va);
1884 radeon_emit(cs, va >> 32);
1885
1886 si_cs_emit_write_event_eop(cs,
1887 cmd_buffer->device->physical_device->rad_info.chip_class,
1888 radv_cmd_buffer_uses_mec(cmd_buffer),
1889 V_028A90_BOTTOM_OF_PIPE_TS, 0,
1890 EOP_DST_SEL_MEM,
1891 EOP_DATA_SEL_VALUE_32BIT,
1892 avail_va, 1,
1893 cmd_buffer->gfx9_eop_bug_va);
1894
1895 if (radv_query_pool_needs_gds(cmd_buffer->device, pool)) {
1896 int idx = radv_get_pipeline_statistics_index(VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT);
1897
1898 /* Make sure GDS is idle before copying the value. */
1899 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
1900 RADV_CMD_FLAG_INV_L2;
1901 si_emit_cache_flush(cmd_buffer);
1902
1903 va += 8 * idx;
1904
1905 si_cs_emit_write_event_eop(cs,
1906 cmd_buffer->device->physical_device->rad_info.chip_class,
1907 radv_cmd_buffer_uses_mec(cmd_buffer),
1908 V_028A90_PS_DONE, 0,
1909 EOP_DST_SEL_TC_L2,
1910 EOP_DATA_SEL_GDS,
1911 va, EOP_DATA_GDS(0, 1), 0);
1912
1913 cmd_buffer->state.active_pipeline_gds_queries--;
1914 }
1915 break;
1916 case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
1917 radeon_check_space(cmd_buffer->device->ws, cs, 4);
1918
1919 assert(index < MAX_SO_STREAMS);
1920
1921 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
1922 radeon_emit(cs, EVENT_TYPE(event_type_for_stream(index)) | EVENT_INDEX(3));
1923 radeon_emit(cs, (va + 16));
1924 radeon_emit(cs, (va + 16) >> 32);
1925 break;
1926 default:
1927 unreachable("ending unhandled query type");
1928 }
1929
1930 cmd_buffer->active_query_flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
1931 RADV_CMD_FLAG_CS_PARTIAL_FLUSH |
1932 RADV_CMD_FLAG_INV_L2 |
1933 RADV_CMD_FLAG_INV_VCACHE;
1934 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
1935 cmd_buffer->active_query_flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB |
1936 RADV_CMD_FLAG_FLUSH_AND_INV_DB;
1937 }
1938 }
1939
1940 void radv_CmdBeginQueryIndexedEXT(
1941 VkCommandBuffer commandBuffer,
1942 VkQueryPool queryPool,
1943 uint32_t query,
1944 VkQueryControlFlags flags,
1945 uint32_t index)
1946 {
1947 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
1948 RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
1949 struct radeon_cmdbuf *cs = cmd_buffer->cs;
1950 uint64_t va = radv_buffer_get_va(pool->bo);
1951
1952 radv_cs_add_buffer(cmd_buffer->device->ws, cs, pool->bo);
1953
1954 emit_query_flush(cmd_buffer, pool);
1955
1956 va += pool->stride * query;
1957
1958 emit_begin_query(cmd_buffer, pool, va, pool->type, flags, index);
1959 }
1960
1961 void radv_CmdBeginQuery(
1962 VkCommandBuffer commandBuffer,
1963 VkQueryPool queryPool,
1964 uint32_t query,
1965 VkQueryControlFlags flags)
1966 {
1967 radv_CmdBeginQueryIndexedEXT(commandBuffer, queryPool, query, flags, 0);
1968 }
1969
1970 void radv_CmdEndQueryIndexedEXT(
1971 VkCommandBuffer commandBuffer,
1972 VkQueryPool queryPool,
1973 uint32_t query,
1974 uint32_t index)
1975 {
1976 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
1977 RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
1978 uint64_t va = radv_buffer_get_va(pool->bo);
1979 uint64_t avail_va = va + pool->availability_offset + 4 * query;
1980 va += pool->stride * query;
1981
1982 /* We do not need to add the pool BO to the list because the query must
1983 * currently be active, which means the BO is already in the list.
1984 */
1985 emit_end_query(cmd_buffer, pool, va, avail_va, pool->type, index);
1986
1987 /*
1988 * For multiview we have to emit a query for each bit in the mask;
1989 * however, the first query we emit will get the totals for all the
1990 * operations, so we don't want to get a real value in the other
1991 * queries. This emits a fake begin/end sequence so the waiting
1992 * code gets a completed query value and doesn't hang, but the
1993 * query returns 0.
1994 */
1995 if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
1999 for (unsigned i = 1; i < util_bitcount(cmd_buffer->state.subpass->view_mask); i++) {
2000 va += pool->stride;
2001 avail_va += 4;
2002 emit_begin_query(cmd_buffer, pool, va, pool->type, 0, 0);
2003 emit_end_query(cmd_buffer, pool, va, avail_va, pool->type, 0);
2004 }
2005 }
2006 }
2007
2008 void radv_CmdEndQuery(
2009 VkCommandBuffer commandBuffer,
2010 VkQueryPool queryPool,
2011 uint32_t query)
2012 {
2013 radv_CmdEndQueryIndexedEXT(commandBuffer, queryPool, query, 0);
2014 }
2015
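/* Timestamps: TOP_OF_PIPE is implemented with a COPY_DATA from the GPU
 * clock (COPY_DATA_TIMESTAMP); every other stage uses a bottom-of-pipe
 * EOP timestamp write. With multiview enabled, one timestamp is written
 * per view bit so the copy/readback paths above see a value for every
 * slot.
 */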
2016 void radv_CmdWriteTimestamp(
2017 VkCommandBuffer commandBuffer,
2018 VkPipelineStageFlagBits pipelineStage,
2019 VkQueryPool queryPool,
2020 uint32_t query)
2021 {
2022 RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
2023 RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
2024 bool mec = radv_cmd_buffer_uses_mec(cmd_buffer);
2025 struct radeon_cmdbuf *cs = cmd_buffer->cs;
2026 uint64_t va = radv_buffer_get_va(pool->bo);
2027 uint64_t query_va = va + pool->stride * query;
2028
2029 radv_cs_add_buffer(cmd_buffer->device->ws, cs, pool->bo);
2030
2031 emit_query_flush(cmd_buffer, pool);
2032
2033 int num_queries = 1;
2034 if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask)
2035 num_queries = util_bitcount(cmd_buffer->state.subpass->view_mask);
2036
2037 ASSERTED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 28 * num_queries);
2038
2039 for (unsigned i = 0; i < num_queries; i++) {
2040 switch(pipelineStage) {
2041 case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
2042 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
2043 radeon_emit(cs, COPY_DATA_COUNT_SEL | COPY_DATA_WR_CONFIRM |
2044 COPY_DATA_SRC_SEL(COPY_DATA_TIMESTAMP) |
2045 COPY_DATA_DST_SEL(V_370_MEM));
2046 radeon_emit(cs, 0);
2047 radeon_emit(cs, 0);
2048 radeon_emit(cs, query_va);
2049 radeon_emit(cs, query_va >> 32);
2050 break;
2051 default:
2052 si_cs_emit_write_event_eop(cs,
2053 cmd_buffer->device->physical_device->rad_info.chip_class,
2054 mec,
2055 V_028A90_BOTTOM_OF_PIPE_TS, 0,
2056 EOP_DST_SEL_MEM,
2057 EOP_DATA_SEL_TIMESTAMP,
2058 query_va, 0,
2059 cmd_buffer->gfx9_eop_bug_va);
2060 break;
2061 }
2062 query_va += pool->stride;
2063 }
2064
2065 cmd_buffer->active_query_flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
2066 RADV_CMD_FLAG_CS_PARTIAL_FLUSH |
2067 RADV_CMD_FLAG_INV_L2 |
2068 RADV_CMD_FLAG_INV_VCACHE;
2069 if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
2070 cmd_buffer->active_query_flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB |
2071 RADV_CMD_FLAG_FLUSH_AND_INV_DB;
2072 }
2073
2074 assert(cmd_buffer->cs->cdw <= cdw_max);
2075 }