/* mesa: src/gallium/drivers/radeonsi/gfx10_query.c */
/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include <stddef.h>

#include "si_pipe.h"
#include "si_query.h"
#include "util/u_memory.h"
#include "util/u_suballoc.h"
#include "sid.h"

/**
 * The query buffer is written to by ESGS NGG shaders with statistics about
 * generated and (streamout-)emitted primitives.
 *
 * The context maintains a ring of these query buffers, and queries simply
 * point into the ring, allowing an arbitrary number of queries to be active
 * without additional GPU cost.
 */
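
/* Ring discipline (see gfx10_alloc_query_buffer and
 * gfx10_release_query_buffers): the newest buffer (list tail) is the one
 * currently being appended to; the oldest buffer (list head) is kept around
 * so it can be recycled once the GPU is done with it; buffers in between are
 * freed as soon as their refcount drops to zero.
 */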
struct gfx10_sh_query_buffer {
   struct list_head list;
   struct si_resource *buf;
   unsigned refcount;

   /* Offset into the buffer in bytes; points at the first un-emitted entry. */
   unsigned head;
};

/* Memory layout of the query buffer. Must be kept in sync with shaders
 * (including QBO shaders) and should be aligned to cachelines.
 *
 * The somewhat awkward memory layout is for compatibility with the
 * SET_PREDICATION packet, which also means that we're setting the high bit
 * of all those values unconditionally.
 */
struct gfx10_sh_query_buffer_mem {
   struct {
      uint64_t generated_primitives_start_dummy;
      uint64_t emitted_primitives_start_dummy;
      uint64_t generated_primitives;
      uint64_t emitted_primitives;
   } stream[4];
   uint32_t fence; /* bottom-of-pipe fence: set to ~0 when draws have finished */
   uint32_t pad[31];
};
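
/* Illustrative compile-time check (an assumption: requires a C11 compiler
 * with _Static_assert): the layout above is 4 * (4 * 8) + 4 + 31 * 4 = 256
 * bytes, i.e. a whole number of 64-byte cachelines, as the comment above
 * requires.
 */
_Static_assert(sizeof(struct gfx10_sh_query_buffer_mem) == 256,
               "gfx10_sh_query_buffer_mem must be 256 bytes");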

/* Shader-based queries. */
struct gfx10_sh_query {
   struct si_query b;

   struct gfx10_sh_query_buffer *first;
   struct gfx10_sh_query_buffer *last;
   unsigned first_begin;
   unsigned last_end;

   unsigned stream;
};

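/* Emit callback for the shader query atom: the draw being emitted writes one
 * gfx10_sh_query_buffer_mem entry, so advance the tail buffer's head past it.
 */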
static void emit_shader_query(struct si_context *sctx)
{
   assert(!LIST_IS_EMPTY(&sctx->shader_query_buffers));

   struct gfx10_sh_query_buffer *qbuf = list_last_entry(&sctx->shader_query_buffers,
                                                        struct gfx10_sh_query_buffer, list);
   qbuf->head += sizeof(struct gfx10_sh_query_buffer_mem);
}

static void gfx10_release_query_buffers(struct si_context *sctx,
                                        struct gfx10_sh_query_buffer *first,
                                        struct gfx10_sh_query_buffer *last)
{
   while (first) {
      struct gfx10_sh_query_buffer *qbuf = first;
      if (first != last)
         first = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list);
      else
         first = NULL;

      qbuf->refcount--;
      if (qbuf->refcount)
         continue;

      if (qbuf->list.next == &sctx->shader_query_buffers)
         continue; /* keep the most recent buffer; it may not be full yet */
      if (qbuf->list.prev == &sctx->shader_query_buffers)
         continue; /* keep the oldest buffer for recycling */

      LIST_DEL(&qbuf->list);
      si_resource_reference(&qbuf->buf, NULL);
      FREE(qbuf);
   }
}

static bool gfx10_alloc_query_buffer(struct si_context *sctx)
{
   if (si_is_atom_dirty(sctx, &sctx->atoms.s.shader_query))
      return true;

   struct gfx10_sh_query_buffer *qbuf = NULL;

   if (!LIST_IS_EMPTY(&sctx->shader_query_buffers)) {
      qbuf = list_last_entry(&sctx->shader_query_buffers,
                             struct gfx10_sh_query_buffer, list);
      if (qbuf->head + sizeof(struct gfx10_sh_query_buffer_mem) <= qbuf->buf->b.b.width0)
         goto success;

      qbuf = list_first_entry(&sctx->shader_query_buffers,
                              struct gfx10_sh_query_buffer, list);
      if (!qbuf->refcount &&
          !si_rings_is_buffer_referenced(sctx, qbuf->buf->buf, RADEON_USAGE_READWRITE) &&
          sctx->ws->buffer_wait(qbuf->buf->buf, 0, RADEON_USAGE_READWRITE)) {
         /* Can immediately re-use the oldest buffer */
         LIST_DEL(&qbuf->list);
      } else {
         qbuf = NULL;
      }
   }

   if (!qbuf) {
      qbuf = CALLOC_STRUCT(gfx10_sh_query_buffer);
      if (unlikely(!qbuf))
         return false;

      struct si_screen *screen = sctx->screen;
      unsigned buf_size = MAX2(sizeof(struct gfx10_sh_query_buffer_mem),
                               screen->info.min_alloc_size);
      qbuf->buf = si_resource(
         pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size));
      if (unlikely(!qbuf->buf)) {
         FREE(qbuf);
         return false;
      }
   }

   /* The buffer is currently unused by the GPU. Initialize it.
    *
    * We need to set the high bit of all the primitive counters for
    * compatibility with the SET_PREDICATION packet.
    */
   uint64_t *results = sctx->ws->buffer_map(qbuf->buf->buf, NULL,
                                            PIPE_TRANSFER_WRITE |
                                            PIPE_TRANSFER_UNSYNCHRONIZED);
   assert(results);

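   /* Each 256-byte entry is 32 uint64 slots: slots 0-15 are the per-stream
    * counters (high bit pre-set), and slot 16 aliases the 32-bit fence plus
    * the first pad dword, cleared to 0.
    */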
   for (unsigned i = 0, e = qbuf->buf->b.b.width0 / sizeof(struct gfx10_sh_query_buffer_mem);
        i < e; ++i) {
      for (unsigned j = 0; j < 16; ++j)
         results[32 * i + j] = (uint64_t)1 << 63;
      results[32 * i + 16] = 0;
   }

   LIST_ADDTAIL(&qbuf->list, &sctx->shader_query_buffers);
   qbuf->head = 0;
   qbuf->refcount = sctx->num_active_shader_queries;

success:;
   struct pipe_shader_buffer sbuf;
   sbuf.buffer = &qbuf->buf->b.b;
   sbuf.buffer_offset = qbuf->head;
   sbuf.buffer_size = sizeof(struct gfx10_sh_query_buffer_mem);
   si_set_rw_shader_buffer(sctx, GFX10_GS_QUERY_BUF, &sbuf);

   si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_query);
   return true;
}

static void gfx10_sh_query_destroy(struct si_context *sctx, struct si_query *rquery)
{
   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
   gfx10_release_query_buffers(sctx, query->first, query->last);
   FREE(query);
}

static bool gfx10_sh_query_begin(struct si_context *sctx, struct si_query *rquery)
{
   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;

   gfx10_release_query_buffers(sctx, query->first, query->last);
   query->first = query->last = NULL;

   if (unlikely(!gfx10_alloc_query_buffer(sctx)))
      return false;

   query->first = list_last_entry(&sctx->shader_query_buffers,
                                  struct gfx10_sh_query_buffer, list);
   query->first_begin = query->first->head;

   sctx->num_active_shader_queries++;
   query->first->refcount++;

   return true;
}

static bool gfx10_sh_query_end(struct si_context *sctx, struct si_query *rquery)
{
   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;

   if (unlikely(!query->first))
      return false; /* earlier out of memory error */

   query->last = list_last_entry(&sctx->shader_query_buffers,
                                 struct gfx10_sh_query_buffer, list);
   query->last_end = query->last->head;

   /* Signal the fence of the previous chunk */
   if (query->last_end != 0) {
      uint64_t fence_va = query->last->buf->gpu_address;
      fence_va += query->last_end - sizeof(struct gfx10_sh_query_buffer_mem);
      fence_va += offsetof(struct gfx10_sh_query_buffer_mem, fence);
      si_cp_release_mem(sctx, sctx->gfx_cs,
                        V_028A90_BOTTOM_OF_PIPE_TS, 0,
                        EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
                        EOP_DATA_SEL_VALUE_32BIT,
                        query->last->buf, fence_va, 0xffffffff,
                        PIPE_QUERY_GPU_FINISHED);
   }

   sctx->num_active_shader_queries--;

   if (sctx->num_active_shader_queries > 0) {
      gfx10_alloc_query_buffer(sctx);
   } else {
      si_set_rw_shader_buffer(sctx, GFX10_GS_QUERY_BUF, NULL);

      /* If a query_begin is followed by a query_end without a draw
       * in-between, we need to clear the atom to ensure that the
       * next query_begin will re-initialize the shader buffer. */
      si_set_atom_dirty(sctx, &sctx->atoms.s.shader_query, false);
   }

   return true;
}

static void gfx10_sh_query_add_result(struct gfx10_sh_query *query,
                                      struct gfx10_sh_query_buffer_mem *qmem,
                                      union pipe_query_result *result)
{
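   /* Strip bit 63, which the buffer initialization pre-sets for
    * SET_PREDICATION compatibility. */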
   static const uint64_t mask = ((uint64_t)1 << 63) - 1;

   switch (query->b.type) {
   case PIPE_QUERY_PRIMITIVES_EMITTED:
      result->u64 += qmem->stream[query->stream].emitted_primitives & mask;
      break;
   case PIPE_QUERY_PRIMITIVES_GENERATED:
      result->u64 += qmem->stream[query->stream].generated_primitives & mask;
      break;
   case PIPE_QUERY_SO_STATISTICS:
      result->so_statistics.num_primitives_written +=
         qmem->stream[query->stream].emitted_primitives & mask;
      result->so_statistics.primitives_storage_needed +=
         qmem->stream[query->stream].generated_primitives & mask;
      break;
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
      result->b |= qmem->stream[query->stream].emitted_primitives !=
                   qmem->stream[query->stream].generated_primitives;
      break;
   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
      for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
         result->b |= qmem->stream[stream].emitted_primitives !=
                      qmem->stream[stream].generated_primitives;
      }
      break;
   default:
      assert(0);
   }
}

static bool gfx10_sh_query_get_result(struct si_context *sctx, struct si_query *rquery,
                                      bool wait, union pipe_query_result *result)
{
   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;

   util_query_clear_result(result, query->b.type);

   if (unlikely(!query->first))
      return false; /* earlier out of memory error */
   assert(query->last);

   for (struct gfx10_sh_query_buffer *qbuf = query->last;;
        qbuf = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.prev, list)) {
      unsigned usage = PIPE_TRANSFER_READ |
                       (wait ? 0 : PIPE_TRANSFER_DONTBLOCK);
      void *map;

      if (rquery->b.flushed)
         map = sctx->ws->buffer_map(qbuf->buf->buf, NULL, usage);
      else
         map = si_buffer_map_sync_with_rings(sctx, qbuf->buf, usage);

      if (!map)
         return false;

      unsigned results_begin = 0;
      unsigned results_end = qbuf->head;
      if (qbuf == query->first)
         results_begin = query->first_begin;
      if (qbuf == query->last)
         results_end = query->last_end;

      while (results_begin != results_end) {
         struct gfx10_sh_query_buffer_mem *qmem = map + results_begin;
         results_begin += sizeof(*qmem);

         gfx10_sh_query_add_result(query, qmem, result);
      }

      if (qbuf == query->first)
         break;
   }

   return true;
}

static void gfx10_sh_query_get_result_resource(struct si_context *sctx,
                                               struct si_query *rquery,
                                               bool wait,
                                               enum pipe_query_value_type result_type,
                                               int index,
                                               struct pipe_resource *resource,
                                               unsigned offset)
{
   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
   struct si_qbo_state saved_state = {};
   struct pipe_resource *tmp_buffer = NULL;
   unsigned tmp_buffer_offset = 0;

   if (!sctx->sh_query_result_shader) {
      sctx->sh_query_result_shader = gfx10_create_sh_query_result_cs(sctx);
      if (!sctx->sh_query_result_shader)
         return;
   }

   if (query->first != query->last) {
      u_suballocator_alloc(sctx->allocator_zeroed_memory, 16, 16,
                           &tmp_buffer_offset, &tmp_buffer);
      if (!tmp_buffer)
         return;
   }

   si_save_qbo_state(sctx, &saved_state);

   /* Pre-fill the constants configuring the shader behavior. */
   struct {
      uint32_t config;
      uint32_t offset;
      uint32_t chain;
      uint32_t result_count;
   } consts;
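   /* How the result shader interprets these constants (summarized from the
    * switch below; the shader itself is built in
    * gfx10_create_sh_query_result_cs): config selects the operation
    * (0 = accumulate a counter, 1 = availability check, 2 = overflow
    * predicate for one stream, 3 = overflow predicate across all streams;
    * bit 3 requests a 64-bit result), offset is the byte offset of the first
    * counter dword to read, chain flags whether this dispatch continues from
    * (bit 0) or into (bit 1) another buffer, and result_count is the number
    * of buffer entries to process.
    */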
   struct pipe_constant_buffer constant_buffer = {};

   if (index >= 0) {
      switch (query->b.type) {
      case PIPE_QUERY_PRIMITIVES_GENERATED:
         consts.offset = sizeof(uint32_t) * query->stream;
         consts.config = 0;
         break;
      case PIPE_QUERY_PRIMITIVES_EMITTED:
         consts.offset = sizeof(uint32_t) * (4 + query->stream);
         consts.config = 0;
         break;
      case PIPE_QUERY_SO_STATISTICS:
         consts.offset = sizeof(uint32_t) * (4 * index + query->stream);
         consts.config = 0;
         break;
      case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
         consts.offset = sizeof(uint32_t) * query->stream;
         consts.config = 2;
         break;
      case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
         consts.offset = 0;
         consts.config = 3;
         break;
      default: unreachable("bad query type");
      }
   } else {
      /* Check result availability. */
      consts.offset = 0;
      consts.config = 1;
   }

   if (result_type == PIPE_QUERY_TYPE_I64 || result_type == PIPE_QUERY_TYPE_U64)
      consts.config |= 8;

   constant_buffer.buffer_size = sizeof(consts);
   constant_buffer.user_buffer = &consts;

   /* Pre-fill the SSBOs and grid. */
   struct pipe_shader_buffer ssbo[3];
   struct pipe_grid_info grid = {};

   ssbo[1].buffer = tmp_buffer;
   ssbo[1].buffer_offset = tmp_buffer_offset;
   ssbo[1].buffer_size = 16;

   ssbo[2] = ssbo[1];

   sctx->b.bind_compute_state(&sctx->b, sctx->sh_query_result_shader);

   grid.block[0] = 1;
   grid.block[1] = 1;
   grid.block[2] = 1;
   grid.grid[0] = 1;
   grid.grid[1] = 1;
   grid.grid[2] = 1;

   struct gfx10_sh_query_buffer *qbuf = query->first;
   for (;;) {
      unsigned begin = qbuf == query->first ? query->first_begin : 0;
      unsigned end = qbuf == query->last ? query->last_end : qbuf->buf->b.b.width0;
      if (!end)
         continue;

      ssbo[0].buffer = &qbuf->buf->b.b;
      ssbo[0].buffer_offset = begin;
      ssbo[0].buffer_size = end - begin;

      consts.result_count = (end - begin) / sizeof(struct gfx10_sh_query_buffer_mem);
      consts.chain = 0;
      if (qbuf != query->first)
         consts.chain |= 1;
      if (qbuf != query->last)
         consts.chain |= 2;

      if (qbuf == query->last) {
         ssbo[2].buffer = resource;
         ssbo[2].buffer_offset = offset;
         ssbo[2].buffer_size = 8;
      }

      sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, &constant_buffer);
      sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, 3, ssbo, 0x6);

      if (wait) {
         uint64_t va;

         /* Wait for result availability. Wait only for readiness
          * of the last entry, since the fence writes should be
          * serialized in the CP.
          */
         va = qbuf->buf->gpu_address;
         va += end - sizeof(struct gfx10_sh_query_buffer_mem);
         va += offsetof(struct gfx10_sh_query_buffer_mem, fence);

         si_cp_wait_mem(sctx, sctx->gfx_cs, va, 0x00000001, 0x00000001, 0);
      }

      sctx->b.launch_grid(&sctx->b, &grid);
      sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;

      if (qbuf == query->last)
         break;
      qbuf = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list);
   }

   si_restore_qbo_state(sctx, &saved_state);
   pipe_resource_reference(&tmp_buffer, NULL);
}

static const struct si_query_ops gfx10_sh_query_ops = {
   .destroy = gfx10_sh_query_destroy,
   .begin = gfx10_sh_query_begin,
   .end = gfx10_sh_query_end,
   .get_result = gfx10_sh_query_get_result,
   .get_result_resource = gfx10_sh_query_get_result_resource,
};

struct pipe_query *gfx10_sh_query_create(struct si_screen *screen,
                                         enum pipe_query_type query_type,
                                         unsigned index)
{
   struct gfx10_sh_query *query = CALLOC_STRUCT(gfx10_sh_query);
   if (unlikely(!query))
      return NULL;

   query->b.ops = &gfx10_sh_query_ops;
   query->b.type = query_type;
   query->stream = index;

   return (struct pipe_query *)query;
}
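
/* For orientation, a minimal sketch of how a gallium frontend would drive
 * these ops through the pipe_context query API (illustrative only; the exact
 * dispatch from pipe_context to si_query_ops lives in si_query.c):
 *
 *    struct pipe_query *q =
 *       ctx->create_query(ctx, PIPE_QUERY_PRIMITIVES_EMITTED, 0);
 *    ctx->begin_query(ctx, q);                      // gfx10_sh_query_begin
 *    // ... draws with NGG streamout ...
 *    ctx->end_query(ctx, q);                        // gfx10_sh_query_end
 *    union pipe_query_result result;
 *    ctx->get_query_result(ctx, q, true, &result);  // gfx10_sh_query_get_result
 *    ctx->destroy_query(ctx, q);                    // gfx10_sh_query_destroy
 */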

void gfx10_init_query(struct si_context *sctx)
{
   LIST_INITHEAD(&sctx->shader_query_buffers);
   sctx->atoms.s.shader_query.emit = emit_shader_query;
}

void gfx10_destroy_query(struct si_context *sctx)
{
   while (!LIST_IS_EMPTY(&sctx->shader_query_buffers)) {
      struct gfx10_sh_query_buffer *qbuf =
         list_first_entry(&sctx->shader_query_buffers,
                          struct gfx10_sh_query_buffer, list);
      LIST_DEL(&qbuf->list);

      assert(!qbuf->refcount);
      si_resource_reference(&qbuf->buf, NULL);
      FREE(qbuf);
   }
}