util: remove LIST_ADDTAIL macro
[mesa.git] src/gallium/drivers/radeonsi/gfx10_query.c
/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include <stddef.h>

#include "si_pipe.h"
#include "si_query.h"
#include "util/u_memory.h"
#include "util/u_suballoc.h"
#include "sid.h"

/**
 * The query buffer is written to by ESGS NGG shaders with statistics about
 * generated and (streamout-)emitted primitives.
 *
 * The context maintains a ring of these query buffers, and queries simply
 * point into the ring, allowing an arbitrary number of queries to be active
 * without additional GPU cost.
 */
struct gfx10_sh_query_buffer {
   struct list_head list;
   struct si_resource *buf;
   unsigned refcount;

   /* Offset into the buffer in bytes; points at the first un-emitted entry. */
   unsigned head;
};

/* Memory layout of the query buffer. Must be kept in sync with shaders
 * (including QBO shaders) and should be aligned to cachelines.
 *
 * The somewhat awkward memory layout is for compatibility with the
 * SET_PREDICATION packet, which also means that we're setting the high bit
 * of all those values unconditionally.
 */
struct gfx10_sh_query_buffer_mem {
   struct {
      uint64_t generated_primitives_start_dummy;
      uint64_t emitted_primitives_start_dummy;
      uint64_t generated_primitives;
      uint64_t emitted_primitives;
   } stream[4];
   uint32_t fence; /* bottom-of-pipe fence: set to ~0 when draws have finished */
   uint32_t pad[31];
};

/* Shader-based queries. */
struct gfx10_sh_query {
   struct si_query b;

   struct gfx10_sh_query_buffer *first;
   struct gfx10_sh_query_buffer *last;
   unsigned first_begin;
   unsigned last_end;

   unsigned stream;
};

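/* Emit callback for the shader_query atom: the next draw will write one new
 * entry, so advance the head of the most recent query buffer past it. */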
static void emit_shader_query(struct si_context *sctx)
{
   assert(!LIST_IS_EMPTY(&sctx->shader_query_buffers));

   struct gfx10_sh_query_buffer *qbuf = list_last_entry(&sctx->shader_query_buffers,
                                                        struct gfx10_sh_query_buffer, list);
   qbuf->head += sizeof(struct gfx10_sh_query_buffer_mem);
}

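/* Drop one reference from each buffer in the chain [first, last]. Unreferenced
 * buffers are freed, except the newest one (it may not be full yet) and the
 * oldest one (kept around for recycling). */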
static void gfx10_release_query_buffers(struct si_context *sctx,
                                        struct gfx10_sh_query_buffer *first,
                                        struct gfx10_sh_query_buffer *last)
{
   while (first) {
      struct gfx10_sh_query_buffer *qbuf = first;
      if (first != last)
         first = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list);
      else
         first = NULL;

      qbuf->refcount--;
      if (qbuf->refcount)
         continue;

      if (qbuf->list.next == &sctx->shader_query_buffers)
         continue; /* keep the most recent buffer; it may not be full yet */
      if (qbuf->list.prev == &sctx->shader_query_buffers)
         continue; /* keep the oldest buffer for recycling */

      LIST_DEL(&qbuf->list);
      si_resource_reference(&qbuf->buf, NULL);
      FREE(qbuf);
   }
}

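/* Make sure the tail of the buffer ring has room for one more entry and bind
 * it as the GS query buffer, recycling the oldest idle buffer or allocating a
 * new one if needed. */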
static bool gfx10_alloc_query_buffer(struct si_context *sctx)
{
   if (si_is_atom_dirty(sctx, &sctx->atoms.s.shader_query))
      return true;

   struct gfx10_sh_query_buffer *qbuf = NULL;

   if (!LIST_IS_EMPTY(&sctx->shader_query_buffers)) {
      qbuf = list_last_entry(&sctx->shader_query_buffers,
                             struct gfx10_sh_query_buffer, list);
      if (qbuf->head + sizeof(struct gfx10_sh_query_buffer_mem) <= qbuf->buf->b.b.width0)
         goto success;

      qbuf = list_first_entry(&sctx->shader_query_buffers,
                              struct gfx10_sh_query_buffer, list);
      if (!qbuf->refcount &&
          !si_rings_is_buffer_referenced(sctx, qbuf->buf->buf, RADEON_USAGE_READWRITE) &&
          sctx->ws->buffer_wait(qbuf->buf->buf, 0, RADEON_USAGE_READWRITE)) {
         /* Can immediately re-use the oldest buffer */
         LIST_DEL(&qbuf->list);
      } else {
         qbuf = NULL;
      }
   }

   if (!qbuf) {
      qbuf = CALLOC_STRUCT(gfx10_sh_query_buffer);
      if (unlikely(!qbuf))
         return false;

      struct si_screen *screen = sctx->screen;
      unsigned buf_size = MAX2(sizeof(struct gfx10_sh_query_buffer_mem),
                               screen->info.min_alloc_size);
      qbuf->buf = si_resource(
         pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size));
      if (unlikely(!qbuf->buf)) {
         FREE(qbuf);
         return false;
      }
   }

   /* The buffer is currently unused by the GPU. Initialize it.
    *
    * We need to set the high bit of all the primitive counters for
    * compatibility with the SET_PREDICATION packet.
    */
   uint64_t *results = sctx->ws->buffer_map(qbuf->buf->buf, NULL,
                                            PIPE_TRANSFER_WRITE |
                                            PIPE_TRANSFER_UNSYNCHRONIZED);
   assert(results);

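   /* Each 256-byte entry is 32 qwords: qwords 0-15 are the per-stream
    * counters (only the high bit set), qword 16 holds the fence dword in its
    * low half, and the rest is padding. */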
   for (unsigned i = 0, e = qbuf->buf->b.b.width0 / sizeof(struct gfx10_sh_query_buffer_mem);
        i < e; ++i) {
      for (unsigned j = 0; j < 16; ++j)
         results[32 * i + j] = (uint64_t)1 << 63;
      results[32 * i + 16] = 0;
   }

   list_addtail(&qbuf->list, &sctx->shader_query_buffers);
   qbuf->head = 0;
   qbuf->refcount = sctx->num_active_shader_queries;

success:;
   struct pipe_shader_buffer sbuf;
   sbuf.buffer = &qbuf->buf->b.b;
   sbuf.buffer_offset = qbuf->head;
   sbuf.buffer_size = sizeof(struct gfx10_sh_query_buffer_mem);
   si_set_rw_shader_buffer(sctx, GFX10_GS_QUERY_BUF, &sbuf);
   sctx->current_vs_state |= S_VS_STATE_STREAMOUT_QUERY_ENABLED(1);

   si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_query);
   return true;
}

static void gfx10_sh_query_destroy(struct si_context *sctx, struct si_query *rquery)
{
   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
   gfx10_release_query_buffers(sctx, query->first, query->last);
   FREE(query);
}

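/* Begin a query: drop any results from a previous begin/end cycle and record
 * where in the buffer ring the new results start. */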
static bool gfx10_sh_query_begin(struct si_context *sctx, struct si_query *rquery)
{
   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;

   gfx10_release_query_buffers(sctx, query->first, query->last);
   query->first = query->last = NULL;

   if (unlikely(!gfx10_alloc_query_buffer(sctx)))
      return false;

   query->first = list_last_entry(&sctx->shader_query_buffers,
                                  struct gfx10_sh_query_buffer, list);
   query->first_begin = query->first->head;

   sctx->num_active_shader_queries++;
   query->first->refcount++;

   return true;
}

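/* End a query: record where its results end and make the CP write the fence
 * of the last entry once the preceding draws have finished. */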
static bool gfx10_sh_query_end(struct si_context *sctx, struct si_query *rquery)
{
   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;

   if (unlikely(!query->first))
      return false; /* earlier out of memory error */

   query->last = list_last_entry(&sctx->shader_query_buffers,
                                 struct gfx10_sh_query_buffer, list);
   query->last_end = query->last->head;

   /* Signal the fence of the previous chunk */
   if (query->last_end != 0) {
      uint64_t fence_va = query->last->buf->gpu_address;
      fence_va += query->last_end - sizeof(struct gfx10_sh_query_buffer_mem);
      fence_va += offsetof(struct gfx10_sh_query_buffer_mem, fence);
      si_cp_release_mem(sctx, sctx->gfx_cs,
                        V_028A90_BOTTOM_OF_PIPE_TS, 0,
                        EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
                        EOP_DATA_SEL_VALUE_32BIT,
                        query->last->buf, fence_va, 0xffffffff,
                        PIPE_QUERY_GPU_FINISHED);
   }

   sctx->num_active_shader_queries--;

   if (sctx->num_active_shader_queries > 0) {
      gfx10_alloc_query_buffer(sctx);
   } else {
      si_set_rw_shader_buffer(sctx, GFX10_GS_QUERY_BUF, NULL);
      sctx->current_vs_state &= C_VS_STATE_STREAMOUT_QUERY_ENABLED;

      /* If a query_begin is followed by a query_end without a draw
       * in-between, we need to clear the atom to ensure that the
       * next query_begin will re-initialize the shader buffer. */
      si_set_atom_dirty(sctx, &sctx->atoms.s.shader_query, false);
   }

   return true;
}

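/* Accumulate a single buffer entry into the result, masking off the high bit
 * that is preset for SET_PREDICATION compatibility. */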
static void gfx10_sh_query_add_result(struct gfx10_sh_query *query,
                                      struct gfx10_sh_query_buffer_mem *qmem,
                                      union pipe_query_result *result)
{
   static const uint64_t mask = ((uint64_t)1 << 63) - 1;

   switch (query->b.type) {
   case PIPE_QUERY_PRIMITIVES_EMITTED:
      result->u64 += qmem->stream[query->stream].emitted_primitives & mask;
      break;
   case PIPE_QUERY_PRIMITIVES_GENERATED:
      result->u64 += qmem->stream[query->stream].generated_primitives & mask;
      break;
   case PIPE_QUERY_SO_STATISTICS:
      result->so_statistics.num_primitives_written +=
         qmem->stream[query->stream].emitted_primitives & mask;
      result->so_statistics.primitives_storage_needed +=
         qmem->stream[query->stream].generated_primitives & mask;
      break;
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
      result->b |= qmem->stream[query->stream].emitted_primitives !=
                   qmem->stream[query->stream].generated_primitives;
      break;
   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
      /* Check every stream, not just the one the query was created for. */
      for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
         result->b |= qmem->stream[stream].emitted_primitives !=
                      qmem->stream[stream].generated_primitives;
      }
      break;
   default:
      assert(0);
   }
}

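/* CPU readback: map the buffers from last to first and sum up every entry
 * that belongs to this query. */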
static bool gfx10_sh_query_get_result(struct si_context *sctx, struct si_query *rquery,
                                      bool wait, union pipe_query_result *result)
{
   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;

   util_query_clear_result(result, query->b.type);

   if (unlikely(!query->first))
      return false; /* earlier out of memory error */
   assert(query->last);

   for (struct gfx10_sh_query_buffer *qbuf = query->last;;
        qbuf = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.prev, list)) {
      unsigned usage = PIPE_TRANSFER_READ |
                       (wait ? 0 : PIPE_TRANSFER_DONTBLOCK);
      void *map;

      if (rquery->b.flushed)
         map = sctx->ws->buffer_map(qbuf->buf->buf, NULL, usage);
      else
         map = si_buffer_map_sync_with_rings(sctx, qbuf->buf, usage);

      if (!map)
         return false;

      unsigned results_begin = 0;
      unsigned results_end = qbuf->head;
      if (qbuf == query->first)
         results_begin = query->first_begin;
      if (qbuf == query->last)
         results_end = query->last_end;

      while (results_begin != results_end) {
         struct gfx10_sh_query_buffer_mem *qmem = map + results_begin;
         results_begin += sizeof(*qmem);

         gfx10_sh_query_add_result(query, qmem, result);
      }

      if (qbuf == query->first)
         break;
   }

   return true;
}

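/* QBO path: reduce the query buffer entries with a small compute shader on
 * the GPU and write the result (or its availability) into 'resource'. */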
static void gfx10_sh_query_get_result_resource(struct si_context *sctx,
                                               struct si_query *rquery,
                                               bool wait,
                                               enum pipe_query_value_type result_type,
                                               int index,
                                               struct pipe_resource *resource,
                                               unsigned offset)
{
   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
   struct si_qbo_state saved_state = {};
   struct pipe_resource *tmp_buffer = NULL;
   unsigned tmp_buffer_offset = 0;

   if (!sctx->sh_query_result_shader) {
      sctx->sh_query_result_shader = gfx10_create_sh_query_result_cs(sctx);
      if (!sctx->sh_query_result_shader)
         return;
   }

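   /* When the query spans more than one buffer, a small zeroed scratch
    * allocation carries the partial result from one dispatch to the next. */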
   if (query->first != query->last) {
      u_suballocator_alloc(sctx->allocator_zeroed_memory, 16, 16,
                           &tmp_buffer_offset, &tmp_buffer);
      if (!tmp_buffer)
         return;
   }

   si_save_qbo_state(sctx, &saved_state);

   /* Pre-fill the constants configuring the shader behavior. */
   struct {
      uint32_t config;
      uint32_t offset;
      uint32_t chain;
      uint32_t result_count;
   } consts;
   struct pipe_constant_buffer constant_buffer = {};

   if (index >= 0) {
      switch (query->b.type) {
      case PIPE_QUERY_PRIMITIVES_GENERATED:
         consts.offset = sizeof(uint32_t) * query->stream;
         consts.config = 0;
         break;
      case PIPE_QUERY_PRIMITIVES_EMITTED:
         consts.offset = sizeof(uint32_t) * (4 + query->stream);
         consts.config = 0;
         break;
      case PIPE_QUERY_SO_STATISTICS:
         consts.offset = sizeof(uint32_t) * (4 * index + query->stream);
         consts.config = 0;
         break;
      case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
         consts.offset = sizeof(uint32_t) * query->stream;
         consts.config = 2;
         break;
      case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
         consts.offset = 0;
         consts.config = 3;
         break;
      default: unreachable("bad query type");
      }
   } else {
      /* Check result availability. */
      consts.offset = 0;
      consts.config = 1;
   }

   if (result_type == PIPE_QUERY_TYPE_I64 || result_type == PIPE_QUERY_TYPE_U64)
      consts.config |= 8;

   constant_buffer.buffer_size = sizeof(consts);
   constant_buffer.user_buffer = &consts;

   /* Pre-fill the SSBOs and grid. */
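   /* ssbo[0]: entries to reduce, ssbo[1]: scratch carry buffer,
    * ssbo[2]: destination (redirected to the caller's resource for the
    * last buffer below). */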
   struct pipe_shader_buffer ssbo[3];
   struct pipe_grid_info grid = {};

   ssbo[1].buffer = tmp_buffer;
   ssbo[1].buffer_offset = tmp_buffer_offset;
   ssbo[1].buffer_size = 16;

   ssbo[2] = ssbo[1];

   sctx->b.bind_compute_state(&sctx->b, sctx->sh_query_result_shader);

   grid.block[0] = 1;
   grid.block[1] = 1;
   grid.block[2] = 1;
   grid.grid[0] = 1;
   grid.grid[1] = 1;
   grid.grid[2] = 1;

   struct gfx10_sh_query_buffer *qbuf = query->first;
   for (;;) {
      unsigned begin = qbuf == query->first ? query->first_begin : 0;
      unsigned end = qbuf == query->last ? query->last_end : qbuf->buf->b.b.width0;
      if (!end)
         continue;

      ssbo[0].buffer = &qbuf->buf->b.b;
      ssbo[0].buffer_offset = begin;
      ssbo[0].buffer_size = end - begin;

      consts.result_count = (end - begin) / sizeof(struct gfx10_sh_query_buffer_mem);
      consts.chain = 0;
      if (qbuf != query->first)
         consts.chain |= 1;
      if (qbuf != query->last)
         consts.chain |= 2;

      if (qbuf == query->last) {
         ssbo[2].buffer = resource;
         ssbo[2].buffer_offset = offset;
         ssbo[2].buffer_size = 8;
      }

      sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, &constant_buffer);
      sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, 3, ssbo, 0x6);

      if (wait) {
         uint64_t va;

         /* Wait for result availability. Wait only for readiness
          * of the last entry, since the fence writes should be
          * serialized in the CP.
          */
         va = qbuf->buf->gpu_address;
         va += end - sizeof(struct gfx10_sh_query_buffer_mem);
         va += offsetof(struct gfx10_sh_query_buffer_mem, fence);

         si_cp_wait_mem(sctx, sctx->gfx_cs, va, 0x00000001, 0x00000001, 0);
      }

      sctx->b.launch_grid(&sctx->b, &grid);
      sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;

      if (qbuf == query->last)
         break;
      qbuf = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list);
   }

   si_restore_qbo_state(sctx, &saved_state);
   pipe_resource_reference(&tmp_buffer, NULL);
}

static const struct si_query_ops gfx10_sh_query_ops = {
   .destroy = gfx10_sh_query_destroy,
   .begin = gfx10_sh_query_begin,
   .end = gfx10_sh_query_end,
   .get_result = gfx10_sh_query_get_result,
   .get_result_resource = gfx10_sh_query_get_result_resource,
};

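/* Create a shader-based query; 'index' selects the streamout stream. */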
struct pipe_query *gfx10_sh_query_create(struct si_screen *screen,
                                         enum pipe_query_type query_type,
                                         unsigned index)
{
   struct gfx10_sh_query *query = CALLOC_STRUCT(gfx10_sh_query);
   if (unlikely(!query))
      return NULL;

   query->b.ops = &gfx10_sh_query_ops;
   query->b.type = query_type;
   query->stream = index;

   return (struct pipe_query *)query;
}

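/* Per-context initialization: set up the query buffer ring and the atom whose
 * emit callback advances it. */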
void gfx10_init_query(struct si_context *sctx)
{
   list_inithead(&sctx->shader_query_buffers);
   sctx->atoms.s.shader_query.emit = emit_shader_query;
}

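/* Per-context teardown: all queries must already be destroyed, so every buffer
 * left in the ring is expected to be unreferenced. */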
void gfx10_destroy_query(struct si_context *sctx)
{
   while (!LIST_IS_EMPTY(&sctx->shader_query_buffers)) {
      struct gfx10_sh_query_buffer *qbuf =
         list_first_entry(&sctx->shader_query_buffers,
                          struct gfx10_sh_query_buffer, list);
      LIST_DEL(&qbuf->list);

      assert(!qbuf->refcount);
      si_resource_reference(&qbuf->buf, NULL);
      FREE(qbuf);
   }
}