2 * Copyright 2018 Advanced Micro Devices, Inc.
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
29 #include "util/u_memory.h"
30 #include "util/u_suballoc.h"
34 * The query buffer is written to by ESGS NGG shaders with statistics about
35 * generated and (streamout-)emitted primitives.
37 * The context maintains a ring of these query buffers, and queries simply
38 * point into the ring, allowing an arbitrary number of queries to be active
39 * without additional GPU cost.
/* One entry of the per-context ring of query buffers. Each buffer is a GPU
 * resource that NGG shaders append gfx10_sh_query_buffer_mem records to.
 * NOTE(review): extraction dropped interior lines here (the refcount/head
 * fields referenced elsewhere in this file, and the closing brace) --
 * confirm against the full file. */
41 struct gfx10_sh_query_buffer
{
/* Link in si_context::shader_query_buffers (util/list ring). */
42 struct list_head list
;
/* Backing GPU buffer holding the query result records. */
43 struct si_resource
*buf
;
46 /* Offset into the buffer in bytes; points at the first un-emitted entry. */
50 /* Memory layout of the query buffer. Must be kept in sync with shaders
51 * (including QBO shaders) and should be aligned to cachelines.
53 * The somewhat awkward memory layout is for compatibility with the
54 * SET_PREDICATION packet, which also means that we're setting the high bit
55 * of all those values unconditionally.
/* NOTE(review): extraction dropped interior lines -- callers index a
 * per-stream array member named `stream` (see gfx10_sh_query_add_result),
 * so these counters are presumably wrapped in a `stream[...]` struct array
 * whose declaration is missing here; confirm against the full file. */
57 struct gfx10_sh_query_buffer_mem
{
/* Dummy start-of-draw snapshots; present for SET_PREDICATION layout. */
59 uint64_t generated_primitives_start_dummy
;
60 uint64_t emitted_primitives_start_dummy
;
/* Counters accumulated by the shader; high bit set unconditionally (see
 * the comment above), masked off on the CPU read-back path. */
61 uint64_t generated_primitives
;
62 uint64_t emitted_primitives
;
64 uint32_t fence
; /* bottom-of-pipe fence: set to ~0 when draws have finished */
68 /* Shader-based queries. */
/* NOTE(review): extraction dropped interior lines -- this struct also has
 * at least `b` (struct si_query base), `first_begin`, `last_end`, and
 * `stream` members, all referenced elsewhere in this file; confirm against
 * the full file. */
69 struct gfx10_sh_query
{
/* Range of ring buffers this query spans: [first, last], with byte offsets
 * first_begin / last_end into the respective buffers. */
72 struct gfx10_sh_query_buffer
*first
;
73 struct gfx10_sh_query_buffer
*last
;
/* Atom emit callback (installed in gfx10_init_query): advance the write
 * head of the most recent query buffer past one record, accounting for the
 * entry the current draw's shaders will write.
 * Precondition: at least one buffer exists (gfx10_alloc_query_buffer ran). */
80 static void emit_shader_query(struct si_context
*sctx
)
82 assert(!LIST_IS_EMPTY(&sctx
->shader_query_buffers
));
/* Most recently allocated buffer is at the tail of the ring. */
84 struct gfx10_sh_query_buffer
*qbuf
= list_last_entry(&sctx
->shader_query_buffers
,
85 struct gfx10_sh_query_buffer
, list
);
/* Consume one fixed-size record. */
86 qbuf
->head
+= sizeof(struct gfx10_sh_query_buffer_mem
);
/* Drop one query's references on the buffers in [first, last] and free any
 * buffer that became unreferenced -- except the newest (may still be
 * filling) and the oldest (kept for recycling by the allocator).
 * NOTE(review): extraction dropped interior lines (the loop header,
 * refcount decrement, termination check, and FREE of qbuf) -- the visible
 * body below reads as the per-buffer iteration step; confirm against the
 * full file. */
89 static void gfx10_release_query_buffers(struct si_context
*sctx
,
90 struct gfx10_sh_query_buffer
*first
,
91 struct gfx10_sh_query_buffer
*last
)
94 struct gfx10_sh_query_buffer
*qbuf
= first
;
/* Step to the next ring entry before possibly deleting qbuf. */
96 first
= LIST_ENTRY(struct gfx10_sh_query_buffer
, qbuf
->list
.next
, list
);
104 if (qbuf
->list
.next
== &sctx
->shader_query_buffers
)
105 continue; /* keep the most recent buffer; it may not be full yet */
106 if (qbuf
->list
.prev
== &sctx
->shader_query_buffers
)
107 continue; /* keep the oldest buffer for recycling */
/* Unlink and release the GPU resource. */
109 LIST_DEL(&qbuf
->list
);
110 si_resource_reference(&qbuf
->buf
, NULL
);
/* Ensure the current (tail) query buffer has room for one more record,
 * allocating or recycling a buffer if needed, then (re)bind it as the
 * GFX10_GS_QUERY_BUF RW shader buffer and mark the shader_query atom dirty.
 * Returns false only on out-of-memory (NOTE(review): the return statements
 * themselves were dropped by the extraction -- confirm). */
115 static bool gfx10_alloc_query_buffer(struct si_context
*sctx
)
/* Atom already dirty => buffer was already (re)bound this frame; nothing
 * to do. NOTE(review): the early-return line is missing here. */
117 if (si_is_atom_dirty(sctx
, &sctx
->atoms
.s
.shader_query
))
120 struct gfx10_sh_query_buffer
*qbuf
= NULL
;
122 if (!LIST_IS_EMPTY(&sctx
->shader_query_buffers
)) {
/* Tail buffer still has room for another record? */
123 qbuf
= list_last_entry(&sctx
->shader_query_buffers
,
124 struct gfx10_sh_query_buffer
, list
);
125 if (qbuf
->head
+ sizeof(struct gfx10_sh_query_buffer_mem
) <= qbuf
->buf
->b
.b
.width0
)
/* Otherwise try to recycle the oldest buffer: it must be unreferenced
 * by queries, not referenced by queued CS work, and idle on the GPU. */
128 qbuf
= list_first_entry(&sctx
->shader_query_buffers
,
129 struct gfx10_sh_query_buffer
, list
);
130 if (!qbuf
->refcount
&&
131 !si_rings_is_buffer_referenced(sctx
, qbuf
->buf
->buf
, RADEON_USAGE_READWRITE
) &&
132 sctx
->ws
->buffer_wait(qbuf
->buf
->buf
, 0, RADEON_USAGE_READWRITE
)) {
133 /* Can immediately re-use the oldest buffer */
134 LIST_DEL(&qbuf
->list
);
/* No recyclable buffer: allocate a fresh tracking struct + GPU buffer.
 * NOTE(review): the NULL-checks after these allocations were dropped by
 * the extraction. */
141 qbuf
= CALLOC_STRUCT(gfx10_sh_query_buffer
);
145 struct si_screen
*screen
= sctx
->screen
;
/* Respect the winsys minimum allocation size. */
146 unsigned buf_size
= MAX2(sizeof(struct gfx10_sh_query_buffer_mem
),
147 screen
->info
.min_alloc_size
);
148 qbuf
->buf
= si_resource(
149 pipe_buffer_create(&screen
->b
, 0, PIPE_USAGE_STAGING
, buf_size
));
150 if (unlikely(!qbuf
->buf
)) {
156 /* The buffer is currently unused by the GPU. Initialize it.
158 * We need to set the high bit of all the primitive counters for
159 * compatibility with the SET_PREDICATION packet.
/* Unsynchronized map is safe: the buffer was just created or proven idle. */
161 uint64_t *results
= sctx
->ws
->buffer_map(qbuf
->buf
->buf
, NULL
,
162 PIPE_TRANSFER_WRITE
|
163 PIPE_TRANSFER_UNSYNCHRONIZED
);
/* Per record: 16 qwords of counters get bit 63 set, the 17th dword-pair
 * slot (the fence) is cleared. 32 qwords per record == the record size. */
166 for (unsigned i
= 0, e
= qbuf
->buf
->b
.b
.width0
/ sizeof(struct gfx10_sh_query_buffer_mem
);
168 for (unsigned j
= 0; j
< 16; ++j
)
169 results
[32 * i
+ j
] = (uint64_t)1 << 63;
170 results
[32 * i
+ 16] = 0;
/* Append as the new tail; it inherits a reference from each active query. */
173 LIST_ADDTAIL(&qbuf
->list
, &sctx
->shader_query_buffers
);
175 qbuf
->refcount
= sctx
->num_active_shader_queries
;
/* Bind the record at the current head as the shader-visible query buffer. */
178 struct pipe_shader_buffer sbuf
;
179 sbuf
.buffer
= &qbuf
->buf
->b
.b
;
180 sbuf
.buffer_offset
= qbuf
->head
;
181 sbuf
.buffer_size
= sizeof(struct gfx10_sh_query_buffer_mem
);
182 si_set_rw_shader_buffer(sctx
, GFX10_GS_QUERY_BUF
, &sbuf
);
/* Make emit_shader_query run on the next draw. */
184 si_mark_atom_dirty(sctx
, &sctx
->atoms
.s
.shader_query
);
/* si_query_ops::destroy: release the query's buffer references (possibly
 * freeing ring buffers) and free the query object itself.
 * NOTE(review): the FREE(query) line was dropped by the extraction --
 * confirm against the full file. */
188 static void gfx10_sh_query_destroy(struct si_context
*sctx
, struct si_query
*rquery
)
190 struct gfx10_sh_query
*query
= (struct gfx10_sh_query
*)rquery
;
191 gfx10_release_query_buffers(sctx
, query
->first
, query
->last
);
/* si_query_ops::begin: drop any buffers held from a previous begin/end
 * cycle, make sure a writable buffer is bound, then record the starting
 * buffer and byte offset of this query and take a reference on it.
 * Returns false on allocation failure (NOTE(review): the return lines were
 * dropped by the extraction -- confirm). */
195 static bool gfx10_sh_query_begin(struct si_context
*sctx
, struct si_query
*rquery
)
197 struct gfx10_sh_query
*query
= (struct gfx10_sh_query
*)rquery
;
/* Re-begin on an already-used query: release the old range first. */
199 gfx10_release_query_buffers(sctx
, query
->first
, query
->last
)
;
200 query
->first
= query
->last
= NULL
;
202 if (unlikely(!gfx10_alloc_query_buffer(sctx
)))
/* The query starts at the tail buffer's current head offset. */
205 query
->first
= list_last_entry(&sctx
->shader_query_buffers
,
206 struct gfx10_sh_query_buffer
, list
);
207 query
->first_begin
= query
->first
->head
;
/* Track active queries so newly allocated buffers start with the right
 * refcount, and pin the starting buffer. */
209 sctx
->num_active_shader_queries
++;
210 query
->first
->refcount
++;
/* si_query_ops::end: record the ending buffer/offset, emit a bottom-of-pipe
 * fence for the last written record, and rebind or unbind the shader query
 * buffer depending on whether other queries remain active. */
215 static bool gfx10_sh_query_end(struct si_context
*sctx
, struct si_query
*rquery
)
217 struct gfx10_sh_query
*query
= (struct gfx10_sh_query
*)rquery
;
219 if (unlikely(!query
->first
))
220 return false; /* earlier out of memory error */
/* The query ends at the tail buffer's current head offset. */
222 query
->last
= list_last_entry(&sctx
->shader_query_buffers
,
223 struct gfx10_sh_query_buffer
, list
);
224 query
->last_end
= query
->last
->head
;
226 /* Signal the fence of the previous chunk */
227 if (query
->last_end
!= 0) {
/* Fence VA = start of the record just before last_end, plus the fence
 * member offset. Written with 0xffffffff when prior draws finish. */
228 uint64_t fence_va
= query
->last
->buf
->gpu_address
;
229 fence_va
+= query
->last_end
- sizeof(struct gfx10_sh_query_buffer_mem
);
230 fence_va
+= offsetof(struct gfx10_sh_query_buffer_mem
, fence
);
231 si_cp_release_mem(sctx
, sctx
->gfx_cs
,
232 V_028A90_BOTTOM_OF_PIPE_TS
, 0,
233 EOP_DST_SEL_MEM
, EOP_INT_SEL_NONE
,
234 EOP_DATA_SEL_VALUE_32BIT
,
235 query
->last
->buf
, fence_va
, 0xffffffff,
236 PIPE_QUERY_GPU_FINISHED
);
239 sctx
->num_active_shader_queries
--;
241 if (sctx
->num_active_shader_queries
> 0) {
/* Other queries still active: keep a fresh record bound. */
242 gfx10_alloc_query_buffer(sctx
);
/* No active queries left: unbind the RW shader buffer. */
244 si_set_rw_shader_buffer(sctx
, GFX10_GS_QUERY_BUF
, NULL
);
246 /* If a query_begin is followed by a query_end without a draw
247 * in-between, we need to clear the atom to ensure that the
248 * next query_begin will re-initialize the shader buffer. */
249 si_set_atom_dirty(sctx
, &sctx
->atoms
.s
.shader_query
, false);
/* Accumulate one shader-written record (qmem) into the CPU-side result,
 * interpreting it according to the query type. Counters carry bit 63 set
 * for SET_PREDICATION compatibility, so they are masked off here.
 * NOTE(review): the `break;` statements between cases were dropped by the
 * extraction -- confirm against the full file. */
255 static void gfx10_sh_query_add_result(struct gfx10_sh_query
*query
,
256 struct gfx10_sh_query_buffer_mem
*qmem
,
257 union pipe_query_result
*result
)
/* Mask clears the unconditionally-set high bit of each counter. */
259 static const uint64_t mask
= ((uint64_t)1 << 63) - 1;
261 switch (query
->b
.type
) {
262 case PIPE_QUERY_PRIMITIVES_EMITTED
:
263 result
->u64
+= qmem
->stream
[query
->stream
].emitted_primitives
& mask
;
265 case PIPE_QUERY_PRIMITIVES_GENERATED
:
266 result
->u64
+= qmem
->stream
[query
->stream
].generated_primitives
& mask
;
268 case PIPE_QUERY_SO_STATISTICS
:
269 result
->so_statistics
.num_primitives_written
+=
270 qmem
->stream
[query
->stream
].emitted_primitives
& mask
;
271 result
->so_statistics
.primitives_storage_needed
+=
272 qmem
->stream
[query
->stream
].generated_primitives
& mask
;
/* Overflow predicate: any mismatch between generated and emitted
 * primitives means streamout overflowed (no masking needed: the set
 * high bits cancel in the comparison). */
274 case PIPE_QUERY_SO_OVERFLOW_PREDICATE
:
275 result
->b
|= qmem
->stream
[query
->stream
].emitted_primitives
!=
276 qmem
->stream
[query
->stream
].generated_primitives
;
278 case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE
:
/* NOTE(review): suspected bug -- the loop variable `stream` is never
 * used in the body; the comparisons index `query->stream` instead of
 * `stream`, so all SI_MAX_STREAMS iterations test the same stream.
 * Presumably these should read qmem->stream[stream]; verify against
 * upstream Mesa before changing. */
279 for (unsigned stream
= 0; stream
< SI_MAX_STREAMS
; ++stream
) {
280 result
->b
|= qmem
->stream
[query
->stream
].emitted_primitives
!=
281 qmem
->stream
[query
->stream
].generated_primitives
;
/* si_query_ops::get_result: CPU read-back. Walks the query's buffer range
 * backwards from `last` to `first`, maps each buffer, and accumulates every
 * record in [results_begin, results_end) into *result.
 * Returns false on earlier OOM or (presumably) a failed non-blocking map.
 * NOTE(review): extraction dropped interior lines (the `map` declaration,
 * the map-failure check, and the terminating `return true`) -- confirm. */
289 static bool gfx10_sh_query_get_result(struct si_context
*sctx
, struct si_query
*rquery
,
290 bool wait
, union pipe_query_result
*result
)
292 struct gfx10_sh_query
*query
= (struct gfx10_sh_query
*)rquery
;
294 util_query_clear_result(result
, query
->b
.type
);
296 if (unlikely(!query
->first
))
297 return false; /* earlier out of memory error */
/* Iterate buffers from newest to oldest; loop exits when `first` has been
 * processed (see the check at the bottom). */
300 for (struct gfx10_sh_query_buffer
*qbuf
= query
->last
;;
301 qbuf
= LIST_ENTRY(struct gfx10_sh_query_buffer
, qbuf
->list
.prev
, list
)) {
302 unsigned usage
= PIPE_TRANSFER_READ
|
303 (wait
? 0 : PIPE_TRANSFER_DONTBLOCK
);
/* Already flushed => plain winsys map suffices; otherwise sync the map
 * against in-flight rings. */
306 if (rquery
->b
.flushed
)
307 map
= sctx
->ws
->buffer_map(qbuf
->buf
->buf
, NULL
, usage
);
309 map
= si_buffer_map_sync_with_rings(sctx
, qbuf
->buf
, usage
);
/* Interior buffers contribute their full written range; the first and
 * last buffers are clipped to the query's begin/end offsets. */
314 unsigned results_begin
= 0;
315 unsigned results_end
= qbuf
->head
;
316 if (qbuf
== query
->first
)
317 results_begin
= query
->first_begin
;
318 if (qbuf
== query
->last
)
319 results_end
= query
->last_end
;
321 while (results_begin
!= results_end
) {
322 struct gfx10_sh_query_buffer_mem
*qmem
= map
+ results_begin
;
323 results_begin
+= sizeof(*qmem
);
325 gfx10_sh_query_add_result(query
, qmem
, result
);
/* Oldest buffer of the range reached => done. */
328 if (qbuf
== query
->first
)
/* si_query_ops::get_result_resource (QBO path): resolve the query result
 * into a GPU buffer using a compute shader, without CPU read-back. One
 * dispatch per ring buffer in [first, last]; intermediate accumulation goes
 * through a 16-byte zeroed scratch allocation, the final dispatch writes to
 * the caller's `resource` at `offset`.
 * NOTE(review): extraction dropped several parameter lines (`wait`,
 * `index`, `offset`) and the `consts` struct definition plus various
 * config-flag assignments -- the references below imply their existence;
 * confirm against the full file. */
335 static void gfx10_sh_query_get_result_resource(struct si_context
*sctx
,
336 struct si_query
*rquery
,
338 enum pipe_query_value_type result_type
,
340 struct pipe_resource
*resource
,
343 struct gfx10_sh_query
*query
= (struct gfx10_sh_query
*)rquery
;
344 struct si_qbo_state saved_state
= {};
345 struct pipe_resource
*tmp_buffer
= NULL
;
346 unsigned tmp_buffer_offset
= 0;
/* Lazily create the result-resolve compute shader. */
348 if (!sctx
->sh_query_result_shader
) {
349 sctx
->sh_query_result_shader
= gfx10_create_sh_query_result_cs(sctx
);
350 if (!sctx
->sh_query_result_shader
)
/* Multi-buffer queries need zeroed scratch memory to carry partial sums
 * between dispatches. */
354 if (query
->first
!= query
->last
) {
355 u_suballocator_alloc(sctx
->allocator_zeroed_memory
, 16, 16,
356 &tmp_buffer_offset
, &tmp_buffer
);
/* QBO state is clobbered below; restore it at the end. */
361 si_save_qbo_state(sctx
, &saved_state
);
363 /* Pre-fill the constants configuring the shader behavior. */
368 uint32_t result_count
;
370 struct pipe_constant_buffer constant_buffer
= {};
/* Select which counter dword the shader reads, per query type. Offsets
 * are in dwords within a record: generated counters first, emitted
 * counters at +4 dwords per the buffer_mem layout. */
373 switch (query
->b
.type
) {
374 case PIPE_QUERY_PRIMITIVES_GENERATED
:
375 consts
.offset
= sizeof(uint32_t) * query
->stream
;
378 case PIPE_QUERY_PRIMITIVES_EMITTED
:
379 consts
.offset
= sizeof(uint32_t) * (4 + query
->stream
);
382 case PIPE_QUERY_SO_STATISTICS
:
383 consts
.offset
= sizeof(uint32_t) * (4 * index
+ query
->stream
);
386 case PIPE_QUERY_SO_OVERFLOW_PREDICATE
:
387 consts
.offset
= sizeof(uint32_t) * query
->stream
;
390 case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE
:
394 default: unreachable("bad query type");
397 /* Check result availability. */
/* 64-bit destinations presumably set a config flag here (the assignment
 * lines were dropped by the extraction). */
402 if (result_type
== PIPE_QUERY_TYPE_I64
|| result_type
== PIPE_QUERY_TYPE_U64
)
405 constant_buffer
.buffer_size
= sizeof(consts
);
406 constant_buffer
.user_buffer
= &consts
;
408 /* Pre-fill the SSBOs and grid. */
/* ssbo[0] = source records, ssbo[1] = scratch accumulator,
 * ssbo[2] = final destination (set per-iteration below). */
409 struct pipe_shader_buffer ssbo
[3];
410 struct pipe_grid_info grid
= {};
412 ssbo
[1].buffer
= tmp_buffer
;
413 ssbo
[1].buffer_offset
= tmp_buffer_offset
;
414 ssbo
[1].buffer_size
= 16;
418 sctx
->b
.bind_compute_state(&sctx
->b
, sctx
->sh_query_result_shader
);
/* Walk the buffer chain oldest-to-newest, one dispatch each. */
427 struct gfx10_sh_query_buffer
*qbuf
= query
->first
;
/* Clip the record range exactly as the CPU read-back path does. */
429 unsigned begin
= qbuf
== query
->first
? query
->first_begin
: 0;
430 unsigned end
= qbuf
== query
->last
? query
->last_end
: qbuf
->buf
->b
.b
.width0
;
434 ssbo
[0].buffer
= &qbuf
->buf
->b
.b
;
435 ssbo
[0].buffer_offset
= begin
;
436 ssbo
[0].buffer_size
= end
- begin
;
438 consts
.result_count
= (end
- begin
) / sizeof(struct gfx10_sh_query_buffer_mem
);
/* Chain-control flags for non-first / non-last dispatches (the flag
 * assignments themselves were dropped by the extraction). */
440 if (qbuf
!= query
->first
)
442 if (qbuf
!= query
->last
)
/* Last dispatch writes the caller's destination buffer. */
445 if (qbuf
== query
->last
) {
446 ssbo
[2].buffer
= resource
;
447 ssbo
[2].buffer_offset
= offset
;
448 ssbo
[2].buffer_size
= 8;
451 sctx
->b
.set_constant_buffer(&sctx
->b
, PIPE_SHADER_COMPUTE
, 0, &constant_buffer
);
452 sctx
->b
.set_shader_buffers(&sctx
->b
, PIPE_SHADER_COMPUTE
, 0, 3, ssbo
, 0x6);
457 /* Wait for result availability. Wait only for readiness
458 * of the last entry, since the fence writes should be
459 * serialized in the CP.
461 va
= qbuf
->buf
->gpu_address
;
462 va
+= end
- sizeof(struct gfx10_sh_query_buffer_mem
);
463 va
+= offsetof(struct gfx10_sh_query_buffer_mem
, fence
);
465 si_cp_wait_mem(sctx
, sctx
->gfx_cs
, va
, 0x00000001, 0x00000001, 0);
468 sctx
->b
.launch_grid(&sctx
->b
, &grid
);
/* Serialize successive dispatches so the scratch accumulator is safe. */
469 sctx
->flags
|= SI_CONTEXT_CS_PARTIAL_FLUSH
;
471 if (qbuf
== query
->last
)
473 qbuf
= LIST_ENTRY(struct gfx10_sh_query_buffer
, qbuf
->list
.next
, list
);
476 si_restore_qbo_state(sctx
, &saved_state
);
477 pipe_resource_reference(&tmp_buffer
, NULL
);
/* vtable wiring the shader-query implementation into the generic si_query
 * machinery (see gfx10_sh_query_create). */
480 static const struct si_query_ops gfx10_sh_query_ops
= {
481 .destroy
= gfx10_sh_query_destroy
,
482 .begin
= gfx10_sh_query_begin
,
483 .end
= gfx10_sh_query_end
,
484 .get_result
= gfx10_sh_query_get_result
,
485 .get_result_resource
= gfx10_sh_query_get_result_resource
,
/* Public constructor: allocate a zero-initialized shader query of the given
 * type bound to a streamout stream. Returns NULL on allocation failure
 * (NOTE(review): the NULL-return line and the `index` parameter line were
 * dropped by the extraction -- confirm). */
488 struct pipe_query
*gfx10_sh_query_create(struct si_screen
*screen
,
489 enum pipe_query_type query_type
,
492 struct gfx10_sh_query
*query
= CALLOC_STRUCT(gfx10_sh_query
);
493 if (unlikely(!query
))
496 query
->b
.ops
= &gfx10_sh_query_ops
;
497 query
->b
.type
= query_type
;
/* `index` selects the streamout stream this query observes. */
498 query
->stream
= index
;
500 return (struct pipe_query
*)query
;
/* Per-context init: empty the query-buffer ring and install the atom emit
 * callback that advances the buffer head on each draw. */
503 void gfx10_init_query(struct si_context
*sctx
)
505 LIST_INITHEAD(&sctx
->shader_query_buffers
);
506 sctx
->atoms
.s
.shader_query
.emit
= emit_shader_query
;
/* Per-context teardown: drain the query-buffer ring, releasing each GPU
 * buffer. All queries must already be destroyed (asserted via refcount).
 * NOTE(review): the FREE of qbuf and closing braces were dropped by the
 * extraction -- confirm against the full file. */
509 void gfx10_destroy_query(struct si_context
*sctx
)
511 while (!LIST_IS_EMPTY(&sctx
->shader_query_buffers
)) {
512 struct gfx10_sh_query_buffer
*qbuf
=
513 list_first_entry(&sctx
->shader_query_buffers
,
514 struct gfx10_sh_query_buffer
, list
);
515 LIST_DEL(&qbuf
->list
);
517 assert(!qbuf
->refcount
);
518 si_resource_reference(&qbuf
->buf
, NULL
);