/*
 * Copyright 2018 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "si_pipe.h"
#include "si_query.h"
#include "sid.h"

#include "util/u_memory.h"
#include "util/u_suballoc.h"

/**
 * The query buffer is written to by ESGS NGG shaders with statistics about
 * generated and (streamout-)emitted primitives.
 *
 * The context maintains a ring of these query buffers, and queries simply
 * point into the ring, allowing an arbitrary number of queries to be active
 * without additional GPU cost.
 */
struct gfx10_sh_query_buffer {
	struct list_head list;
	struct si_resource *buf;
	unsigned refcount;

	/* Offset into the buffer in bytes; points at the first un-emitted entry. */
	unsigned head;
};

/* Memory layout of the query buffer. Must be kept in sync with shaders
 * (including QBO shaders) and should be aligned to cachelines.
 *
 * The somewhat awkward memory layout is for compatibility with the
 * SET_PREDICATION packet, which also means that we're setting the high bit
 * of all those values unconditionally.
 */
struct gfx10_sh_query_buffer_mem {
	struct {
		uint64_t generated_primitives_start_dummy;
		uint64_t emitted_primitives_start_dummy;
		uint64_t generated_primitives;
		uint64_t emitted_primitives;
	} stream[4];
	uint32_t fence; /* bottom-of-pipe fence: set to ~0 when draws have finished */
	uint32_t pad[31]; /* pad the entry to 32 qwords, matching the initialization loop below */
};

/* Shader-based queries. */
struct gfx10_sh_query {
	struct si_query b;

	struct gfx10_sh_query_buffer *first;
	struct gfx10_sh_query_buffer *last;
	unsigned first_begin;
	unsigned last_end;

	unsigned stream;
};

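/* Emit callback of the shader_query atom: the entry currently bound at
 * GFX10_GS_QUERY_BUF is about to be consumed by draws, so advance the ring's
 * write head past it. */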
static void emit_shader_query(struct si_context *sctx)
{
	assert(!LIST_IS_EMPTY(&sctx->shader_query_buffers));

	struct gfx10_sh_query_buffer *qbuf = list_last_entry(&sctx->shader_query_buffers,
							     struct gfx10_sh_query_buffer, list);
	qbuf->head += sizeof(struct gfx10_sh_query_buffer_mem);
}

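/* Drop the references that a query holds on the ring buffers in [first, last]
 * and free buffers that are no longer referenced. The newest and the oldest
 * buffer are always kept in the ring for reuse. */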
static void gfx10_release_query_buffers(struct si_context *sctx,
					struct gfx10_sh_query_buffer *first,
					struct gfx10_sh_query_buffer *last)
{
	while (first) {
		struct gfx10_sh_query_buffer *qbuf = first;
		if (first != last)
			first = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list);
		else
			first = NULL;

		qbuf->refcount--;
		if (qbuf->refcount)
			continue;

		if (qbuf->list.next == &sctx->shader_query_buffers)
			continue; /* keep the most recent buffer; it may not be full yet */
		if (qbuf->list.prev == &sctx->shader_query_buffers)
			continue; /* keep the oldest buffer for recycling */

		LIST_DEL(&qbuf->list);
		si_resource_reference(&qbuf->buf, NULL);
		FREE(qbuf);
	}
}

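/* Make sure a query buffer entry is available for the next draw and bind it
 * at GFX10_GS_QUERY_BUF: reuse the tail of the ring while it has room,
 * recycle the idle head buffer, or allocate and initialize a new one. */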
static bool gfx10_alloc_query_buffer(struct si_context *sctx)
{
	if (si_is_atom_dirty(sctx, &sctx->atoms.s.shader_query))
		return true;

	struct gfx10_sh_query_buffer *qbuf = NULL;

	if (!LIST_IS_EMPTY(&sctx->shader_query_buffers)) {
		qbuf = list_last_entry(&sctx->shader_query_buffers,
				       struct gfx10_sh_query_buffer, list);
		if (qbuf->head + sizeof(struct gfx10_sh_query_buffer_mem) <= qbuf->buf->b.b.width0)
			goto success;

		qbuf = list_first_entry(&sctx->shader_query_buffers,
					struct gfx10_sh_query_buffer, list);
		if (!qbuf->refcount &&
		    !si_rings_is_buffer_referenced(sctx, qbuf->buf->buf, RADEON_USAGE_READWRITE) &&
		    sctx->ws->buffer_wait(qbuf->buf->buf, 0, RADEON_USAGE_READWRITE)) {
			/* Can immediately re-use the oldest buffer */
			LIST_DEL(&qbuf->list);
		} else {
			qbuf = NULL;
		}
	}

	if (!qbuf) {
		qbuf = CALLOC_STRUCT(gfx10_sh_query_buffer);
		if (unlikely(!qbuf))
			return false;

		struct si_screen *screen = sctx->screen;
		unsigned buf_size = MAX2(sizeof(struct gfx10_sh_query_buffer_mem),
					 screen->info.min_alloc_size);
		qbuf->buf = si_resource(
			pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size));
		if (unlikely(!qbuf->buf)) {
			FREE(qbuf);
			return false;
		}
	}

	/* The buffer is currently unused by the GPU. Initialize it.
	 *
	 * We need to set the high bit of all the primitive counters for
	 * compatibility with the SET_PREDICATION packet.
	 */
	uint64_t *results = sctx->ws->buffer_map(qbuf->buf->buf, NULL,
						 PIPE_TRANSFER_WRITE |
						 PIPE_TRANSFER_UNSYNCHRONIZED);
	assert(results);

	for (unsigned i = 0, e = qbuf->buf->b.b.width0 / sizeof(struct gfx10_sh_query_buffer_mem);
	     i < e; ++i) {
		for (unsigned j = 0; j < 16; ++j)
			results[32 * i + j] = (uint64_t)1 << 63;
		results[32 * i + 16] = 0;
	}

	LIST_ADDTAIL(&qbuf->list, &sctx->shader_query_buffers);
	qbuf->head = 0;
	qbuf->refcount = sctx->num_active_shader_queries;

success:;
	struct pipe_shader_buffer sbuf;
	sbuf.buffer = &qbuf->buf->b.b;
	sbuf.buffer_offset = qbuf->head;
	sbuf.buffer_size = sizeof(struct gfx10_sh_query_buffer_mem);
	si_set_rw_shader_buffer(sctx, GFX10_GS_QUERY_BUF, &sbuf);
	sctx->current_vs_state |= S_VS_STATE_STREAMOUT_QUERY_ENABLED(1);

	si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_query);
	return true;
}

static void gfx10_sh_query_destroy(struct si_context *sctx, struct si_query *rquery)
{
	struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
	gfx10_release_query_buffers(sctx, query->first, query->last);
	FREE(query);
}

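/* Begin the query at the current write position of the tail buffer; draws
 * submitted from now on accumulate into every ring entry up to query_end. */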
static bool gfx10_sh_query_begin(struct si_context *sctx, struct si_query *rquery)
{
	struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;

	gfx10_release_query_buffers(sctx, query->first, query->last);
	query->first = query->last = NULL;

	if (unlikely(!gfx10_alloc_query_buffer(sctx)))
		return false;

	query->first = list_last_entry(&sctx->shader_query_buffers,
				       struct gfx10_sh_query_buffer, list);
	query->first_begin = query->first->head;

	sctx->num_active_shader_queries++;
	query->first->refcount++;

	return true;
}

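/* End the query: record where it stops in the tail buffer, fence the last
 * written chunk so readback can wait for it, and unbind the query buffer
 * once no shader queries remain active. */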
static bool gfx10_sh_query_end(struct si_context *sctx, struct si_query *rquery)
{
	struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;

	if (unlikely(!query->first))
		return false; /* earlier out of memory error */

	query->last = list_last_entry(&sctx->shader_query_buffers,
				      struct gfx10_sh_query_buffer, list);
	query->last_end = query->last->head;

	/* Signal the fence of the previous chunk */
	if (query->last_end != 0) {
		uint64_t fence_va = query->last->buf->gpu_address;
		fence_va += query->last_end - sizeof(struct gfx10_sh_query_buffer_mem);
		fence_va += offsetof(struct gfx10_sh_query_buffer_mem, fence);
		si_cp_release_mem(sctx, sctx->gfx_cs,
				  V_028A90_BOTTOM_OF_PIPE_TS, 0,
				  EOP_DST_SEL_MEM, EOP_INT_SEL_NONE,
				  EOP_DATA_SEL_VALUE_32BIT,
				  query->last->buf, fence_va, 0xffffffff,
				  PIPE_QUERY_GPU_FINISHED);
	}

	sctx->num_active_shader_queries--;

	if (sctx->num_active_shader_queries > 0) {
		gfx10_alloc_query_buffer(sctx);
	} else {
		si_set_rw_shader_buffer(sctx, GFX10_GS_QUERY_BUF, NULL);
		sctx->current_vs_state &= C_VS_STATE_STREAMOUT_QUERY_ENABLED;

		/* If a query_begin is followed by a query_end without a draw
		 * in-between, we need to clear the atom to ensure that the
		 * next query_begin will re-initialize the shader buffer. */
		si_set_atom_dirty(sctx, &sctx->atoms.s.shader_query, false);
	}

	return true;
}

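/* Accumulate the counters of one ring entry into the result, masking off the
 * high bit that is set on all counters for SET_PREDICATION compatibility. */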
static void gfx10_sh_query_add_result(struct gfx10_sh_query *query,
				      struct gfx10_sh_query_buffer_mem *qmem,
				      union pipe_query_result *result)
{
	static const uint64_t mask = ((uint64_t)1 << 63) - 1;

	switch (query->b.type) {
	case PIPE_QUERY_PRIMITIVES_EMITTED:
		result->u64 += qmem->stream[query->stream].emitted_primitives & mask;
		break;
	case PIPE_QUERY_PRIMITIVES_GENERATED:
		result->u64 += qmem->stream[query->stream].generated_primitives & mask;
		break;
	case PIPE_QUERY_SO_STATISTICS:
		result->so_statistics.num_primitives_written +=
			qmem->stream[query->stream].emitted_primitives & mask;
		result->so_statistics.primitives_storage_needed +=
			qmem->stream[query->stream].generated_primitives & mask;
		break;
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		result->b |= qmem->stream[query->stream].emitted_primitives !=
			     qmem->stream[query->stream].generated_primitives;
		break;
	case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
		for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
			/* The ANY predicate checks every stream, so index with the
			 * loop variable rather than query->stream. */
			result->b |= qmem->stream[stream].emitted_primitives !=
				     qmem->stream[stream].generated_primitives;
		}
		break;
	default:
		assert(0);
	}
}

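/* CPU readback path: walk the chain of ring buffers covered by the query from
 * last to first, map each one, and accumulate every entry between first_begin
 * and last_end. */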
static bool gfx10_sh_query_get_result(struct si_context *sctx, struct si_query *rquery,
				      bool wait, union pipe_query_result *result)
{
	struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;

	util_query_clear_result(result, query->b.type);

	if (unlikely(!query->first))
		return false; /* earlier out of memory error */
	assert(query->last);

	for (struct gfx10_sh_query_buffer *qbuf = query->last;;
	     qbuf = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.prev, list)) {
		unsigned usage = PIPE_TRANSFER_READ |
				 (wait ? 0 : PIPE_TRANSFER_DONTBLOCK);
		void *map;

		if (rquery->b.flushed)
			map = sctx->ws->buffer_map(qbuf->buf->buf, NULL, usage);
		else
			map = si_buffer_map_sync_with_rings(sctx, qbuf->buf, usage);

		if (!map)
			return false;

		unsigned results_begin = 0;
		unsigned results_end = qbuf->head;
		if (qbuf == query->first)
			results_begin = query->first_begin;
		if (qbuf == query->last)
			results_end = query->last_end;

		while (results_begin != results_end) {
			struct gfx10_sh_query_buffer_mem *qmem = map + results_begin;
			results_begin += sizeof(*qmem);

			gfx10_sh_query_add_result(query, qmem, result);
		}

		if (qbuf == query->first)
			break;
	}

	return true;
}

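/* GPU (QBO) path: run the gfx10 query result compute shader over each ring
 * buffer covered by the query. Partial results are chained through a small
 * zeroed scratch allocation, and the dispatch for the last buffer writes the
 * final value into the caller's resource. The constant and SSBO layout below
 * is assumed to match gfx10_create_sh_query_result_cs. */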
static void gfx10_sh_query_get_result_resource(struct si_context *sctx,
					       struct si_query *rquery,
					       bool wait,
					       enum pipe_query_value_type result_type,
					       int index,
					       struct pipe_resource *resource,
					       unsigned offset)
{
	struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
	struct si_qbo_state saved_state = {};
	struct pipe_resource *tmp_buffer = NULL;
	unsigned tmp_buffer_offset = 0;

	if (!sctx->sh_query_result_shader) {
		sctx->sh_query_result_shader = gfx10_create_sh_query_result_cs(sctx);
		if (!sctx->sh_query_result_shader)
			return;
	}

	if (query->first != query->last) {
		u_suballocator_alloc(sctx->allocator_zeroed_memory, 16, 16,
				     &tmp_buffer_offset, &tmp_buffer);
		if (!tmp_buffer)
			return;
	}

	si_save_qbo_state(sctx, &saved_state);

	/* Pre-fill the constants configuring the shader behavior. */
	struct {
		uint32_t config;
		uint32_t offset;
		uint32_t chain;
		uint32_t result_count;
	} consts;
	struct pipe_constant_buffer constant_buffer = {};

	switch (query->b.type) {
	case PIPE_QUERY_PRIMITIVES_GENERATED:
		consts.offset = sizeof(uint32_t) * query->stream;
		consts.config = 0;
		break;
	case PIPE_QUERY_PRIMITIVES_EMITTED:
		consts.offset = sizeof(uint32_t) * (4 + query->stream);
		consts.config = 0;
		break;
	case PIPE_QUERY_SO_STATISTICS:
		consts.offset = sizeof(uint32_t) * (4 * index + query->stream);
		consts.config = 0;
		break;
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		consts.offset = sizeof(uint32_t) * query->stream;
		consts.config = 2;
		break;
	case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
		consts.offset = 0;
		consts.config = 3;
		break;
	default: unreachable("bad query type");
	}

	/* Check result availability. */
	if (index < 0) {
		consts.offset = offsetof(struct gfx10_sh_query_buffer_mem, fence);
		consts.config = 1;
	}

	if (result_type == PIPE_QUERY_TYPE_I64 || result_type == PIPE_QUERY_TYPE_U64)
		consts.config |= 8;

	constant_buffer.buffer_size = sizeof(consts);
	constant_buffer.user_buffer = &consts;

	/* Pre-fill the SSBOs and grid. */
	struct pipe_shader_buffer ssbo[3];
	struct pipe_grid_info grid = {};

	ssbo[1].buffer = tmp_buffer;
	ssbo[1].buffer_offset = tmp_buffer_offset;
	ssbo[1].buffer_size = 16;

	ssbo[2] = ssbo[1];

	sctx->b.bind_compute_state(&sctx->b, sctx->sh_query_result_shader);

	grid.block[0] = 1;
	grid.block[1] = 1;
	grid.block[2] = 1;
	grid.grid[0] = 1;
	grid.grid[1] = 1;
	grid.grid[2] = 1;

	struct gfx10_sh_query_buffer *qbuf = query->first;
	for (;;) {
		unsigned begin = qbuf == query->first ? query->first_begin : 0;
		unsigned end = qbuf == query->last ? query->last_end : qbuf->buf->b.b.width0;

		ssbo[0].buffer = &qbuf->buf->b.b;
		ssbo[0].buffer_offset = begin;
		ssbo[0].buffer_size = end - begin;

		consts.result_count = (end - begin) / sizeof(struct gfx10_sh_query_buffer_mem);
		consts.chain = 0;
		if (qbuf != query->first)
			consts.chain |= 1;
		if (qbuf != query->last)
			consts.chain |= 2;

		if (qbuf == query->last) {
			ssbo[2].buffer = resource;
			ssbo[2].buffer_offset = offset;
			ssbo[2].buffer_size = 8;
		}

		sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, &constant_buffer);
		sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, 3, ssbo, 0x6);

		if (wait) {
			uint64_t va;

			/* Wait for result availability. Wait only for readiness
			 * of the last entry, since the fence writes should be
			 * serialized in the CP.
			 */
			va = qbuf->buf->gpu_address;
			va += end - sizeof(struct gfx10_sh_query_buffer_mem);
			va += offsetof(struct gfx10_sh_query_buffer_mem, fence);

			si_cp_wait_mem(sctx, sctx->gfx_cs, va, 0x00000001, 0x00000001, 0);
		}

		sctx->b.launch_grid(&sctx->b, &grid);
		sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;

		if (qbuf == query->last)
			break;
		qbuf = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list);
	}

	si_restore_qbo_state(sctx, &saved_state);
	pipe_resource_reference(&tmp_buffer, NULL);
}

static const struct si_query_ops gfx10_sh_query_ops = {
	.destroy = gfx10_sh_query_destroy,
	.begin = gfx10_sh_query_begin,
	.end = gfx10_sh_query_end,
	.get_result = gfx10_sh_query_get_result,
	.get_result_resource = gfx10_sh_query_get_result_resource,
};

struct pipe_query *gfx10_sh_query_create(struct si_screen *screen,
					 enum pipe_query_type query_type,
					 unsigned index)
{
	struct gfx10_sh_query *query = CALLOC_STRUCT(gfx10_sh_query);
	if (unlikely(!query))
		return NULL;

	query->b.ops = &gfx10_sh_query_ops;
	query->b.type = query_type;
	query->stream = index;

	return (struct pipe_query *)query;
}

void gfx10_init_query(struct si_context *sctx)
{
	LIST_INITHEAD(&sctx->shader_query_buffers);
	sctx->atoms.s.shader_query.emit = emit_shader_query;
}

void gfx10_destroy_query(struct si_context *sctx)
{
	while (!LIST_IS_EMPTY(&sctx->shader_query_buffers)) {
		struct gfx10_sh_query_buffer *qbuf =
			list_first_entry(&sctx->shader_query_buffers,
					 struct gfx10_sh_query_buffer, list);
		LIST_DEL(&qbuf->list);

		assert(!qbuf->refcount);
		si_resource_reference(&qbuf->buf, NULL);
		FREE(qbuf);
	}
}