2 * Copyright 2018 Advanced Micro Devices, Inc.
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
28 #include "util/u_memory.h"
29 #include "util/u_suballoc.h"
34 * The query buffer is written to by ESGS NGG shaders with statistics about
35 * generated and (streamout-)emitted primitives.
37 * The context maintains a ring of these query buffers, and queries simply
38 * point into the ring, allowing an arbitrary number of queries to be active
39 * without additional GPU cost.
41 struct gfx10_sh_query_buffer
{
/* Node in si_context::shader_query_buffers (the ring of query buffers). */
42 struct list_head list
;
/* GPU buffer backing this ring entry; holds gfx10_sh_query_buffer_mem records. */
43 struct si_resource
*buf
;
46 /* Offset into the buffer in bytes; points at the first un-emitted entry. */
/* NOTE(review): fields used below (e.g. `refcount`, `head`) appear truncated
 * from this view of the declaration — confirm against the full file. */
50 /* Memory layout of the query buffer. Must be kept in sync with shaders
51 * (including QBO shaders) and should be aligned to cachelines.
53 * The somewhat awkward memory layout is for compatibility with the
54 * SET_PREDICATION packet, which also means that we're setting the high bit
55 * of all those values unconditionally.
57 struct gfx10_sh_query_buffer_mem
{
/* Dummy "start" snapshots precede the live counters; the layout pairing is
 * dictated by SET_PREDICATION compatibility (see comment above). */
59 uint64_t generated_primitives_start_dummy
;
60 uint64_t emitted_primitives_start_dummy
;
61 uint64_t generated_primitives
;
62 uint64_t emitted_primitives
;
/* NOTE(review): readers below access qmem->stream[...] — these counters are
 * presumably wrapped in a per-stream array whose declaration is truncated
 * in this view; confirm against the full file. */
64 uint32_t fence
; /* bottom-of-pipe fence: set to ~0 when draws have finished */
68 /* Shader-based queries. */
69 struct gfx10_sh_query
{
/* First ring buffer containing results for this query (set in query_begin). */
72 struct gfx10_sh_query_buffer
*first
;
/* Last ring buffer containing results for this query (set in query_end). */
73 struct gfx10_sh_query_buffer
*last
;
/* NOTE(review): members referenced elsewhere (`b`, `first_begin`, `last_end`,
 * `stream`) appear truncated from this view — confirm against the full file. */
/* Atom emit callback (installed by gfx10_init_query): advance the write head
 * of the most recent query buffer past one gfx10_sh_query_buffer_mem entry,
 * consuming the slot that was bound to the shader. */
80 static void emit_shader_query(struct si_context
*sctx
)
82 assert(!list_is_empty(&sctx
->shader_query_buffers
));
/* The newest buffer is always the tail of the ring. */
84 struct gfx10_sh_query_buffer
*qbuf
=
85 list_last_entry(&sctx
->shader_query_buffers
, struct gfx10_sh_query_buffer
, list
);
86 qbuf
->head
+= sizeof(struct gfx10_sh_query_buffer_mem
);
/* Release the range of ring buffers [first, last] that a query referenced.
 * Buffers that become unreferenced are unlinked and freed, except for the
 * newest buffer (may still be filling) and the oldest (kept for recycling).
 * NOTE(review): the loop header and the refcount-decrement lines appear
 * truncated from this view — confirm against the full file. */
89 static void gfx10_release_query_buffers(struct si_context
*sctx
,
90 struct gfx10_sh_query_buffer
*first
,
91 struct gfx10_sh_query_buffer
*last
)
94 struct gfx10_sh_query_buffer
*qbuf
= first
;
/* Advance `first` before possibly deleting qbuf from the list. */
96 first
= LIST_ENTRY(struct gfx10_sh_query_buffer
, qbuf
->list
.next
, list
);
104 if (qbuf
->list
.next
== &sctx
->shader_query_buffers
)
105 continue; /* keep the most recent buffer; it may not be full yet */
106 if (qbuf
->list
.prev
== &sctx
->shader_query_buffers
)
107 continue; /* keep the oldest buffer for recycling */
/* Unlink and drop the GPU buffer reference. */
109 list_del(&qbuf
->list
);
110 si_resource_reference(&qbuf
->buf
, NULL
);
/* Ensure a query buffer with free space is at the tail of the ring and bind
 * its next entry as the GS query shader buffer (GFX10_GS_QUERY_BUF).
 * Returns false on allocation failure (return statements are truncated from
 * this view — confirm against the full file). */
115 static bool gfx10_alloc_query_buffer(struct si_context
*sctx
)
/* Nothing to do if the shader_query atom is already dirty (a buffer is
 * pending to be (re)bound). */
117 if (si_is_atom_dirty(sctx
, &sctx
->atoms
.s
.shader_query
))
120 struct gfx10_sh_query_buffer
*qbuf
= NULL
;
122 if (!list_is_empty(&sctx
->shader_query_buffers
)) {
/* Fast path: the newest buffer still has room for one more entry. */
123 qbuf
= list_last_entry(&sctx
->shader_query_buffers
, struct gfx10_sh_query_buffer
, list
);
124 if (qbuf
->head
+ sizeof(struct gfx10_sh_query_buffer_mem
) <= qbuf
->buf
->b
.b
.width0
)
/* Otherwise try to recycle the oldest buffer if no query references it
 * and the GPU is done with it (non-blocking wait: timeout 0). */
127 qbuf
= list_first_entry(&sctx
->shader_query_buffers
, struct gfx10_sh_query_buffer
, list
);
128 if (!qbuf
->refcount
&&
129 !si_rings_is_buffer_referenced(sctx
, qbuf
->buf
->buf
, RADEON_USAGE_READWRITE
) &&
130 sctx
->ws
->buffer_wait(qbuf
->buf
->buf
, 0, RADEON_USAGE_READWRITE
)) {
131 /* Can immediately re-use the oldest buffer */
132 list_del(&qbuf
->list
);
/* Slow path: allocate a fresh buffer object. */
139 qbuf
= CALLOC_STRUCT(gfx10_sh_query_buffer
);
143 struct si_screen
*screen
= sctx
->screen
;
/* Buffer size: at least one entry, rounded up to the minimum allocation. */
145 MAX2(sizeof(struct gfx10_sh_query_buffer_mem
), screen
->info
.min_alloc_size
);
146 qbuf
->buf
= si_resource(pipe_buffer_create(&screen
->b
, 0, PIPE_USAGE_STAGING
, buf_size
));
147 if (unlikely(!qbuf
->buf
)) {
153 /* The buffer is currently unused by the GPU. Initialize it.
155 * We need to set the high bit of all the primitive counters for
156 * compatibility with the SET_PREDICATION packet.
158 uint64_t *results
= sctx
->ws
->buffer_map(qbuf
->buf
->buf
, NULL
,
159 PIPE_TRANSFER_WRITE
| PIPE_TRANSFER_UNSYNCHRONIZED
);
/* Per entry: 16 uint64 counters get bit 63 set; slot 16 holds the fence
 * dword pair, cleared to 0 ("not signaled"). */
162 for (unsigned i
= 0, e
= qbuf
->buf
->b
.b
.width0
/ sizeof(struct gfx10_sh_query_buffer_mem
); i
< e
;
164 for (unsigned j
= 0; j
< 16; ++j
)
165 results
[32 * i
+ j
] = (uint64_t)1 << 63;
166 results
[32 * i
+ 16] = 0;
/* Make the (new or recycled) buffer the newest ring entry. */
169 list_addtail(&qbuf
->list
, &sctx
->shader_query_buffers
);
/* Every currently active query implicitly references the new buffer. */
171 qbuf
->refcount
= sctx
->num_active_shader_queries
;
/* Bind the next free entry as the GS query shader buffer. */
174 struct pipe_shader_buffer sbuf
;
175 sbuf
.buffer
= &qbuf
->buf
->b
.b
;
176 sbuf
.buffer_offset
= qbuf
->head
;
177 sbuf
.buffer_size
= sizeof(struct gfx10_sh_query_buffer_mem
);
178 si_set_rw_shader_buffer(sctx
, GFX10_GS_QUERY_BUF
, &sbuf
);
179 sctx
->current_vs_state
|= S_VS_STATE_STREAMOUT_QUERY_ENABLED(1);
/* The atom's emit callback (emit_shader_query) will advance qbuf->head. */
181 si_mark_atom_dirty(sctx
, &sctx
->atoms
.s
.shader_query
);
/* si_query_ops::destroy — drop the query's references on its buffer range
 * (freeing buffers where possible) before the query object itself goes away.
 * NOTE(review): the FREE(query) line appears truncated from this view. */
185 static void gfx10_sh_query_destroy(struct si_context
*sctx
, struct si_query
*rquery
)
187 struct gfx10_sh_query
*query
= (struct gfx10_sh_query
*)rquery
;
188 gfx10_release_query_buffers(sctx
, query
->first
, query
->last
);
/* si_query_ops::begin — release any buffers held from a previous begin/end
 * cycle, ensure a writable query buffer exists, and record the starting
 * position (buffer + offset) of this query's results. */
192 static bool gfx10_sh_query_begin(struct si_context
*sctx
, struct si_query
*rquery
)
194 struct gfx10_sh_query
*query
= (struct gfx10_sh_query
*)rquery
;
/* Drop results from any previous use of this query object. */
196 gfx10_release_query_buffers(sctx
, query
->first
, query
->last
);
197 query
->first
= query
->last
= NULL
;
/* On allocation failure, query->first stays NULL; end/get_result check it. */
199 if (unlikely(!gfx10_alloc_query_buffer(sctx
)))
/* Results start at the current head of the newest ring buffer. */
202 query
->first
= list_last_entry(&sctx
->shader_query_buffers
, struct gfx10_sh_query_buffer
, list
);
203 query
->first_begin
= query
->first
->head
;
205 sctx
->num_active_shader_queries
++;
206 query
->first
->refcount
++;
/* si_query_ops::end — record the end position of the query's results, emit a
 * bottom-of-pipe fence for the last written entry, and unbind (or rebind) the
 * GS query shader buffer depending on whether other queries remain active. */
211 static bool gfx10_sh_query_end(struct si_context
*sctx
, struct si_query
*rquery
)
213 struct gfx10_sh_query
*query
= (struct gfx10_sh_query
*)rquery
;
215 if (unlikely(!query
->first
))
216 return false; /* earlier out of memory error */
/* Results end at the current head of the newest ring buffer. */
218 query
->last
= list_last_entry(&sctx
->shader_query_buffers
, struct gfx10_sh_query_buffer
, list
);
219 query
->last_end
= query
->last
->head
;
221 /* Signal the fence of the previous chunk */
222 if (query
->last_end
!= 0) {
/* Fence VA = last entry start + offsetof(fence) inside that entry. */
223 uint64_t fence_va
= query
->last
->buf
->gpu_address
;
224 fence_va
+= query
->last_end
- sizeof(struct gfx10_sh_query_buffer_mem
);
225 fence_va
+= offsetof(struct gfx10_sh_query_buffer_mem
, fence
);
/* Write 0xffffffff to the fence dword when all prior draws finish. */
226 si_cp_release_mem(sctx
, sctx
->gfx_cs
, V_028A90_BOTTOM_OF_PIPE_TS
, 0, EOP_DST_SEL_MEM
,
227 EOP_INT_SEL_NONE
, EOP_DATA_SEL_VALUE_32BIT
, query
->last
->buf
, fence_va
,
228 0xffffffff, PIPE_QUERY_GPU_FINISHED
);
231 sctx
->num_active_shader_queries
--;
233 if (sctx
->num_active_shader_queries
> 0) {
/* Other queries are still active: keep a buffer entry bound. */
234 gfx10_alloc_query_buffer(sctx
);
/* No active queries left: unbind the query buffer and clear the flag. */
236 si_set_rw_shader_buffer(sctx
, GFX10_GS_QUERY_BUF
, NULL
);
237 sctx
->current_vs_state
&= C_VS_STATE_STREAMOUT_QUERY_ENABLED
;
239 /* If a query_begin is followed by a query_end without a draw
240 * in-between, we need to clear the atom to ensure that the
241 * next query_begin will re-initialize the shader buffer. */
242 si_set_atom_dirty(sctx
, &sctx
->atoms
.s
.shader_query
, false);
/* Accumulate one query-buffer entry's counters into `result`, interpreting
 * them according to the query type. Bit 63 of each counter is always set
 * for SET_PREDICATION compatibility, so it is masked off here.
 * NOTE(review): the `break;` statements between cases appear truncated from
 * this view — confirm against the full file. */
248 static void gfx10_sh_query_add_result(struct gfx10_sh_query
*query
,
249 struct gfx10_sh_query_buffer_mem
*qmem
,
250 union pipe_query_result
*result
)
/* Mask off the always-set predication bit (bit 63). */
252 static const uint64_t mask
= ((uint64_t)1 << 63) - 1;
254 switch (query
->b
.type
) {
255 case PIPE_QUERY_PRIMITIVES_EMITTED
:
256 result
->u64
+= qmem
->stream
[query
->stream
].emitted_primitives
& mask
;
258 case PIPE_QUERY_PRIMITIVES_GENERATED
:
259 result
->u64
+= qmem
->stream
[query
->stream
].generated_primitives
& mask
;
261 case PIPE_QUERY_SO_STATISTICS
:
262 result
->so_statistics
.num_primitives_written
+=
263 qmem
->stream
[query
->stream
].emitted_primitives
& mask
;
264 result
->so_statistics
.primitives_storage_needed
+=
265 qmem
->stream
[query
->stream
].generated_primitives
& mask
;
267 case PIPE_QUERY_SO_OVERFLOW_PREDICATE
:
268 result
->b
|= qmem
->stream
[query
->stream
].emitted_primitives
!=
269 qmem
->stream
[query
->stream
].generated_primitives
;
271 case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE
:
/* ANY_PREDICATE must test every stream. BUG FIX: the original indexed
 * qmem->stream[query->stream] here, leaving the loop variable unused and
 * checking only a single stream; index with `stream` instead. */
272 for (unsigned stream
= 0; stream
< SI_MAX_STREAMS
; ++stream
) {
273 result
->b
|= qmem
->stream
[stream
].emitted_primitives
!=
274 qmem
->stream
[stream
].generated_primitives
;
/* si_query_ops::get_result — walk the query's buffer range from `last`
 * backwards to `first`, map each buffer on the CPU, and accumulate every
 * entry in [results_begin, results_end) into `result`.
 * Returns false on an earlier OOM or on a failed map (map-failure handling
 * appears truncated from this view — confirm against the full file). */
282 static bool gfx10_sh_query_get_result(struct si_context
*sctx
, struct si_query
*rquery
, bool wait
,
283 union pipe_query_result
*result
)
285 struct gfx10_sh_query
*query
= (struct gfx10_sh_query
*)rquery
;
287 util_query_clear_result(result
, query
->b
.type
);
289 if (unlikely(!query
->first
))
290 return false; /* earlier out of memory error */
/* Iterate from the last buffer backwards; loop exit is at the bottom when
 * `first` has been processed. */
293 for (struct gfx10_sh_query_buffer
*qbuf
= query
->last
;;
294 qbuf
= LIST_ENTRY(struct gfx10_sh_query_buffer
, qbuf
->list
.prev
, list
)) {
295 unsigned usage
= PIPE_TRANSFER_READ
| (wait
? 0 : PIPE_TRANSFER_DONTBLOCK
);
/* If already flushed, a plain map suffices; otherwise synchronize with
 * in-flight command streams. */
298 if (rquery
->b
.flushed
)
299 map
= sctx
->ws
->buffer_map(qbuf
->buf
->buf
, NULL
, usage
);
301 map
= si_buffer_map_sync_with_rings(sctx
, qbuf
->buf
, usage
);
/* Entry range within this buffer: whole buffer by default, narrowed at
 * the query's first/last buffer by its begin/end offsets. */
306 unsigned results_begin
= 0;
307 unsigned results_end
= qbuf
->head
;
308 if (qbuf
== query
->first
)
309 results_begin
= query
->first_begin
;
310 if (qbuf
== query
->last
)
311 results_end
= query
->last_end
;
313 while (results_begin
!= results_end
) {
314 struct gfx10_sh_query_buffer_mem
*qmem
= map
+ results_begin
;
315 results_begin
+= sizeof(*qmem
);
317 gfx10_sh_query_add_result(query
, qmem
, result
);
/* Done once the first (oldest) buffer has been accumulated. */
320 if (qbuf
== query
->first
)
/* si_query_ops::get_result_resource — resolve the query on the GPU with a
 * compute shader, writing the result into `resource` at `offset`.
 * ssbo[0] = source entries, ssbo[1] = scratch accumulator (chained runs),
 * ssbo[2] = destination. One grid launch per ring buffer in [first, last].
 * NOTE(review): several lines (trailing parameters, consts fields, flag
 * setup, break statements) appear truncated from this view. */
327 static void gfx10_sh_query_get_result_resource(struct si_context
*sctx
, struct si_query
*rquery
,
328 bool wait
, enum pipe_query_value_type result_type
,
329 int index
, struct pipe_resource
*resource
,
332 struct gfx10_sh_query
*query
= (struct gfx10_sh_query
*)rquery
;
333 struct si_qbo_state saved_state
= {};
334 struct pipe_resource
*tmp_buffer
= NULL
;
335 unsigned tmp_buffer_offset
= 0;
/* Lazily create the resolve compute shader. */
337 if (!sctx
->sh_query_result_shader
) {
338 sctx
->sh_query_result_shader
= gfx10_create_sh_query_result_cs(sctx
);
339 if (!sctx
->sh_query_result_shader
)
/* Multi-buffer queries need zeroed scratch memory to carry partial sums
 * between grid launches. */
343 if (query
->first
!= query
->last
) {
344 u_suballocator_alloc(sctx
->allocator_zeroed_memory
, 16, 16, &tmp_buffer_offset
, &tmp_buffer
);
/* Save QBO-related state so we can restore it after the launches. */
349 si_save_qbo_state(sctx
, &saved_state
);
351 /* Pre-fill the constants configuring the shader behavior. */
356 uint32_t result_count
;
358 struct pipe_constant_buffer constant_buffer
= {};
/* Per-type source offset within an entry (4 dwords per stream group). */
361 switch (query
->b
.type
) {
362 case PIPE_QUERY_PRIMITIVES_GENERATED
:
363 consts
.offset
= sizeof(uint32_t) * query
->stream
;
366 case PIPE_QUERY_PRIMITIVES_EMITTED
:
367 consts
.offset
= sizeof(uint32_t) * (4 + query
->stream
);
370 case PIPE_QUERY_SO_STATISTICS
:
371 consts
.offset
= sizeof(uint32_t) * (4 * index
+ query
->stream
);
374 case PIPE_QUERY_SO_OVERFLOW_PREDICATE
:
375 consts
.offset
= sizeof(uint32_t) * query
->stream
;
378 case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE
:
383 unreachable("bad query type");
386 /* Check result availability. */
/* 64-bit results need a config flag (flag-setting line truncated here). */
391 if (result_type
== PIPE_QUERY_TYPE_I64
|| result_type
== PIPE_QUERY_TYPE_U64
)
394 constant_buffer
.buffer_size
= sizeof(consts
);
395 constant_buffer
.user_buffer
= &consts
;
397 /* Pre-fill the SSBOs and grid. */
398 struct pipe_shader_buffer ssbo
[3];
399 struct pipe_grid_info grid
= {};
/* ssbo[1]: 16-byte scratch accumulator shared across chained launches. */
401 ssbo
[1].buffer
= tmp_buffer
;
402 ssbo
[1].buffer_offset
= tmp_buffer_offset
;
403 ssbo
[1].buffer_size
= 16;
407 sctx
->b
.bind_compute_state(&sctx
->b
, sctx
->sh_query_result_shader
);
/* Walk the query's buffers oldest-to-newest, one launch each. */
416 struct gfx10_sh_query_buffer
*qbuf
= query
->first
;
418 unsigned begin
= qbuf
== query
->first
? query
->first_begin
: 0;
419 unsigned end
= qbuf
== query
->last
? query
->last_end
: qbuf
->buf
->b
.b
.width0
;
/* ssbo[0]: this buffer's slice of query entries. */
423 ssbo
[0].buffer
= &qbuf
->buf
->b
.b
;
424 ssbo
[0].buffer_offset
= begin
;
425 ssbo
[0].buffer_size
= end
- begin
;
427 consts
.result_count
= (end
- begin
) / sizeof(struct gfx10_sh_query_buffer_mem
);
/* Chain flags: not-first consumes the accumulator, not-last only feeds it
 * (flag assignments truncated from this view). */
429 if (qbuf
!= query
->first
)
431 if (qbuf
!= query
->last
)
/* Only the final launch writes the caller's destination buffer. */
434 if (qbuf
== query
->last
) {
435 ssbo
[2].buffer
= resource
;
436 ssbo
[2].buffer_offset
= offset
;
437 ssbo
[2].buffer_size
= 8;
440 sctx
->b
.set_constant_buffer(&sctx
->b
, PIPE_SHADER_COMPUTE
, 0, &constant_buffer
);
441 sctx
->b
.set_shader_buffers(&sctx
->b
, PIPE_SHADER_COMPUTE
, 0, 3, ssbo
, 0x6);
446 /* Wait for result availability. Wait only for readiness
447 * of the last entry, since the fence writes should be
448 * serialized in the CP.
450 va
= qbuf
->buf
->gpu_address
;
451 va
+= end
- sizeof(struct gfx10_sh_query_buffer_mem
);
452 va
+= offsetof(struct gfx10_sh_query_buffer_mem
, fence
);
/* CP-side busy-wait until the entry's fence dword becomes non-zero. */
454 si_cp_wait_mem(sctx
, sctx
->gfx_cs
, va
, 0x00000001, 0x00000001, 0);
457 sctx
->b
.launch_grid(&sctx
->b
, &grid
);
/* Serialize chained launches through the scratch accumulator. */
458 sctx
->flags
|= SI_CONTEXT_CS_PARTIAL_FLUSH
;
460 if (qbuf
== query
->last
)
462 qbuf
= LIST_ENTRY(struct gfx10_sh_query_buffer
, qbuf
->list
.next
, list
);
465 si_restore_qbo_state(sctx
, &saved_state
);
466 pipe_resource_reference(&tmp_buffer
, NULL
);
/* Vtable hooking the shader-based query implementation into the common
 * si_query machinery (installed on each query in gfx10_sh_query_create). */
469 static const struct si_query_ops gfx10_sh_query_ops
= {
470 .destroy
= gfx10_sh_query_destroy
,
471 .begin
= gfx10_sh_query_begin
,
472 .end
= gfx10_sh_query_end
,
473 .get_result
= gfx10_sh_query_get_result
,
474 .get_result_resource
= gfx10_sh_query_get_result_resource
,
/* Public constructor: allocate a zero-initialized shader-based query of the
 * given type, bound to stream `index`. Returns NULL on allocation failure
 * (the NULL return line appears truncated from this view). */
477 struct pipe_query
*gfx10_sh_query_create(struct si_screen
*screen
, enum pipe_query_type query_type
,
480 struct gfx10_sh_query
*query
= CALLOC_STRUCT(gfx10_sh_query
);
481 if (unlikely(!query
))
484 query
->b
.ops
= &gfx10_sh_query_ops
;
485 query
->b
.type
= query_type
;
486 query
->stream
= index
;
488 return (struct pipe_query
*)query
;
/* Per-context init: empty the query-buffer ring and install the atom emit
 * callback that advances the buffer head after each (re)bind. */
491 void gfx10_init_query(struct si_context
*sctx
)
493 list_inithead(&sctx
->shader_query_buffers
);
494 sctx
->atoms
.s
.shader_query
.emit
= emit_shader_query
;
/* Per-context teardown: unlink and free every query buffer remaining in the
 * ring. All queries must have been destroyed first (refcount == 0).
 * NOTE(review): the FREE(qbuf) line and closing braces appear truncated
 * from this view — confirm against the full file. */
497 void gfx10_destroy_query(struct si_context
*sctx
)
499 while (!list_is_empty(&sctx
->shader_query_buffers
)) {
500 struct gfx10_sh_query_buffer
*qbuf
=
501 list_first_entry(&sctx
->shader_query_buffers
, struct gfx10_sh_query_buffer
, list
);
502 list_del(&qbuf
->list
);
504 assert(!qbuf
->refcount
);
505 si_resource_reference(&qbuf
->buf
, NULL
);