/*
 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
 * Copyright 2014 Marek Olšák <marek.olsak@amd.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
#include "r600_query.h"
#include "r600_cs.h"
#include "util/u_memory.h"
#include "util/u_upload_mgr.h"

#include "tgsi/tgsi_text.h"
struct r600_hw_query_params {
	unsigned start_offset;
	unsigned end_offset;
	unsigned fence_offset;
	unsigned pair_stride;
	unsigned pair_count;
};
/* Queries without buffer handling or suspend/resume. */
struct r600_query_sw {
	struct r600_query b;

	uint64_t begin_result;
	uint64_t end_result;
	/* Fence for GPU_FINISHED. */
	struct pipe_fence_handle *fence;
};
static void r600_query_sw_destroy(struct r600_common_context *rctx,
				  struct r600_query *rquery)
{
	struct pipe_screen *screen = rctx->b.screen;
	struct r600_query_sw *query = (struct r600_query_sw *)rquery;

	screen->fence_reference(screen, &query->fence, NULL);
	FREE(query);
}
static enum radeon_value_id winsys_id_from_type(unsigned type)
{
	switch (type) {
	case R600_QUERY_REQUESTED_VRAM: return RADEON_REQUESTED_VRAM_MEMORY;
	case R600_QUERY_REQUESTED_GTT: return RADEON_REQUESTED_GTT_MEMORY;
	case R600_QUERY_MAPPED_VRAM: return RADEON_MAPPED_VRAM;
	case R600_QUERY_MAPPED_GTT: return RADEON_MAPPED_GTT;
	case R600_QUERY_BUFFER_WAIT_TIME: return RADEON_BUFFER_WAIT_TIME_NS;
	case R600_QUERY_NUM_CTX_FLUSHES: return RADEON_NUM_CS_FLUSHES;
	case R600_QUERY_NUM_BYTES_MOVED: return RADEON_NUM_BYTES_MOVED;
	case R600_QUERY_NUM_EVICTIONS: return RADEON_NUM_EVICTIONS;
	case R600_QUERY_VRAM_USAGE: return RADEON_VRAM_USAGE;
	case R600_QUERY_GTT_USAGE: return RADEON_GTT_USAGE;
	case R600_QUERY_GPU_TEMPERATURE: return RADEON_GPU_TEMPERATURE;
	case R600_QUERY_CURRENT_GPU_SCLK: return RADEON_CURRENT_SCLK;
	case R600_QUERY_CURRENT_GPU_MCLK: return RADEON_CURRENT_MCLK;
	default: unreachable("query type does not correspond to winsys id");
	}
}
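/*
 * The winsys-backed queries above snapshot a cumulative counter in both
 * r600_query_sw_begin and r600_query_sw_end; r600_query_sw_get_result then
 * reports end_result - begin_result. A minimal usage sketch through the
 * Gallium interface (illustrative only; `pipe` stands for some pipe_context):
 *
 *    struct pipe_query *q =
 *       pipe->create_query(pipe, R600_QUERY_NUM_BYTES_MOVED, 0);
 *    pipe->begin_query(pipe, q);
 *    // ... rendering ...
 *    pipe->end_query(pipe, q);
 *
 *    union pipe_query_result r;
 *    pipe->get_query_result(pipe, q, true, &r); // r.u64 = bytes moved
 *    pipe->destroy_query(pipe, q);
 */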
static bool r600_query_sw_begin(struct r600_common_context *rctx,
				struct r600_query *rquery)
{
	struct r600_query_sw *query = (struct r600_query_sw *)rquery;

	switch (query->b.type) {
	case PIPE_QUERY_TIMESTAMP_DISJOINT:
	case PIPE_QUERY_GPU_FINISHED:
		break;
	case R600_QUERY_DRAW_CALLS:
		query->begin_result = rctx->num_draw_calls;
		break;
	case R600_QUERY_SPILL_DRAW_CALLS:
		query->begin_result = rctx->num_spill_draw_calls;
		break;
	case R600_QUERY_COMPUTE_CALLS:
		query->begin_result = rctx->num_compute_calls;
		break;
	case R600_QUERY_SPILL_COMPUTE_CALLS:
		query->begin_result = rctx->num_spill_compute_calls;
		break;
	case R600_QUERY_DMA_CALLS:
		query->begin_result = rctx->num_dma_calls;
		break;
	case R600_QUERY_CP_DMA_CALLS:
		query->begin_result = rctx->num_cp_dma_calls;
		break;
	case R600_QUERY_NUM_VS_FLUSHES:
		query->begin_result = rctx->num_vs_flushes;
		break;
	case R600_QUERY_NUM_PS_FLUSHES:
		query->begin_result = rctx->num_ps_flushes;
		break;
	case R600_QUERY_NUM_CS_FLUSHES:
		query->begin_result = rctx->num_cs_flushes;
		break;
	case R600_QUERY_REQUESTED_VRAM:
	case R600_QUERY_REQUESTED_GTT:
	case R600_QUERY_MAPPED_VRAM:
	case R600_QUERY_MAPPED_GTT:
	case R600_QUERY_VRAM_USAGE:
	case R600_QUERY_GTT_USAGE:
	case R600_QUERY_GPU_TEMPERATURE:
	case R600_QUERY_CURRENT_GPU_SCLK:
	case R600_QUERY_CURRENT_GPU_MCLK:
	case R600_QUERY_BACK_BUFFER_PS_DRAW_RATIO:
		query->begin_result = 0;
		break;
	case R600_QUERY_BUFFER_WAIT_TIME:
	case R600_QUERY_NUM_CTX_FLUSHES:
	case R600_QUERY_NUM_BYTES_MOVED:
	case R600_QUERY_NUM_EVICTIONS: {
		enum radeon_value_id ws_id = winsys_id_from_type(query->b.type);
		query->begin_result = rctx->ws->query_value(rctx->ws, ws_id);
		break;
	}
	case R600_QUERY_GPU_LOAD:
		query->begin_result = r600_gpu_load_begin(rctx->screen);
		break;
	case R600_QUERY_NUM_COMPILATIONS:
		query->begin_result = p_atomic_read(&rctx->screen->num_compilations);
		break;
	case R600_QUERY_NUM_SHADERS_CREATED:
		query->begin_result = p_atomic_read(&rctx->screen->num_shaders_created);
		break;
	case R600_QUERY_NUM_SHADER_CACHE_HITS:
		query->begin_result =
			p_atomic_read(&rctx->screen->num_shader_cache_hits);
		break;
	case R600_QUERY_GPIN_ASIC_ID:
	case R600_QUERY_GPIN_NUM_SIMD:
	case R600_QUERY_GPIN_NUM_RB:
	case R600_QUERY_GPIN_NUM_SPI:
	case R600_QUERY_GPIN_NUM_SE:
		break;
	default:
		unreachable("r600_query_sw_begin: bad query type");
	}

	return true;
}
static bool r600_query_sw_end(struct r600_common_context *rctx,
			      struct r600_query *rquery)
{
	struct r600_query_sw *query = (struct r600_query_sw *)rquery;

	switch (query->b.type) {
	case PIPE_QUERY_TIMESTAMP_DISJOINT:
		break;
	case PIPE_QUERY_GPU_FINISHED:
		rctx->b.flush(&rctx->b, &query->fence, PIPE_FLUSH_DEFERRED);
		break;
	case R600_QUERY_DRAW_CALLS:
		query->end_result = rctx->num_draw_calls;
		break;
	case R600_QUERY_SPILL_DRAW_CALLS:
		query->end_result = rctx->num_spill_draw_calls;
		break;
	case R600_QUERY_COMPUTE_CALLS:
		query->end_result = rctx->num_compute_calls;
		break;
	case R600_QUERY_SPILL_COMPUTE_CALLS:
		query->end_result = rctx->num_spill_compute_calls;
		break;
	case R600_QUERY_DMA_CALLS:
		query->end_result = rctx->num_dma_calls;
		break;
	case R600_QUERY_CP_DMA_CALLS:
		query->end_result = rctx->num_cp_dma_calls;
		break;
	case R600_QUERY_NUM_VS_FLUSHES:
		query->end_result = rctx->num_vs_flushes;
		break;
	case R600_QUERY_NUM_PS_FLUSHES:
		query->end_result = rctx->num_ps_flushes;
		break;
	case R600_QUERY_NUM_CS_FLUSHES:
		query->end_result = rctx->num_cs_flushes;
		break;
	case R600_QUERY_REQUESTED_VRAM:
	case R600_QUERY_REQUESTED_GTT:
	case R600_QUERY_MAPPED_VRAM:
	case R600_QUERY_MAPPED_GTT:
	case R600_QUERY_VRAM_USAGE:
	case R600_QUERY_GTT_USAGE:
	case R600_QUERY_GPU_TEMPERATURE:
	case R600_QUERY_CURRENT_GPU_SCLK:
	case R600_QUERY_CURRENT_GPU_MCLK:
	case R600_QUERY_BUFFER_WAIT_TIME:
	case R600_QUERY_NUM_CTX_FLUSHES:
	case R600_QUERY_NUM_BYTES_MOVED:
	case R600_QUERY_NUM_EVICTIONS: {
		enum radeon_value_id ws_id = winsys_id_from_type(query->b.type);
		query->end_result = rctx->ws->query_value(rctx->ws, ws_id);
		break;
	}
	case R600_QUERY_GPU_LOAD:
		query->end_result = r600_gpu_load_end(rctx->screen,
						      query->begin_result);
		query->begin_result = 0;
		break;
	case R600_QUERY_NUM_COMPILATIONS:
		query->end_result = p_atomic_read(&rctx->screen->num_compilations);
		break;
	case R600_QUERY_NUM_SHADERS_CREATED:
		query->end_result = p_atomic_read(&rctx->screen->num_shaders_created);
		break;
	case R600_QUERY_BACK_BUFFER_PS_DRAW_RATIO:
		query->end_result = rctx->last_tex_ps_draw_ratio;
		break;
	case R600_QUERY_NUM_SHADER_CACHE_HITS:
		query->end_result =
			p_atomic_read(&rctx->screen->num_shader_cache_hits);
		break;
	case R600_QUERY_GPIN_ASIC_ID:
	case R600_QUERY_GPIN_NUM_SIMD:
	case R600_QUERY_GPIN_NUM_RB:
	case R600_QUERY_GPIN_NUM_SPI:
	case R600_QUERY_GPIN_NUM_SE:
		break;
	default:
		unreachable("r600_query_sw_end: bad query type");
	}

	return true;
}
static bool r600_query_sw_get_result(struct r600_common_context *rctx,
				     struct r600_query *rquery,
				     bool wait,
				     union pipe_query_result *result)
{
	struct r600_query_sw *query = (struct r600_query_sw *)rquery;

	switch (query->b.type) {
	case PIPE_QUERY_TIMESTAMP_DISJOINT:
		/* Convert from cycles per millisecond to cycles per second (Hz). */
		result->timestamp_disjoint.frequency =
			(uint64_t)rctx->screen->info.clock_crystal_freq * 1000;
		result->timestamp_disjoint.disjoint = false;
		return true;
	case PIPE_QUERY_GPU_FINISHED: {
		struct pipe_screen *screen = rctx->b.screen;
		result->b = screen->fence_finish(screen, &rctx->b, query->fence,
						 wait ? PIPE_TIMEOUT_INFINITE : 0);
		return result->b;
	}
	case R600_QUERY_GPIN_ASIC_ID:
		result->u32 = 0;
		return true;
	case R600_QUERY_GPIN_NUM_SIMD:
		result->u32 = rctx->screen->info.num_good_compute_units;
		return true;
	case R600_QUERY_GPIN_NUM_RB:
		result->u32 = rctx->screen->info.num_render_backends;
		return true;
	case R600_QUERY_GPIN_NUM_SPI:
		result->u32 = 1; /* all supported chips have one SPI per SE */
		return true;
	case R600_QUERY_GPIN_NUM_SE:
		result->u32 = rctx->screen->info.max_se;
		return true;
	}

	result->u64 = query->end_result - query->begin_result;

	switch (query->b.type) {
	case R600_QUERY_BUFFER_WAIT_TIME:
	case R600_QUERY_GPU_TEMPERATURE:
		result->u64 /= 1000;
		break;
	case R600_QUERY_CURRENT_GPU_SCLK:
	case R600_QUERY_CURRENT_GPU_MCLK:
		result->u64 *= 1000000;
		break;
	}

	return true;
}
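/*
 * A note on the conversions in r600_query_sw_get_result, assuming the winsys
 * reports wait time in ns, temperature in millidegrees Celsius and clocks
 * in MHz:
 *   buffer-wait-time:     ns  -> us  (u64 /= 1000)
 *   temperature:          m°C -> °C  (u64 /= 1000)
 *   shader/memory clock:  MHz -> Hz  (u64 *= 1000000)
 */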
static struct r600_query_ops sw_query_ops = {
	.destroy = r600_query_sw_destroy,
	.begin = r600_query_sw_begin,
	.end = r600_query_sw_end,
	.get_result = r600_query_sw_get_result,
	.get_result_resource = NULL
};
static struct pipe_query *r600_query_sw_create(struct pipe_context *ctx,
					       unsigned query_type)
{
	struct r600_query_sw *query;

	query = CALLOC_STRUCT(r600_query_sw);
	if (!query)
		return NULL;

	query->b.type = query_type;
	query->b.ops = &sw_query_ops;

	return (struct pipe_query *)query;
}
void r600_query_hw_destroy(struct r600_common_context *rctx,
			   struct r600_query *rquery)
{
	struct r600_query_hw *query = (struct r600_query_hw *)rquery;
	struct r600_query_buffer *prev = query->buffer.previous;

	/* Release all query buffers. */
	while (prev) {
		struct r600_query_buffer *qbuf = prev;
		prev = prev->previous;
		r600_resource_reference(&qbuf->buf, NULL);
		FREE(qbuf);
	}

	r600_resource_reference(&query->buffer.buf, NULL);
	FREE(rquery);
}
static struct r600_resource *r600_new_query_buffer(struct r600_common_context *ctx,
						   struct r600_query_hw *query)
{
	unsigned buf_size = MAX2(query->result_size,
				 ctx->screen->info.min_alloc_size);

	/* Queries are normally read by the CPU after
	 * being written by the GPU, hence staging is probably a good
	 * usage pattern.
	 */
	struct r600_resource *buf = (struct r600_resource*)
		pipe_buffer_create(ctx->b.screen, 0,
				   PIPE_USAGE_STAGING, buf_size);
	if (!buf)
		return NULL;

	if (!query->ops->prepare_buffer(ctx, query, buf)) {
		r600_resource_reference(&buf, NULL);
		return NULL;
	}

	return buf;
}
static bool r600_query_hw_prepare_buffer(struct r600_common_context *ctx,
					 struct r600_query_hw *query,
					 struct r600_resource *buffer)
{
	/* Callers ensure that the buffer is currently unused by the GPU. */
	uint32_t *results = ctx->ws->buffer_map(buffer->buf, NULL,
						PIPE_TRANSFER_WRITE |
						PIPE_TRANSFER_UNSYNCHRONIZED);
	if (!results)
		return false;

	memset(results, 0, buffer->b.b.width0);

	if (query->b.type == PIPE_QUERY_OCCLUSION_COUNTER ||
	    query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE) {
		unsigned num_results;
		unsigned i, j;

		/* Set top bits for unused backends. */
		num_results = buffer->b.b.width0 / query->result_size;
		for (j = 0; j < num_results; j++) {
			for (i = 0; i < ctx->max_db; i++) {
				if (!(ctx->backend_mask & (1<<i))) {
					results[(i * 4)+1] = 0x80000000;
					results[(i * 4)+3] = 0x80000000;
				}
			}
			results += 4 * ctx->max_db;
		}
	}

	return true;
}
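/*
 * Layout of one occlusion-query slot as assumed above: each of the
 * ctx->max_db render backends writes a (begin, end) pair of 64-bit ZPASS
 * counts, i.e. for backend i:
 *
 *    results[i*4 + 0..1] = begin count (bit 63 set when written)
 *    results[i*4 + 2..3] = end count   (bit 63 set when written)
 *
 * Pre-setting bit 63 (0x80000000 in the high dword) for backends that are
 * not in ctx->backend_mask makes r600_query_read_result() treat them as
 * ready with a contribution of zero.
 */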
static void r600_query_hw_get_result_resource(struct r600_common_context *rctx,
					      struct r600_query *rquery,
					      bool wait,
					      enum pipe_query_value_type result_type,
					      int index,
					      struct pipe_resource *resource,
					      unsigned offset);
static struct r600_query_ops query_hw_ops = {
	.destroy = r600_query_hw_destroy,
	.begin = r600_query_hw_begin,
	.end = r600_query_hw_end,
	.get_result = r600_query_hw_get_result,
	.get_result_resource = r600_query_hw_get_result_resource,
};
static void r600_query_hw_do_emit_start(struct r600_common_context *ctx,
					struct r600_query_hw *query,
					struct r600_resource *buffer,
					uint64_t va);
static void r600_query_hw_do_emit_stop(struct r600_common_context *ctx,
				       struct r600_query_hw *query,
				       struct r600_resource *buffer,
				       uint64_t va);
static void r600_query_hw_add_result(struct r600_common_context *ctx,
				     struct r600_query_hw *, void *buffer,
				     union pipe_query_result *result);
static void r600_query_hw_clear_result(struct r600_query_hw *,
				       union pipe_query_result *);
static struct r600_query_hw_ops query_hw_default_hw_ops = {
	.prepare_buffer = r600_query_hw_prepare_buffer,
	.emit_start = r600_query_hw_do_emit_start,
	.emit_stop = r600_query_hw_do_emit_stop,
	.clear_result = r600_query_hw_clear_result,
	.add_result = r600_query_hw_add_result,
};
bool r600_query_hw_init(struct r600_common_context *rctx,
			struct r600_query_hw *query)
{
	query->buffer.buf = r600_new_query_buffer(rctx, query);
	if (!query->buffer.buf)
		return false;

	return true;
}
static struct pipe_query *r600_query_hw_create(struct r600_common_context *rctx,
					       unsigned query_type,
					       unsigned index)
{
	struct r600_query_hw *query = CALLOC_STRUCT(r600_query_hw);
	if (!query)
		return NULL;

	query->b.type = query_type;
	query->b.ops = &query_hw_ops;
	query->ops = &query_hw_default_hw_ops;

	switch (query_type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
	case PIPE_QUERY_OCCLUSION_PREDICATE:
		query->result_size = 16 * rctx->max_db;
		query->result_size += 16; /* for the fence + alignment */
		query->num_cs_dw_begin = 6;
		query->num_cs_dw_end = 6 + r600_gfx_write_fence_dwords(rctx->screen);
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		query->result_size = 24;
		query->num_cs_dw_begin = 8;
		query->num_cs_dw_end = 8 + r600_gfx_write_fence_dwords(rctx->screen);
		break;
	case PIPE_QUERY_TIMESTAMP:
		query->result_size = 16;
		query->num_cs_dw_end = 8 + r600_gfx_write_fence_dwords(rctx->screen);
		query->flags = R600_QUERY_HW_FLAG_NO_START;
		break;
	case PIPE_QUERY_PRIMITIVES_EMITTED:
	case PIPE_QUERY_PRIMITIVES_GENERATED:
	case PIPE_QUERY_SO_STATISTICS:
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		/* NumPrimitivesWritten, PrimitiveStorageNeeded. */
		query->result_size = 32;
		query->num_cs_dw_begin = 6;
		query->num_cs_dw_end = 6;
		query->stream = index;
		break;
	case PIPE_QUERY_PIPELINE_STATISTICS:
		/* 11 values on EG, 8 on R600. */
		query->result_size = (rctx->chip_class >= EVERGREEN ? 11 : 8) * 16;
		query->result_size += 8; /* for the fence + alignment */
		query->num_cs_dw_begin = 6;
		query->num_cs_dw_end = 6 + r600_gfx_write_fence_dwords(rctx->screen);
		break;
	default:
		assert(0);
		FREE(query);
		return NULL;
	}

	if (!r600_query_hw_init(rctx, query)) {
		FREE(query);
		return NULL;
	}

	return (struct pipe_query *)query;
}
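/*
 * A worked example of the sizing above: with max_db = 8, an occlusion query
 * needs 16 bytes (begin + end counts, 8 bytes each) per DB, so result_size =
 * 16 * 8 + 16 = 144 bytes per snapshot, the extra 16 covering the fence
 * dword plus alignment.
 */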
static void r600_update_occlusion_query_state(struct r600_common_context *rctx,
					      unsigned type, int diff)
{
	if (type == PIPE_QUERY_OCCLUSION_COUNTER ||
	    type == PIPE_QUERY_OCCLUSION_PREDICATE) {
		bool old_enable = rctx->num_occlusion_queries != 0;
		bool old_perfect_enable =
			rctx->num_perfect_occlusion_queries != 0;
		bool enable, perfect_enable;

		rctx->num_occlusion_queries += diff;
		assert(rctx->num_occlusion_queries >= 0);

		if (type == PIPE_QUERY_OCCLUSION_COUNTER) {
			rctx->num_perfect_occlusion_queries += diff;
			assert(rctx->num_perfect_occlusion_queries >= 0);
		}

		enable = rctx->num_occlusion_queries != 0;
		perfect_enable = rctx->num_perfect_occlusion_queries != 0;

		if (enable != old_enable || perfect_enable != old_perfect_enable) {
			rctx->set_occlusion_query_state(&rctx->b, enable);
		}
	}
}
static unsigned event_type_for_stream(struct r600_query_hw *query)
{
	switch (query->stream) {
	default:
	case 0: return EVENT_TYPE_SAMPLE_STREAMOUTSTATS;
	case 1: return EVENT_TYPE_SAMPLE_STREAMOUTSTATS1;
	case 2: return EVENT_TYPE_SAMPLE_STREAMOUTSTATS2;
	case 3: return EVENT_TYPE_SAMPLE_STREAMOUTSTATS3;
	}
}
static void r600_query_hw_do_emit_start(struct r600_common_context *ctx,
					struct r600_query_hw *query,
					struct r600_resource *buffer,
					uint64_t va)
{
	struct radeon_winsys_cs *cs = ctx->gfx.cs;

	switch (query->b.type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
	case PIPE_QUERY_OCCLUSION_PREDICATE:
		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1));
		radeon_emit(cs, va);
		radeon_emit(cs, (va >> 32) & 0xFFFF);
		break;
	case PIPE_QUERY_PRIMITIVES_EMITTED:
	case PIPE_QUERY_PRIMITIVES_GENERATED:
	case PIPE_QUERY_SO_STATISTICS:
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
		radeon_emit(cs, EVENT_TYPE(event_type_for_stream(query)) | EVENT_INDEX(3));
		radeon_emit(cs, va);
		radeon_emit(cs, (va >> 32) & 0xFFFF);
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		r600_gfx_write_event_eop(ctx, EVENT_TYPE_BOTTOM_OF_PIPE_TS,
					 0, 3, NULL, va, 0, 0);
		break;
	case PIPE_QUERY_PIPELINE_STATISTICS:
		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
		radeon_emit(cs, va);
		radeon_emit(cs, (va >> 32) & 0xFFFF);
		break;
	default:
		assert(0);
	}
	r600_emit_reloc(ctx, &ctx->gfx, query->buffer.buf, RADEON_USAGE_WRITE,
			RADEON_PRIO_QUERY);
}
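/*
 * The PKT3_EVENT_WRITE packets above carry a count of 2, i.e. three body
 * dwords: the event type/index, the low 32 bits of the destination address,
 * and the upper address bits masked to 16 bits (a sketch of the packet
 * layout as used here, not a full description of the CP packet format).
 */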
static void r600_query_hw_emit_start(struct r600_common_context *ctx,
				     struct r600_query_hw *query)
{
	uint64_t va;

	if (!query->buffer.buf)
		return; // previous buffer allocation failure

	r600_update_occlusion_query_state(ctx, query->b.type, 1);
	r600_update_prims_generated_query_state(ctx, query->b.type, 1);

	ctx->need_gfx_cs_space(&ctx->b, query->num_cs_dw_begin + query->num_cs_dw_end,
			       true);

	/* Get a new query buffer if needed. */
	if (query->buffer.results_end + query->result_size > query->buffer.buf->b.b.width0) {
		struct r600_query_buffer *qbuf = MALLOC_STRUCT(r600_query_buffer);
		*qbuf = query->buffer;
		query->buffer.results_end = 0;
		query->buffer.previous = qbuf;
		query->buffer.buf = r600_new_query_buffer(ctx, query);
		if (!query->buffer.buf)
			return;
	}

	/* emit begin query */
	va = query->buffer.buf->gpu_address + query->buffer.results_end;

	query->ops->emit_start(ctx, query, query->buffer.buf, va);

	ctx->num_cs_dw_queries_suspend += query->num_cs_dw_end;
}
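/*
 * Result buffers form a singly linked list that grows at the head: when the
 * current buffer is full, it is pushed onto query->buffer.previous and a
 * fresh buffer becomes query->buffer.buf, e.g.
 *
 *    query->buffer (newest) -> previous -> ... -> NULL (oldest)
 *
 * Readers (r600_query_hw_get_result, predication and the result-resource
 * path) walk this list and accumulate every snapshot.
 */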
static void r600_query_hw_do_emit_stop(struct r600_common_context *ctx,
				       struct r600_query_hw *query,
				       struct r600_resource *buffer,
				       uint64_t va)
{
	struct radeon_winsys_cs *cs = ctx->gfx.cs;
	uint64_t fence_va = 0;

	switch (query->b.type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
	case PIPE_QUERY_OCCLUSION_PREDICATE:
		va += 8;
		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1));
		radeon_emit(cs, va);
		radeon_emit(cs, (va >> 32) & 0xFFFF);

		fence_va = va + ctx->max_db * 16 - 8;
		break;
	case PIPE_QUERY_PRIMITIVES_EMITTED:
	case PIPE_QUERY_PRIMITIVES_GENERATED:
	case PIPE_QUERY_SO_STATISTICS:
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		va += query->result_size/2;
		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
		radeon_emit(cs, EVENT_TYPE(event_type_for_stream(query)) | EVENT_INDEX(3));
		radeon_emit(cs, va);
		radeon_emit(cs, (va >> 32) & 0xFFFF);
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		va += 8;
		/* fall through */
	case PIPE_QUERY_TIMESTAMP:
		r600_gfx_write_event_eop(ctx, EVENT_TYPE_BOTTOM_OF_PIPE_TS,
					 0, 3, NULL, va, 0, 0);
		fence_va = va + 8;
		break;
	case PIPE_QUERY_PIPELINE_STATISTICS: {
		unsigned sample_size = (query->result_size - 8) / 2;

		va += sample_size;
		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
		radeon_emit(cs, va);
		radeon_emit(cs, (va >> 32) & 0xFFFF);

		fence_va = va + sample_size;
		break;
	}
	default:
		assert(0);
	}
	r600_emit_reloc(ctx, &ctx->gfx, query->buffer.buf, RADEON_USAGE_WRITE,
			RADEON_PRIO_QUERY);

	if (fence_va)
		r600_gfx_write_event_eop(ctx, EVENT_TYPE_BOTTOM_OF_PIPE_TS, 0, 1,
					 query->buffer.buf, fence_va, 0, 0x80000000);
}
static void r600_query_hw_emit_stop(struct r600_common_context *ctx,
				    struct r600_query_hw *query)
{
	uint64_t va;

	if (!query->buffer.buf)
		return; // previous buffer allocation failure

	/* The queries which need begin already called this in begin_query. */
	if (query->flags & R600_QUERY_HW_FLAG_NO_START) {
		ctx->need_gfx_cs_space(&ctx->b, query->num_cs_dw_end, false);
	}

	/* emit end query */
	va = query->buffer.buf->gpu_address + query->buffer.results_end;

	query->ops->emit_stop(ctx, query, query->buffer.buf, va);

	query->buffer.results_end += query->result_size;

	if (!(query->flags & R600_QUERY_HW_FLAG_NO_START))
		ctx->num_cs_dw_queries_suspend -= query->num_cs_dw_end;

	r600_update_occlusion_query_state(ctx, query->b.type, -1);
	r600_update_prims_generated_query_state(ctx, query->b.type, -1);
}
static void r600_emit_query_predication(struct r600_common_context *ctx,
					struct r600_atom *atom)
{
	struct radeon_winsys_cs *cs = ctx->gfx.cs;
	struct r600_query_hw *query = (struct r600_query_hw *)ctx->render_cond;
	struct r600_query_buffer *qbuf;
	uint32_t op;
	bool flag_wait;

	if (!query)
		return;

	flag_wait = ctx->render_cond_mode == PIPE_RENDER_COND_WAIT ||
		    ctx->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT;

	switch (query->b.type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
	case PIPE_QUERY_OCCLUSION_PREDICATE:
		op = PRED_OP(PREDICATION_OP_ZPASS);
		break;
	case PIPE_QUERY_PRIMITIVES_EMITTED:
	case PIPE_QUERY_PRIMITIVES_GENERATED:
	case PIPE_QUERY_SO_STATISTICS:
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		op = PRED_OP(PREDICATION_OP_PRIMCOUNT);
		break;
	default:
		assert(0);
		return;
	}

	/* if true then invert, see GL_ARB_conditional_render_inverted */
	if (ctx->render_cond_invert)
		op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visible/overflow */
	else
		op |= PREDICATION_DRAW_VISIBLE; /* Draw if visible/overflow */

	op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW;

	/* emit predicate packets for all data blocks */
	for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
		unsigned results_base = 0;
		uint64_t va = qbuf->buf->gpu_address;

		while (results_base < qbuf->results_end) {
			radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 1, 0));
			radeon_emit(cs, va + results_base);
			radeon_emit(cs, op | (((va + results_base) >> 32) & 0xFF));
			r600_emit_reloc(ctx, &ctx->gfx, qbuf->buf, RADEON_USAGE_READ,
					RADEON_PRIO_QUERY);
			results_base += query->result_size;

			/* set CONTINUE bit for all packets except the first */
			op |= PREDICATION_CONTINUE;
		}
	}
}
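/*
 * Each SET_PREDICATION packet above covers one result slot: body dword 0 is
 * the low 32 bits of the result VA, dword 1 carries the operation plus the
 * high VA bits. After the first packet, PREDICATION_CONTINUE tells the CP to
 * combine the remaining slots into the same predicate, which is why a query
 * spanning several buffers still yields a single render condition.
 */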
static struct pipe_query *r600_create_query(struct pipe_context *ctx, unsigned query_type, unsigned index)
{
	struct r600_common_context *rctx = (struct r600_common_context *)ctx;

	if (query_type == PIPE_QUERY_TIMESTAMP_DISJOINT ||
	    query_type == PIPE_QUERY_GPU_FINISHED ||
	    query_type >= PIPE_QUERY_DRIVER_SPECIFIC)
		return r600_query_sw_create(ctx, query_type);

	return r600_query_hw_create(rctx, query_type, index);
}
static void r600_destroy_query(struct pipe_context *ctx, struct pipe_query *query)
{
	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
	struct r600_query *rquery = (struct r600_query *)query;

	rquery->ops->destroy(rctx, rquery);
}
static boolean r600_begin_query(struct pipe_context *ctx,
				struct pipe_query *query)
{
	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
	struct r600_query *rquery = (struct r600_query *)query;

	return rquery->ops->begin(rctx, rquery);
}
void r600_query_hw_reset_buffers(struct r600_common_context *rctx,
				 struct r600_query_hw *query)
{
	struct r600_query_buffer *prev = query->buffer.previous;

	/* Discard the old query buffers. */
	while (prev) {
		struct r600_query_buffer *qbuf = prev;
		prev = prev->previous;
		r600_resource_reference(&qbuf->buf, NULL);
		FREE(qbuf);
	}

	query->buffer.results_end = 0;
	query->buffer.previous = NULL;

	/* Obtain a new buffer if the current one can't be mapped without a stall. */
	if (r600_rings_is_buffer_referenced(rctx, query->buffer.buf->buf, RADEON_USAGE_READWRITE) ||
	    !rctx->ws->buffer_wait(query->buffer.buf->buf, 0, RADEON_USAGE_READWRITE)) {
		r600_resource_reference(&query->buffer.buf, NULL);
		query->buffer.buf = r600_new_query_buffer(rctx, query);
	} else {
		if (!query->ops->prepare_buffer(rctx, query, query->buffer.buf))
			r600_resource_reference(&query->buffer.buf, NULL);
	}
}
bool r600_query_hw_begin(struct r600_common_context *rctx,
			 struct r600_query *rquery)
{
	struct r600_query_hw *query = (struct r600_query_hw *)rquery;

	if (query->flags & R600_QUERY_HW_FLAG_NO_START) {
		assert(0);
		return false;
	}

	if (!(query->flags & R600_QUERY_HW_FLAG_BEGIN_RESUMES))
		r600_query_hw_reset_buffers(rctx, query);

	r600_query_hw_emit_start(rctx, query);
	if (!query->buffer.buf)
		return false;

	LIST_ADDTAIL(&query->list, &rctx->active_queries);
	return true;
}
static bool r600_end_query(struct pipe_context *ctx, struct pipe_query *query)
{
	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
	struct r600_query *rquery = (struct r600_query *)query;

	return rquery->ops->end(rctx, rquery);
}
bool r600_query_hw_end(struct r600_common_context *rctx,
		       struct r600_query *rquery)
{
	struct r600_query_hw *query = (struct r600_query_hw *)rquery;

	if (query->flags & R600_QUERY_HW_FLAG_NO_START)
		r600_query_hw_reset_buffers(rctx, query);

	r600_query_hw_emit_stop(rctx, query);

	if (!(query->flags & R600_QUERY_HW_FLAG_NO_START))
		LIST_DELINIT(&query->list);

	if (!query->buffer.buf)
		return false;

	return true;
}
static void r600_get_hw_query_params(struct r600_common_context *rctx,
				     struct r600_query_hw *rquery, int index,
				     struct r600_hw_query_params *params)
{
	params->pair_stride = 0;
	params->pair_count = 1;

	switch (rquery->b.type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
	case PIPE_QUERY_OCCLUSION_PREDICATE:
		params->start_offset = 0;
		params->end_offset = 8;
		params->fence_offset = rctx->max_db * 16;
		params->pair_stride = 16;
		params->pair_count = rctx->max_db;
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		params->start_offset = 0;
		params->end_offset = 8;
		params->fence_offset = 16;
		break;
	case PIPE_QUERY_TIMESTAMP:
		params->start_offset = 0;
		params->end_offset = 0;
		params->fence_offset = 8;
		break;
	case PIPE_QUERY_PRIMITIVES_EMITTED:
		params->start_offset = 8;
		params->end_offset = 24;
		params->fence_offset = params->end_offset + 4;
		break;
	case PIPE_QUERY_PRIMITIVES_GENERATED:
		params->start_offset = 0;
		params->end_offset = 16;
		params->fence_offset = params->end_offset + 4;
		break;
	case PIPE_QUERY_SO_STATISTICS:
		params->start_offset = 8 - index * 8;
		params->end_offset = 24 - index * 8;
		params->fence_offset = params->end_offset + 4;
		break;
	case PIPE_QUERY_PIPELINE_STATISTICS:
	{
		/* Offsets apply to EG+ */
		static const unsigned offsets[] = {56, 48, 24, 32, 40, 16, 8, 0, 64, 72, 80};
		params->start_offset = offsets[index];
		params->end_offset = 88 + offsets[index];
		params->fence_offset = 2 * 88;
		break;
	}
	default:
		unreachable("r600_get_hw_query_params unsupported");
	}
}
static unsigned r600_query_read_result(void *map, unsigned start_index, unsigned end_index,
				       bool test_status_bit)
{
	uint32_t *current_result = (uint32_t*)map;
	uint64_t start, end;

	start = (uint64_t)current_result[start_index] |
		(uint64_t)current_result[start_index+1] << 32;
	end = (uint64_t)current_result[end_index] |
	      (uint64_t)current_result[end_index+1] << 32;

	if (!test_status_bit ||
	    ((start & 0x8000000000000000UL) && (end & 0x8000000000000000UL))) {
		return end - start;
	}
	return 0;
}
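/*
 * Example: for an occlusion slot, r600_query_read_result(map, 0, 2, true)
 * reads dwords 0..1 as the begin count and dwords 2..3 as the end count,
 * and only returns end - start once bit 63 of both values is set, i.e.
 * once the GPU has actually written both halves of the pair.
 */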
static void r600_query_hw_add_result(struct r600_common_context *ctx,
				     struct r600_query_hw *query,
				     void *buffer,
				     union pipe_query_result *result)
{
	switch (query->b.type) {
	case PIPE_QUERY_OCCLUSION_COUNTER: {
		for (unsigned i = 0; i < ctx->max_db; ++i) {
			unsigned results_base = i * 16;
			result->u64 +=
				r600_query_read_result(buffer + results_base, 0, 2, true);
		}
		break;
	}
	case PIPE_QUERY_OCCLUSION_PREDICATE: {
		for (unsigned i = 0; i < ctx->max_db; ++i) {
			unsigned results_base = i * 16;
			result->b = result->b ||
				r600_query_read_result(buffer + results_base, 0, 2, true) != 0;
		}
		break;
	}
	case PIPE_QUERY_TIME_ELAPSED:
		result->u64 += r600_query_read_result(buffer, 0, 2, false);
		break;
	case PIPE_QUERY_TIMESTAMP:
		result->u64 = *(uint64_t*)buffer;
		break;
	case PIPE_QUERY_PRIMITIVES_EMITTED:
		/* SAMPLE_STREAMOUTSTATS stores this structure:
		 * {
		 *    u64 NumPrimitivesWritten;
		 *    u64 PrimitiveStorageNeeded;
		 * }
		 * We only need NumPrimitivesWritten here. */
		result->u64 += r600_query_read_result(buffer, 2, 6, true);
		break;
	case PIPE_QUERY_PRIMITIVES_GENERATED:
		/* Here we read PrimitiveStorageNeeded. */
		result->u64 += r600_query_read_result(buffer, 0, 4, true);
		break;
	case PIPE_QUERY_SO_STATISTICS:
		result->so_statistics.num_primitives_written +=
			r600_query_read_result(buffer, 2, 6, true);
		result->so_statistics.primitives_storage_needed +=
			r600_query_read_result(buffer, 0, 4, true);
		break;
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		result->b = result->b ||
			r600_query_read_result(buffer, 2, 6, true) !=
			r600_query_read_result(buffer, 0, 4, true);
		break;
	case PIPE_QUERY_PIPELINE_STATISTICS:
		if (ctx->chip_class >= EVERGREEN) {
			result->pipeline_statistics.ps_invocations +=
				r600_query_read_result(buffer, 0, 22, false);
			result->pipeline_statistics.c_primitives +=
				r600_query_read_result(buffer, 2, 24, false);
			result->pipeline_statistics.c_invocations +=
				r600_query_read_result(buffer, 4, 26, false);
			result->pipeline_statistics.vs_invocations +=
				r600_query_read_result(buffer, 6, 28, false);
			result->pipeline_statistics.gs_invocations +=
				r600_query_read_result(buffer, 8, 30, false);
			result->pipeline_statistics.gs_primitives +=
				r600_query_read_result(buffer, 10, 32, false);
			result->pipeline_statistics.ia_primitives +=
				r600_query_read_result(buffer, 12, 34, false);
			result->pipeline_statistics.ia_vertices +=
				r600_query_read_result(buffer, 14, 36, false);
			result->pipeline_statistics.hs_invocations +=
				r600_query_read_result(buffer, 16, 38, false);
			result->pipeline_statistics.ds_invocations +=
				r600_query_read_result(buffer, 18, 40, false);
			result->pipeline_statistics.cs_invocations +=
				r600_query_read_result(buffer, 20, 42, false);
		} else {
			result->pipeline_statistics.ps_invocations +=
				r600_query_read_result(buffer, 0, 16, false);
			result->pipeline_statistics.c_primitives +=
				r600_query_read_result(buffer, 2, 18, false);
			result->pipeline_statistics.c_invocations +=
				r600_query_read_result(buffer, 4, 20, false);
			result->pipeline_statistics.vs_invocations +=
				r600_query_read_result(buffer, 6, 22, false);
			result->pipeline_statistics.gs_invocations +=
				r600_query_read_result(buffer, 8, 24, false);
			result->pipeline_statistics.gs_primitives +=
				r600_query_read_result(buffer, 10, 26, false);
			result->pipeline_statistics.ia_primitives +=
				r600_query_read_result(buffer, 12, 28, false);
			result->pipeline_statistics.ia_vertices +=
				r600_query_read_result(buffer, 14, 30, false);
		}
#if 0 /* for testing */
		printf("Pipeline stats: IA verts=%llu, IA prims=%llu, VS=%llu, HS=%llu, "
		       "DS=%llu, GS=%llu, GS prims=%llu, Clipper=%llu, "
		       "Clipper prims=%llu, PS=%llu, CS=%llu\n",
		       result->pipeline_statistics.ia_vertices,
		       result->pipeline_statistics.ia_primitives,
		       result->pipeline_statistics.vs_invocations,
		       result->pipeline_statistics.hs_invocations,
		       result->pipeline_statistics.ds_invocations,
		       result->pipeline_statistics.gs_invocations,
		       result->pipeline_statistics.gs_primitives,
		       result->pipeline_statistics.c_invocations,
		       result->pipeline_statistics.c_primitives,
		       result->pipeline_statistics.ps_invocations,
		       result->pipeline_statistics.cs_invocations);
#endif
		break;
	default:
		assert(0);
	}
}
static boolean r600_get_query_result(struct pipe_context *ctx,
				     struct pipe_query *query, boolean wait,
				     union pipe_query_result *result)
{
	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
	struct r600_query *rquery = (struct r600_query *)query;

	return rquery->ops->get_result(rctx, rquery, wait, result);
}
static void r600_get_query_result_resource(struct pipe_context *ctx,
					   struct pipe_query *query,
					   boolean wait,
					   enum pipe_query_value_type result_type,
					   int index,
					   struct pipe_resource *resource,
					   unsigned offset)
{
	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
	struct r600_query *rquery = (struct r600_query *)query;

	rquery->ops->get_result_resource(rctx, rquery, wait, result_type, index,
					 resource, offset);
}
*query
,
1087 union pipe_query_result
*result
)
1089 util_query_clear_result(result
, query
->b
.type
);
bool r600_query_hw_get_result(struct r600_common_context *rctx,
			      struct r600_query *rquery,
			      bool wait, union pipe_query_result *result)
{
	struct r600_query_hw *query = (struct r600_query_hw *)rquery;
	struct r600_query_buffer *qbuf;

	query->ops->clear_result(query, result);

	for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
		unsigned results_base = 0;
		void *map;

		map = r600_buffer_map_sync_with_rings(rctx, qbuf->buf,
						      PIPE_TRANSFER_READ |
						      (wait ? 0 : PIPE_TRANSFER_DONTBLOCK));
		if (!map)
			return false;

		while (results_base != qbuf->results_end) {
			query->ops->add_result(rctx, query, map + results_base,
					       result);
			results_base += query->result_size;
		}
	}

	/* Convert the time to expected units. */
	if (rquery->type == PIPE_QUERY_TIME_ELAPSED ||
	    rquery->type == PIPE_QUERY_TIMESTAMP) {
		result->u64 = (1000000 * result->u64) / rctx->screen->info.clock_crystal_freq;
	}

	return true;
}
/* Create the compute shader that is used to collect the results.
 *
 * One compute grid with a single thread is launched for every query result
 * buffer. The thread (optionally) reads a previous summary buffer, then
 * accumulates data from the query result buffer, and writes the result either
 * to a summary buffer to be consumed by the next grid invocation or to the
 * user-supplied buffer.
 *
 * Data layout:
 *
 * CONST
 *  0.x = end_offset
 *  0.y = result_stride
 *  0.z = result_count
 *  0.w = bit field:
 *          1: read previously accumulated values
 *          2: write accumulated values for chaining
 *          4: write result available
 *          8: convert result to boolean (0/1)
 *         16: only read one dword and use that as result
 *         32: apply timestamp conversion
 *         64: store full 64 bits result
 *        128: store signed 32 bits result
 *  1.x = fence_offset
 *  1.y = pair_stride
 *  1.z = pair_count
 *
 * BUFFER[0] = query result buffer
 * BUFFER[1] = previous summary buffer
 * BUFFER[2] = next summary buffer or user-supplied buffer
 */
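/*
 * Examples of the config word as composed by
 * r600_query_hw_get_result_resource() below (illustrative): a single-buffer
 * 64-bit counter with "result available" requested uses 4|64; when buffers
 * are chained, the first buffer processed (the newest) sets 2 to write the
 * summary for chaining, intermediate buffers set 1|2, and the last one sets
 * only 1 and writes the final result to the user buffer.
 */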
static void r600_create_query_result_shader(struct r600_common_context *rctx)
{
	/* TEMP[0].xy = accumulated result so far
	 * TEMP[0].z = result not available
	 *
	 * TEMP[1].x = current result index
	 * TEMP[1].y = current pair index
	 */
	static const char text_tmpl[] =
		"COMP\n"
		"PROPERTY CS_FIXED_BLOCK_WIDTH 1\n"
		"PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"
		"PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
		"DCL BUFFER[0]\n"
		"DCL BUFFER[1]\n"
		"DCL BUFFER[2]\n"
		"DCL CONST[0..1]\n"
		"DCL TEMP[0..5]\n"
		"IMM[0] UINT32 {0, 31, 2147483647, 4294967295}\n"
		"IMM[1] UINT32 {1, 2, 4, 8}\n"
		"IMM[2] UINT32 {16, 32, 64, 128}\n"
		"IMM[3] UINT32 {1000000, 0, %u, 0}\n" /* for timestamp conversion */

		"AND TEMP[5], CONST[0].wwww, IMM[2].xxxx\n"

		/* Check result availability. */
		"LOAD TEMP[1].x, BUFFER[0], CONST[1].xxxx\n"
		"ISHR TEMP[0].z, TEMP[1].xxxx, IMM[0].yyyy\n"
		"MOV TEMP[1], TEMP[0].zzzz\n"
		"NOT TEMP[0].z, TEMP[0].zzzz\n"

		/* Load result if available. */
		"LOAD TEMP[0].xy, BUFFER[0], IMM[0].xxxx\n"

		/* Load previously accumulated result if requested. */
		"MOV TEMP[0], IMM[0].xxxx\n"
		"AND TEMP[4], CONST[0].wwww, IMM[1].xxxx\n"
		"LOAD TEMP[0].xyz, BUFFER[1], IMM[0].xxxx\n"

		"MOV TEMP[1].x, IMM[0].xxxx\n"

		/* Break if accumulated result so far is not available. */
		"UIF TEMP[0].zzzz\n"

		/* Break if result_index >= result_count. */
		"USGE TEMP[5], TEMP[1].xxxx, CONST[0].zzzz\n"

		/* Load fence and check result availability */
		"UMAD TEMP[5].x, TEMP[1].xxxx, CONST[0].yyyy, CONST[1].xxxx\n"
		"LOAD TEMP[5].x, BUFFER[0], TEMP[5].xxxx\n"
		"ISHR TEMP[0].z, TEMP[5].xxxx, IMM[0].yyyy\n"
		"NOT TEMP[0].z, TEMP[0].zzzz\n"
		"UIF TEMP[0].zzzz\n"

		"MOV TEMP[1].y, IMM[0].xxxx\n"

		/* Load start and end. */
		"UMUL TEMP[5].x, TEMP[1].xxxx, CONST[0].yyyy\n"
		"UMAD TEMP[5].x, TEMP[1].yyyy, CONST[1].yyyy, TEMP[5].xxxx\n"
		"LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n"

		"UADD TEMP[5].x, TEMP[5].xxxx, CONST[0].xxxx\n"
		"LOAD TEMP[3].xy, BUFFER[0], TEMP[5].xxxx\n"

		"U64ADD TEMP[3].xy, TEMP[3], -TEMP[2]\n"
		"U64ADD TEMP[0].xy, TEMP[0], TEMP[3]\n"

		/* Increment pair index */
		"UADD TEMP[1].y, TEMP[1].yyyy, IMM[1].xxxx\n"
		"USGE TEMP[5], TEMP[1].yyyy, CONST[1].zzzz\n"

		/* Increment result index */
		"UADD TEMP[1].x, TEMP[1].xxxx, IMM[1].xxxx\n"

		"AND TEMP[4], CONST[0].wwww, IMM[1].yyyy\n"

		/* Store accumulated data for chaining. */
		"STORE BUFFER[2].xyz, IMM[0].xxxx, TEMP[0]\n"

		"AND TEMP[4], CONST[0].wwww, IMM[1].zzzz\n"

		/* Store result availability. */
		"NOT TEMP[0].z, TEMP[0]\n"
		"AND TEMP[0].z, TEMP[0].zzzz, IMM[1].xxxx\n"
		"STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].zzzz\n"

		"AND TEMP[4], CONST[0].wwww, IMM[2].zzzz\n"
		"STORE BUFFER[2].y, IMM[0].xxxx, IMM[0].xxxx\n"

		/* Store result if it is available. */
		"NOT TEMP[4], TEMP[0].zzzz\n"

		/* Apply timestamp conversion */
		"AND TEMP[4], CONST[0].wwww, IMM[2].yyyy\n"
		"U64MUL TEMP[0].xy, TEMP[0], IMM[3].xyxy\n"
		"U64DIV TEMP[0].xy, TEMP[0], IMM[3].zwzw\n"

		/* Convert to boolean */
		"AND TEMP[4], CONST[0].wwww, IMM[1].wwww\n"
		"U64SNE TEMP[0].x, TEMP[0].xyxy, IMM[0].xxxx\n"
		"AND TEMP[0].x, TEMP[0].xxxx, IMM[1].xxxx\n"
		"MOV TEMP[0].y, IMM[0].xxxx\n"

		"AND TEMP[4], CONST[0].wwww, IMM[2].zzzz\n"
		"STORE BUFFER[2].xy, IMM[0].xxxx, TEMP[0].xyxy\n"

		"UIF TEMP[0].yyyy\n"
		"MOV TEMP[0].x, IMM[0].wwww\n"

		"AND TEMP[4], CONST[0].wwww, IMM[2].wwww\n"
		"UMIN TEMP[0].x, TEMP[0].xxxx, IMM[0].zzzz\n"

		"STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].xxxx\n"
		"END\n";

	char text[sizeof(text_tmpl) + 32];
	struct tgsi_token tokens[1024];
	struct pipe_compute_state state = {};

	/* Hard code the frequency into the shader so that the backend can
	 * use the full range of optimizations for divide-by-constant.
	 */
	snprintf(text, sizeof(text), text_tmpl,
		 rctx->screen->info.clock_crystal_freq);

	if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
		assert(false);
		return;
	}

	state.ir_type = PIPE_SHADER_IR_TGSI;
	state.prog = tokens;

	rctx->query_result_shader = rctx->b.create_compute_state(&rctx->b, &state);
}
static void r600_restore_qbo_state(struct r600_common_context *rctx,
				   struct r600_qbo_state *st)
{
	rctx->b.bind_compute_state(&rctx->b, st->saved_compute);

	rctx->b.set_constant_buffer(&rctx->b, PIPE_SHADER_COMPUTE, 0, &st->saved_const0);
	pipe_resource_reference(&st->saved_const0.buffer, NULL);

	rctx->b.set_shader_buffers(&rctx->b, PIPE_SHADER_COMPUTE, 0, 3, st->saved_ssbo);
	for (unsigned i = 0; i < 3; ++i)
		pipe_resource_reference(&st->saved_ssbo[i].buffer, NULL);
}
static void r600_query_hw_get_result_resource(struct r600_common_context *rctx,
					      struct r600_query *rquery,
					      bool wait,
					      enum pipe_query_value_type result_type,
					      int index,
					      struct pipe_resource *resource,
					      unsigned offset)
{
	struct r600_query_hw *query = (struct r600_query_hw *)rquery;
	struct r600_query_buffer *qbuf;
	struct r600_query_buffer *qbuf_prev;
	struct pipe_resource *tmp_buffer = NULL;
	unsigned tmp_buffer_offset = 0;
	struct r600_qbo_state saved_state = {};
	struct pipe_grid_info grid = {};
	struct pipe_constant_buffer constant_buffer = {};
	struct pipe_shader_buffer ssbo[3];
	struct r600_hw_query_params params;
	struct {
		uint32_t end_offset;
		uint32_t result_stride;
		uint32_t result_count;
		uint32_t config;
		uint32_t fence_offset;
		uint32_t pair_stride;
		uint32_t pair_count;
	} consts;

	if (!rctx->query_result_shader) {
		r600_create_query_result_shader(rctx);
		if (!rctx->query_result_shader)
			return;
	}

	if (query->buffer.previous) {
		u_suballocator_alloc(rctx->allocator_zeroed_memory, 16, 16,
				     &tmp_buffer_offset, &tmp_buffer);
		if (!tmp_buffer)
			return;
	}

	rctx->save_qbo_state(&rctx->b, &saved_state);

	r600_get_hw_query_params(rctx, query, index >= 0 ? index : 0, &params);
	consts.end_offset = params.end_offset - params.start_offset;
	consts.fence_offset = params.fence_offset - params.start_offset;
	consts.result_stride = query->result_size;
	consts.pair_stride = params.pair_stride;
	consts.pair_count = params.pair_count;

	constant_buffer.buffer_size = sizeof(consts);
	constant_buffer.user_buffer = &consts;

	ssbo[1].buffer = tmp_buffer;
	ssbo[1].buffer_offset = tmp_buffer_offset;
	ssbo[1].buffer_size = 16;

	ssbo[2] = ssbo[1];

	rctx->b.bind_compute_state(&rctx->b, rctx->query_result_shader);

	grid.block[0] = 1;
	grid.block[1] = 1;
	grid.block[2] = 1;
	grid.grid[0] = 1;
	grid.grid[1] = 1;
	grid.grid[2] = 1;

	consts.config = 0;
	if (index < 0)
		consts.config |= 4;
	if (query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE ||
	    query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE)
		consts.config |= 8;
	else if (query->b.type == PIPE_QUERY_TIMESTAMP ||
		 query->b.type == PIPE_QUERY_TIME_ELAPSED)
		consts.config |= 32;

	switch (result_type) {
	case PIPE_QUERY_TYPE_U64:
	case PIPE_QUERY_TYPE_I64:
		consts.config |= 64;
		break;
	case PIPE_QUERY_TYPE_I32:
		consts.config |= 128;
		break;
	case PIPE_QUERY_TYPE_U32:
		break;
	}

	rctx->flags |= rctx->screen->barrier_flags.cp_to_L2;

	for (qbuf = &query->buffer; qbuf; qbuf = qbuf_prev) {
		if (query->b.type != PIPE_QUERY_TIMESTAMP) {
			qbuf_prev = qbuf->previous;
			consts.result_count = qbuf->results_end / query->result_size;
			consts.config &= ~3;
			if (qbuf != &query->buffer)
				consts.config |= 1;
			if (qbuf->previous)
				consts.config |= 2;
		} else {
			/* Only read the last timestamp. */
			qbuf_prev = NULL;
			consts.result_count = 0;
			consts.config |= 16;
			params.start_offset += qbuf->results_end - query->result_size;
		}

		rctx->b.set_constant_buffer(&rctx->b, PIPE_SHADER_COMPUTE, 0, &constant_buffer);

		ssbo[0].buffer = &qbuf->buf->b.b;
		ssbo[0].buffer_offset = params.start_offset;
		ssbo[0].buffer_size = qbuf->results_end - params.start_offset;

		if (!qbuf->previous) {
			ssbo[2].buffer = resource;
			ssbo[2].buffer_offset = offset;
			ssbo[2].buffer_size = 8;

			((struct r600_resource *)resource)->TC_L2_dirty = true;
		}

		rctx->b.set_shader_buffers(&rctx->b, PIPE_SHADER_COMPUTE, 0, 3, ssbo);

		if (wait && qbuf == &query->buffer) {
			uint64_t va;

			/* Wait for result availability. Wait only for readiness
			 * of the last entry, since the fence writes should be
			 * serialized in the CP.
			 */
			va = qbuf->buf->gpu_address + qbuf->results_end - query->result_size;
			va += params.fence_offset;

			r600_gfx_wait_fence(rctx, va, 0x80000000, 0x80000000);
		}

		rctx->b.launch_grid(&rctx->b, &grid);
		rctx->flags |= rctx->screen->barrier_flags.compute_to_L2;
	}

	r600_restore_qbo_state(rctx, &saved_state);
	pipe_resource_reference(&tmp_buffer, NULL);
}
static void r600_render_condition(struct pipe_context *ctx,
				  struct pipe_query *query,
				  boolean condition,
				  uint mode)
{
	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
	struct r600_query_hw *rquery = (struct r600_query_hw *)query;
	struct r600_query_buffer *qbuf;
	struct r600_atom *atom = &rctx->render_cond_atom;

	rctx->render_cond = query;
	rctx->render_cond_invert = condition;
	rctx->render_cond_mode = mode;

	/* Compute the size of SET_PREDICATION packets. */
	atom->num_dw = 0;
	if (query) {
		for (qbuf = &rquery->buffer; qbuf; qbuf = qbuf->previous)
			atom->num_dw += (qbuf->results_end / rquery->result_size) * 5;
	}

	rctx->set_atom_dirty(rctx, atom, query != NULL);
}
void r600_suspend_queries(struct r600_common_context *ctx)
{
	struct r600_query_hw *query;

	LIST_FOR_EACH_ENTRY(query, &ctx->active_queries, list) {
		r600_query_hw_emit_stop(ctx, query);
	}
	assert(ctx->num_cs_dw_queries_suspend == 0);
}
static unsigned r600_queries_num_cs_dw_for_resuming(struct r600_common_context *ctx,
						    struct list_head *query_list)
{
	struct r600_query_hw *query;
	unsigned num_dw = 0;

	LIST_FOR_EACH_ENTRY(query, query_list, list) {
		/* begin + end */
		num_dw += query->num_cs_dw_begin + query->num_cs_dw_end;

		/* Workaround for the fact that
		 * num_cs_dw_nontimer_queries_suspend is incremented for every
		 * resumed query, which raises the bar in need_cs_space for
		 * queries about to be resumed.
		 */
		num_dw += query->num_cs_dw_end;
	}
	/* primitives generated query */
	num_dw += ctx->streamout.enable_atom.num_dw;
	/* guess for ZPASS enable or PERFECT_ZPASS_COUNT enable updates */
	num_dw += 13;

	return num_dw;
}
void r600_resume_queries(struct r600_common_context *ctx)
{
	struct r600_query_hw *query;
	unsigned num_cs_dw = r600_queries_num_cs_dw_for_resuming(ctx, &ctx->active_queries);

	assert(ctx->num_cs_dw_queries_suspend == 0);

	/* Check CS space here. Resuming must not be interrupted by flushes. */
	ctx->need_gfx_cs_space(&ctx->b, num_cs_dw, true);

	LIST_FOR_EACH_ENTRY(query, &ctx->active_queries, list) {
		r600_query_hw_emit_start(ctx, query);
	}
}
/* Get backends mask */
void r600_query_init_backend_mask(struct r600_common_context *ctx)
{
	struct radeon_winsys_cs *cs = ctx->gfx.cs;
	struct r600_resource *buffer;
	uint32_t *results;
	unsigned num_backends = ctx->screen->info.num_render_backends;
	unsigned i, mask = 0;

	/* if backend_map query is supported by the kernel */
	if (ctx->screen->info.r600_gb_backend_map_valid) {
		unsigned num_tile_pipes = ctx->screen->info.num_tile_pipes;
		unsigned backend_map = ctx->screen->info.r600_gb_backend_map;
		unsigned item_width, item_mask;

		if (ctx->chip_class >= EVERGREEN) {
			item_width = 4;
			item_mask = 0x7;
		} else {
			item_width = 2;
			item_mask = 0x3;
		}

		while (num_tile_pipes--) {
			i = backend_map & item_mask;
			mask |= (1 << i);
			backend_map >>= item_width;
		}
		if (mask != 0) {
			ctx->backend_mask = mask;
			return;
		}
	}

	/* otherwise backup path for older kernels */

	/* create buffer for event data */
	buffer = (struct r600_resource*)
		pipe_buffer_create(ctx->b.screen, 0,
				   PIPE_USAGE_STAGING, ctx->max_db*16);
	if (!buffer)
		goto err;

	/* initialize buffer with zeroes */
	results = r600_buffer_map_sync_with_rings(ctx, buffer, PIPE_TRANSFER_WRITE);
	if (results) {
		memset(results, 0, ctx->max_db * 4 * 4);

		/* emit EVENT_WRITE for ZPASS_DONE */
		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1));
		radeon_emit(cs, buffer->gpu_address);
		radeon_emit(cs, buffer->gpu_address >> 32);

		r600_emit_reloc(ctx, &ctx->gfx, buffer,
				RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);

		/* analyze results */
		results = r600_buffer_map_sync_with_rings(ctx, buffer, PIPE_TRANSFER_READ);
		if (results) {
			for (i = 0; i < ctx->max_db; i++) {
				/* at least highest bit will be set if backend is used */
				if (results[i*4 + 1])
					mask |= (1<<i);
			}
		}
	}

	r600_resource_reference(&buffer, NULL);

	if (mask != 0) {
		ctx->backend_mask = mask;
		return;
	}

err:
	/* fallback to old method - set num_backends lower bits to 1 */
	ctx->backend_mask = (~((uint32_t)0))>>(32-num_backends);
	return;
}
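/*
 * Worked example of the backend_map decode above: on Evergreen each tile
 * pipe occupies a 4-bit field (mask 0x7). With num_tile_pipes = 2 and
 * backend_map = 0x10, iteration 1 selects backend 0x10 & 0x7 = 0 and
 * iteration 2 selects (0x10 >> 4) & 0x7 = 1, so backend_mask ends up 0x3.
 */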
#define XFULL(name_, query_type_, type_, result_type_, group_id_) \
	{ \
		.name = name_, \
		.query_type = R600_QUERY_##query_type_, \
		.type = PIPE_DRIVER_QUERY_TYPE_##type_, \
		.result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_##result_type_, \
		.group_id = group_id_ \
	}

#define X(name_, query_type_, type_, result_type_) \
	XFULL(name_, query_type_, type_, result_type_, ~(unsigned)0)

#define XG(group_, name_, query_type_, type_, result_type_) \
	XFULL(name_, query_type_, type_, result_type_, R600_QUERY_GROUP_##group_)
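/* For reference, X("draw-calls", DRAW_CALLS, UINT64, AVERAGE) expands to:
 *
 *    { .name = "draw-calls",
 *      .query_type = R600_QUERY_DRAW_CALLS,
 *      .type = PIPE_DRIVER_QUERY_TYPE_UINT64,
 *      .result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE,
 *      .group_id = ~(unsigned)0 }
 */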
static struct pipe_driver_query_info r600_driver_query_list[] = {
	X("num-compilations", NUM_COMPILATIONS, UINT64, CUMULATIVE),
	X("num-shaders-created", NUM_SHADERS_CREATED, UINT64, CUMULATIVE),
	X("num-shader-cache-hits", NUM_SHADER_CACHE_HITS, UINT64, CUMULATIVE),
	X("draw-calls", DRAW_CALLS, UINT64, AVERAGE),
	X("spill-draw-calls", SPILL_DRAW_CALLS, UINT64, AVERAGE),
	X("compute-calls", COMPUTE_CALLS, UINT64, AVERAGE),
	X("spill-compute-calls", SPILL_COMPUTE_CALLS, UINT64, AVERAGE),
	X("dma-calls", DMA_CALLS, UINT64, AVERAGE),
	X("cp-dma-calls", CP_DMA_CALLS, UINT64, AVERAGE),
	X("num-vs-flushes", NUM_VS_FLUSHES, UINT64, AVERAGE),
	X("num-ps-flushes", NUM_PS_FLUSHES, UINT64, AVERAGE),
	X("num-cs-flushes", NUM_CS_FLUSHES, UINT64, AVERAGE),
	X("requested-VRAM", REQUESTED_VRAM, BYTES, AVERAGE),
	X("requested-GTT", REQUESTED_GTT, BYTES, AVERAGE),
	X("mapped-VRAM", MAPPED_VRAM, BYTES, AVERAGE),
	X("mapped-GTT", MAPPED_GTT, BYTES, AVERAGE),
	X("buffer-wait-time", BUFFER_WAIT_TIME, MICROSECONDS, CUMULATIVE),
	X("num-ctx-flushes", NUM_CTX_FLUSHES, UINT64, AVERAGE),
	X("num-bytes-moved", NUM_BYTES_MOVED, BYTES, CUMULATIVE),
	X("num-evictions", NUM_EVICTIONS, UINT64, CUMULATIVE),
	X("VRAM-usage", VRAM_USAGE, BYTES, AVERAGE),
	X("GTT-usage", GTT_USAGE, BYTES, AVERAGE),
	X("back-buffer-ps-draw-ratio", BACK_BUFFER_PS_DRAW_RATIO, UINT64, AVERAGE),

	/* GPIN queries are for the benefit of old versions of GPUPerfStudio,
	 * which use it as a fallback path to detect the GPU type.
	 *
	 * Note: The names of these queries are significant for GPUPerfStudio
	 * (and possibly their order as well). */
	XG(GPIN, "GPIN_000", GPIN_ASIC_ID, UINT, AVERAGE),
	XG(GPIN, "GPIN_001", GPIN_NUM_SIMD, UINT, AVERAGE),
	XG(GPIN, "GPIN_002", GPIN_NUM_RB, UINT, AVERAGE),
	XG(GPIN, "GPIN_003", GPIN_NUM_SPI, UINT, AVERAGE),
	XG(GPIN, "GPIN_004", GPIN_NUM_SE, UINT, AVERAGE),

	/* The following queries must be at the end of the list because their
	 * availability is adjusted dynamically based on the DRM version. */
	X("GPU-load", GPU_LOAD, UINT64, AVERAGE),
	X("temperature", GPU_TEMPERATURE, UINT64, AVERAGE),
	X("shader-clock", CURRENT_GPU_SCLK, HZ, AVERAGE),
	X("memory-clock", CURRENT_GPU_MCLK, HZ, AVERAGE),
};

#undef X
#undef XG
#undef XFULL
static unsigned r600_get_num_queries(struct r600_common_screen *rscreen)
{
	if (rscreen->info.drm_major == 2 && rscreen->info.drm_minor >= 42)
		return ARRAY_SIZE(r600_driver_query_list);
	else if (rscreen->info.drm_major == 3)
		return ARRAY_SIZE(r600_driver_query_list) - 3;
	else
		return ARRAY_SIZE(r600_driver_query_list) - 4;
}
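/*
 * The counts above trim the tail of r600_driver_query_list: radeon (DRM 2.x)
 * kernels older than 2.42 expose none of the last four entries, amdgpu
 * (DRM 3.x) drops the last three (temperature and clocks), and radeon >= 2.42
 * exposes everything including GPU-load.
 */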
static int r600_get_driver_query_info(struct pipe_screen *screen,
				      unsigned index,
				      struct pipe_driver_query_info *info)
{
	struct r600_common_screen *rscreen = (struct r600_common_screen *)screen;
	unsigned num_queries = r600_get_num_queries(rscreen);

	if (!info) {
		unsigned num_perfcounters =
			r600_get_perfcounter_info(rscreen, 0, NULL);

		return num_queries + num_perfcounters;
	}

	if (index >= num_queries)
		return r600_get_perfcounter_info(rscreen, index - num_queries, info);

	*info = r600_driver_query_list[index];

	switch (info->query_type) {
	case R600_QUERY_REQUESTED_VRAM:
	case R600_QUERY_VRAM_USAGE:
	case R600_QUERY_MAPPED_VRAM:
		info->max_value.u64 = rscreen->info.vram_size;
		break;
	case R600_QUERY_REQUESTED_GTT:
	case R600_QUERY_GTT_USAGE:
	case R600_QUERY_MAPPED_GTT:
		info->max_value.u64 = rscreen->info.gart_size;
		break;
	case R600_QUERY_GPU_TEMPERATURE:
		info->max_value.u64 = 125;
		break;
	}

	if (info->group_id != ~(unsigned)0 && rscreen->perfcounters)
		info->group_id += rscreen->perfcounters->num_groups;

	return 1;
}
/* Note: Unfortunately, GPUPerfStudio hardcodes the order of hardware
 * performance counter groups, so be careful when changing this and related
 * functions.
 */
static int r600_get_driver_query_group_info(struct pipe_screen *screen,
					    unsigned index,
					    struct pipe_driver_query_group_info *info)
{
	struct r600_common_screen *rscreen = (struct r600_common_screen *)screen;
	unsigned num_pc_groups = 0;

	if (rscreen->perfcounters)
		num_pc_groups = rscreen->perfcounters->num_groups;

	if (!info)
		return num_pc_groups + R600_NUM_SW_QUERY_GROUPS;

	if (index < num_pc_groups)
		return r600_get_perfcounter_group_info(rscreen, index, info);

	index -= num_pc_groups;
	if (index >= R600_NUM_SW_QUERY_GROUPS)
		return 0;

	info->name = "GPIN";
	info->max_active_queries = 5;
	info->num_queries = 5;
	return 1;
}
void r600_query_init(struct r600_common_context *rctx)
{
	rctx->b.create_query = r600_create_query;
	rctx->b.create_batch_query = r600_create_batch_query;
	rctx->b.destroy_query = r600_destroy_query;
	rctx->b.begin_query = r600_begin_query;
	rctx->b.end_query = r600_end_query;
	rctx->b.get_query_result = r600_get_query_result;
	rctx->b.get_query_result_resource = r600_get_query_result_resource;
	rctx->render_cond_atom.emit = r600_emit_query_predication;

	if (((struct r600_common_screen*)rctx->b.screen)->info.num_render_backends > 0)
		rctx->b.render_condition = r600_render_condition;

	LIST_INITHEAD(&rctx->active_queries);
}
void r600_init_screen_query_functions(struct r600_common_screen *rscreen)
{
	rscreen->b.get_driver_query_info = r600_get_driver_query_info;
	rscreen->b.get_driver_query_group_info = r600_get_driver_query_group_info;
}