radeonsi: add separate HUD counters for CB and DB cache flushes
[mesa.git] / src / gallium / drivers / radeon / r600_query.c
1 /*
2 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3 * Copyright 2014 Marek Olšák <marek.olsak@amd.com>
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
23 */
24
25 #include "r600_query.h"
26 #include "r600_cs.h"
27 #include "util/u_memory.h"
28 #include "util/u_upload_mgr.h"
29 #include "os/os_time.h"
30 #include "tgsi/tgsi_text.h"
31
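/* Byte layout of one hardware query result slice, as consumed by the
 * result-resource compute path (filled in by r600_get_hw_query_params):
 * start_offset/end_offset locate the begin/end counters, fence_offset the
 * availability fence, and pair_stride/pair_count describe the per-RB
 * begin/end pairs used by occlusion queries.
 */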
32 struct r600_hw_query_params {
33 unsigned start_offset;
34 unsigned end_offset;
35 unsigned fence_offset;
36 unsigned pair_stride;
37 unsigned pair_count;
38 };
39
40 /* Queries without buffer handling or suspend/resume. */
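/* The reported value is simply end_result - begin_result, both captured on
 * the CPU (plus a unit conversion for a few query types in
 * r600_query_sw_get_result). */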
41 struct r600_query_sw {
42 struct r600_query b;
43
44 uint64_t begin_result;
45 uint64_t end_result;
46
47 uint64_t begin_time;
48 uint64_t end_time;
49
50 /* Fence for GPU_FINISHED. */
51 struct pipe_fence_handle *fence;
52 };
53
54 static void r600_query_sw_destroy(struct r600_common_screen *rscreen,
55 struct r600_query *rquery)
56 {
57 struct r600_query_sw *query = (struct r600_query_sw *)rquery;
58
59 rscreen->b.fence_reference(&rscreen->b, &query->fence, NULL);
60 FREE(query);
61 }
62
63 static enum radeon_value_id winsys_id_from_type(unsigned type)
64 {
65 switch (type) {
66 case R600_QUERY_REQUESTED_VRAM: return RADEON_REQUESTED_VRAM_MEMORY;
67 case R600_QUERY_REQUESTED_GTT: return RADEON_REQUESTED_GTT_MEMORY;
68 case R600_QUERY_MAPPED_VRAM: return RADEON_MAPPED_VRAM;
69 case R600_QUERY_MAPPED_GTT: return RADEON_MAPPED_GTT;
70 case R600_QUERY_BUFFER_WAIT_TIME: return RADEON_BUFFER_WAIT_TIME_NS;
71 case R600_QUERY_NUM_MAPPED_BUFFERS: return RADEON_NUM_MAPPED_BUFFERS;
72 case R600_QUERY_NUM_GFX_IBS: return RADEON_NUM_GFX_IBS;
73 case R600_QUERY_NUM_SDMA_IBS: return RADEON_NUM_SDMA_IBS;
74 case R600_QUERY_NUM_BYTES_MOVED: return RADEON_NUM_BYTES_MOVED;
75 case R600_QUERY_NUM_EVICTIONS: return RADEON_NUM_EVICTIONS;
76 case R600_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: return RADEON_NUM_VRAM_CPU_PAGE_FAULTS;
77 case R600_QUERY_VRAM_USAGE: return RADEON_VRAM_USAGE;
78 case R600_QUERY_VRAM_VIS_USAGE: return RADEON_VRAM_VIS_USAGE;
79 case R600_QUERY_GTT_USAGE: return RADEON_GTT_USAGE;
80 case R600_QUERY_GPU_TEMPERATURE: return RADEON_GPU_TEMPERATURE;
81 case R600_QUERY_CURRENT_GPU_SCLK: return RADEON_CURRENT_SCLK;
82 case R600_QUERY_CURRENT_GPU_MCLK: return RADEON_CURRENT_MCLK;
83 case R600_QUERY_CS_THREAD_BUSY: return RADEON_CS_THREAD_TIME;
84 default: unreachable("query type does not correspond to winsys id");
85 }
86 }
87
88 static bool r600_query_sw_begin(struct r600_common_context *rctx,
89 struct r600_query *rquery)
90 {
91 struct r600_query_sw *query = (struct r600_query_sw *)rquery;
92 enum radeon_value_id ws_id;
93
94 switch(query->b.type) {
95 case PIPE_QUERY_TIMESTAMP_DISJOINT:
96 case PIPE_QUERY_GPU_FINISHED:
97 break;
98 case R600_QUERY_DRAW_CALLS:
99 query->begin_result = rctx->num_draw_calls;
100 break;
101 case R600_QUERY_PRIM_RESTART_CALLS:
102 query->begin_result = rctx->num_prim_restart_calls;
103 break;
104 case R600_QUERY_SPILL_DRAW_CALLS:
105 query->begin_result = rctx->num_spill_draw_calls;
106 break;
107 case R600_QUERY_COMPUTE_CALLS:
108 query->begin_result = rctx->num_compute_calls;
109 break;
110 case R600_QUERY_SPILL_COMPUTE_CALLS:
111 query->begin_result = rctx->num_spill_compute_calls;
112 break;
113 case R600_QUERY_DMA_CALLS:
114 query->begin_result = rctx->num_dma_calls;
115 break;
116 case R600_QUERY_CP_DMA_CALLS:
117 query->begin_result = rctx->num_cp_dma_calls;
118 break;
119 case R600_QUERY_NUM_VS_FLUSHES:
120 query->begin_result = rctx->num_vs_flushes;
121 break;
122 case R600_QUERY_NUM_PS_FLUSHES:
123 query->begin_result = rctx->num_ps_flushes;
124 break;
125 case R600_QUERY_NUM_CS_FLUSHES:
126 query->begin_result = rctx->num_cs_flushes;
127 break;
128 case R600_QUERY_NUM_CB_CACHE_FLUSHES:
129 query->begin_result = rctx->num_cb_cache_flushes;
130 break;
131 case R600_QUERY_NUM_DB_CACHE_FLUSHES:
132 query->begin_result = rctx->num_db_cache_flushes;
133 break;
134 case R600_QUERY_NUM_L2_INVALIDATES:
135 query->begin_result = rctx->num_L2_invalidates;
136 break;
137 case R600_QUERY_NUM_L2_WRITEBACKS:
138 query->begin_result = rctx->num_L2_writebacks;
139 break;
140 case R600_QUERY_NUM_RESIDENT_HANDLES:
141 query->begin_result = rctx->num_resident_handles;
142 break;
143 case R600_QUERY_TC_OFFLOADED_SLOTS:
144 query->begin_result = rctx->tc ? rctx->tc->num_offloaded_slots : 0;
145 break;
146 case R600_QUERY_TC_DIRECT_SLOTS:
147 query->begin_result = rctx->tc ? rctx->tc->num_direct_slots : 0;
148 break;
149 case R600_QUERY_TC_NUM_SYNCS:
150 query->begin_result = rctx->tc ? rctx->tc->num_syncs : 0;
151 break;
152 case R600_QUERY_REQUESTED_VRAM:
153 case R600_QUERY_REQUESTED_GTT:
154 case R600_QUERY_MAPPED_VRAM:
155 case R600_QUERY_MAPPED_GTT:
156 case R600_QUERY_VRAM_USAGE:
157 case R600_QUERY_VRAM_VIS_USAGE:
158 case R600_QUERY_GTT_USAGE:
159 case R600_QUERY_GPU_TEMPERATURE:
160 case R600_QUERY_CURRENT_GPU_SCLK:
161 case R600_QUERY_CURRENT_GPU_MCLK:
162 case R600_QUERY_BACK_BUFFER_PS_DRAW_RATIO:
163 case R600_QUERY_NUM_MAPPED_BUFFERS:
164 query->begin_result = 0;
165 break;
166 case R600_QUERY_BUFFER_WAIT_TIME:
167 case R600_QUERY_NUM_GFX_IBS:
168 case R600_QUERY_NUM_SDMA_IBS:
169 case R600_QUERY_NUM_BYTES_MOVED:
170 case R600_QUERY_NUM_EVICTIONS:
171 case R600_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: {
172 enum radeon_value_id ws_id = winsys_id_from_type(query->b.type);
173 query->begin_result = rctx->ws->query_value(rctx->ws, ws_id);
174 break;
175 }
176 case R600_QUERY_CS_THREAD_BUSY:
177 ws_id = winsys_id_from_type(query->b.type);
178 query->begin_result = rctx->ws->query_value(rctx->ws, ws_id);
179 query->begin_time = os_time_get_nano();
180 break;
181 case R600_QUERY_GALLIUM_THREAD_BUSY:
182 query->begin_result =
183 rctx->tc ? util_queue_get_thread_time_nano(&rctx->tc->queue, 0) : 0;
184 query->begin_time = os_time_get_nano();
185 break;
186 case R600_QUERY_GPU_LOAD:
187 case R600_QUERY_GPU_SHADERS_BUSY:
188 case R600_QUERY_GPU_TA_BUSY:
189 case R600_QUERY_GPU_GDS_BUSY:
190 case R600_QUERY_GPU_VGT_BUSY:
191 case R600_QUERY_GPU_IA_BUSY:
192 case R600_QUERY_GPU_SX_BUSY:
193 case R600_QUERY_GPU_WD_BUSY:
194 case R600_QUERY_GPU_BCI_BUSY:
195 case R600_QUERY_GPU_SC_BUSY:
196 case R600_QUERY_GPU_PA_BUSY:
197 case R600_QUERY_GPU_DB_BUSY:
198 case R600_QUERY_GPU_CP_BUSY:
199 case R600_QUERY_GPU_CB_BUSY:
200 case R600_QUERY_GPU_SDMA_BUSY:
201 case R600_QUERY_GPU_PFP_BUSY:
202 case R600_QUERY_GPU_MEQ_BUSY:
203 case R600_QUERY_GPU_ME_BUSY:
204 case R600_QUERY_GPU_SURF_SYNC_BUSY:
205 case R600_QUERY_GPU_DMA_BUSY:
206 case R600_QUERY_GPU_SCRATCH_RAM_BUSY:
207 case R600_QUERY_GPU_CE_BUSY:
208 query->begin_result = r600_begin_counter(rctx->screen,
209 query->b.type);
210 break;
211 case R600_QUERY_NUM_COMPILATIONS:
212 query->begin_result = p_atomic_read(&rctx->screen->num_compilations);
213 break;
214 case R600_QUERY_NUM_SHADERS_CREATED:
215 query->begin_result = p_atomic_read(&rctx->screen->num_shaders_created);
216 break;
217 case R600_QUERY_NUM_SHADER_CACHE_HITS:
218 query->begin_result =
219 p_atomic_read(&rctx->screen->num_shader_cache_hits);
220 break;
221 case R600_QUERY_GPIN_ASIC_ID:
222 case R600_QUERY_GPIN_NUM_SIMD:
223 case R600_QUERY_GPIN_NUM_RB:
224 case R600_QUERY_GPIN_NUM_SPI:
225 case R600_QUERY_GPIN_NUM_SE:
226 break;
227 default:
228 unreachable("r600_query_sw_begin: bad query type");
229 }
230
231 return true;
232 }
233
234 static bool r600_query_sw_end(struct r600_common_context *rctx,
235 struct r600_query *rquery)
236 {
237 struct r600_query_sw *query = (struct r600_query_sw *)rquery;
238 enum radeon_value_id ws_id;
239
240 switch(query->b.type) {
241 case PIPE_QUERY_TIMESTAMP_DISJOINT:
242 break;
243 case PIPE_QUERY_GPU_FINISHED:
244 rctx->b.flush(&rctx->b, &query->fence, PIPE_FLUSH_DEFERRED);
245 break;
246 case R600_QUERY_DRAW_CALLS:
247 query->end_result = rctx->num_draw_calls;
248 break;
249 case R600_QUERY_PRIM_RESTART_CALLS:
250 query->end_result = rctx->num_prim_restart_calls;
251 break;
252 case R600_QUERY_SPILL_DRAW_CALLS:
253 query->end_result = rctx->num_spill_draw_calls;
254 break;
255 case R600_QUERY_COMPUTE_CALLS:
256 query->end_result = rctx->num_compute_calls;
257 break;
258 case R600_QUERY_SPILL_COMPUTE_CALLS:
259 query->end_result = rctx->num_spill_compute_calls;
260 break;
261 case R600_QUERY_DMA_CALLS:
262 query->end_result = rctx->num_dma_calls;
263 break;
264 case R600_QUERY_CP_DMA_CALLS:
265 query->end_result = rctx->num_cp_dma_calls;
266 break;
267 case R600_QUERY_NUM_VS_FLUSHES:
268 query->end_result = rctx->num_vs_flushes;
269 break;
270 case R600_QUERY_NUM_PS_FLUSHES:
271 query->end_result = rctx->num_ps_flushes;
272 break;
273 case R600_QUERY_NUM_CS_FLUSHES:
274 query->end_result = rctx->num_cs_flushes;
275 break;
276 case R600_QUERY_NUM_CB_CACHE_FLUSHES:
277 query->end_result = rctx->num_cb_cache_flushes;
278 break;
279 case R600_QUERY_NUM_DB_CACHE_FLUSHES:
280 query->end_result = rctx->num_db_cache_flushes;
281 break;
282 case R600_QUERY_NUM_L2_INVALIDATES:
283 query->end_result = rctx->num_L2_invalidates;
284 break;
285 case R600_QUERY_NUM_L2_WRITEBACKS:
286 query->end_result = rctx->num_L2_writebacks;
287 break;
288 case R600_QUERY_NUM_RESIDENT_HANDLES:
289 query->end_result = rctx->num_resident_handles;
290 break;
291 case R600_QUERY_TC_OFFLOADED_SLOTS:
292 query->end_result = rctx->tc ? rctx->tc->num_offloaded_slots : 0;
293 break;
294 case R600_QUERY_TC_DIRECT_SLOTS:
295 query->end_result = rctx->tc ? rctx->tc->num_direct_slots : 0;
296 break;
297 case R600_QUERY_TC_NUM_SYNCS:
298 query->end_result = rctx->tc ? rctx->tc->num_syncs : 0;
299 break;
300 case R600_QUERY_REQUESTED_VRAM:
301 case R600_QUERY_REQUESTED_GTT:
302 case R600_QUERY_MAPPED_VRAM:
303 case R600_QUERY_MAPPED_GTT:
304 case R600_QUERY_VRAM_USAGE:
305 case R600_QUERY_VRAM_VIS_USAGE:
306 case R600_QUERY_GTT_USAGE:
307 case R600_QUERY_GPU_TEMPERATURE:
308 case R600_QUERY_CURRENT_GPU_SCLK:
309 case R600_QUERY_CURRENT_GPU_MCLK:
310 case R600_QUERY_BUFFER_WAIT_TIME:
311 case R600_QUERY_NUM_MAPPED_BUFFERS:
312 case R600_QUERY_NUM_GFX_IBS:
313 case R600_QUERY_NUM_SDMA_IBS:
314 case R600_QUERY_NUM_BYTES_MOVED:
315 case R600_QUERY_NUM_EVICTIONS:
316 case R600_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: {
317 enum radeon_value_id ws_id = winsys_id_from_type(query->b.type);
318 query->end_result = rctx->ws->query_value(rctx->ws, ws_id);
319 break;
320 }
321 case R600_QUERY_CS_THREAD_BUSY:
322 ws_id = winsys_id_from_type(query->b.type);
323 query->end_result = rctx->ws->query_value(rctx->ws, ws_id);
324 query->end_time = os_time_get_nano();
325 break;
326 case R600_QUERY_GALLIUM_THREAD_BUSY:
327 query->end_result =
328 rctx->tc ? util_queue_get_thread_time_nano(&rctx->tc->queue, 0) : 0;
329 query->end_time = os_time_get_nano();
330 break;
331 case R600_QUERY_GPU_LOAD:
332 case R600_QUERY_GPU_SHADERS_BUSY:
333 case R600_QUERY_GPU_TA_BUSY:
334 case R600_QUERY_GPU_GDS_BUSY:
335 case R600_QUERY_GPU_VGT_BUSY:
336 case R600_QUERY_GPU_IA_BUSY:
337 case R600_QUERY_GPU_SX_BUSY:
338 case R600_QUERY_GPU_WD_BUSY:
339 case R600_QUERY_GPU_BCI_BUSY:
340 case R600_QUERY_GPU_SC_BUSY:
341 case R600_QUERY_GPU_PA_BUSY:
342 case R600_QUERY_GPU_DB_BUSY:
343 case R600_QUERY_GPU_CP_BUSY:
344 case R600_QUERY_GPU_CB_BUSY:
345 case R600_QUERY_GPU_SDMA_BUSY:
346 case R600_QUERY_GPU_PFP_BUSY:
347 case R600_QUERY_GPU_MEQ_BUSY:
348 case R600_QUERY_GPU_ME_BUSY:
349 case R600_QUERY_GPU_SURF_SYNC_BUSY:
350 case R600_QUERY_GPU_DMA_BUSY:
351 case R600_QUERY_GPU_SCRATCH_RAM_BUSY:
352 case R600_QUERY_GPU_CE_BUSY:
353 query->end_result = r600_end_counter(rctx->screen,
354 query->b.type,
355 query->begin_result);
356 query->begin_result = 0;
357 break;
358 case R600_QUERY_NUM_COMPILATIONS:
359 query->end_result = p_atomic_read(&rctx->screen->num_compilations);
360 break;
361 case R600_QUERY_NUM_SHADERS_CREATED:
362 query->end_result = p_atomic_read(&rctx->screen->num_shaders_created);
363 break;
364 case R600_QUERY_BACK_BUFFER_PS_DRAW_RATIO:
365 query->end_result = rctx->last_tex_ps_draw_ratio;
366 break;
367 case R600_QUERY_NUM_SHADER_CACHE_HITS:
368 query->end_result =
369 p_atomic_read(&rctx->screen->num_shader_cache_hits);
370 break;
371 case R600_QUERY_GPIN_ASIC_ID:
372 case R600_QUERY_GPIN_NUM_SIMD:
373 case R600_QUERY_GPIN_NUM_RB:
374 case R600_QUERY_GPIN_NUM_SPI:
375 case R600_QUERY_GPIN_NUM_SE:
376 break;
377 default:
378 unreachable("r600_query_sw_end: bad query type");
379 }
380
381 return true;
382 }
383
384 static bool r600_query_sw_get_result(struct r600_common_context *rctx,
385 struct r600_query *rquery,
386 bool wait,
387 union pipe_query_result *result)
388 {
389 struct r600_query_sw *query = (struct r600_query_sw *)rquery;
390
391 switch (query->b.type) {
392 case PIPE_QUERY_TIMESTAMP_DISJOINT:
393 /* Convert from cycles per millisecond to cycles per second (Hz). */
394 result->timestamp_disjoint.frequency =
395 (uint64_t)rctx->screen->info.clock_crystal_freq * 1000;
396 result->timestamp_disjoint.disjoint = false;
397 return true;
398 case PIPE_QUERY_GPU_FINISHED: {
399 struct pipe_screen *screen = rctx->b.screen;
400 struct pipe_context *ctx = rquery->b.flushed ? NULL : &rctx->b;
401
402 result->b = screen->fence_finish(screen, ctx, query->fence,
403 wait ? PIPE_TIMEOUT_INFINITE : 0);
404 return result->b;
405 }
406
407 case R600_QUERY_CS_THREAD_BUSY:
408 case R600_QUERY_GALLIUM_THREAD_BUSY:
409 result->u64 = (query->end_result - query->begin_result) * 100 /
410 (query->end_time - query->begin_time);
411 return true;
412 case R600_QUERY_GPIN_ASIC_ID:
413 result->u32 = 0;
414 return true;
415 case R600_QUERY_GPIN_NUM_SIMD:
416 result->u32 = rctx->screen->info.num_good_compute_units;
417 return true;
418 case R600_QUERY_GPIN_NUM_RB:
419 result->u32 = rctx->screen->info.num_render_backends;
420 return true;
421 case R600_QUERY_GPIN_NUM_SPI:
422 result->u32 = 1; /* all supported chips have one SPI per SE */
423 return true;
424 case R600_QUERY_GPIN_NUM_SE:
425 result->u32 = rctx->screen->info.max_se;
426 return true;
427 }
428
429 result->u64 = query->end_result - query->begin_result;
430
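	/* Unit conversions: the winsys reports buffer wait time in ns and the
	 * temperature in millidegrees (hence the division by 1000), while
	 * SCLK/MCLK come back in MHz and are scaled up to Hz. */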
431 switch (query->b.type) {
432 case R600_QUERY_BUFFER_WAIT_TIME:
433 case R600_QUERY_GPU_TEMPERATURE:
434 result->u64 /= 1000;
435 break;
436 case R600_QUERY_CURRENT_GPU_SCLK:
437 case R600_QUERY_CURRENT_GPU_MCLK:
438 result->u64 *= 1000000;
439 break;
440 }
441
442 return true;
443 }
444
445
446 static struct r600_query_ops sw_query_ops = {
447 .destroy = r600_query_sw_destroy,
448 .begin = r600_query_sw_begin,
449 .end = r600_query_sw_end,
450 .get_result = r600_query_sw_get_result,
451 .get_result_resource = NULL
452 };
453
454 static struct pipe_query *r600_query_sw_create(unsigned query_type)
455 {
456 struct r600_query_sw *query;
457
458 query = CALLOC_STRUCT(r600_query_sw);
459 if (!query)
460 return NULL;
461
462 query->b.type = query_type;
463 query->b.ops = &sw_query_ops;
464
465 return (struct pipe_query *)query;
466 }
467
468 void r600_query_hw_destroy(struct r600_common_screen *rscreen,
469 struct r600_query *rquery)
470 {
471 struct r600_query_hw *query = (struct r600_query_hw *)rquery;
472 struct r600_query_buffer *prev = query->buffer.previous;
473
474 /* Release all query buffers. */
475 while (prev) {
476 struct r600_query_buffer *qbuf = prev;
477 prev = prev->previous;
478 r600_resource_reference(&qbuf->buf, NULL);
479 FREE(qbuf);
480 }
481
482 r600_resource_reference(&query->buffer.buf, NULL);
483 FREE(rquery);
484 }
485
486 static struct r600_resource *r600_new_query_buffer(struct r600_common_screen *rscreen,
487 struct r600_query_hw *query)
488 {
489 unsigned buf_size = MAX2(query->result_size,
490 rscreen->info.min_alloc_size);
491
492 /* Queries are normally read by the CPU after
493	 * being written by the GPU, hence staging is probably a good
494 * usage pattern.
495 */
496 struct r600_resource *buf = (struct r600_resource*)
497 pipe_buffer_create(&rscreen->b, 0,
498 PIPE_USAGE_STAGING, buf_size);
499 if (!buf)
500 return NULL;
501
502 if (!query->ops->prepare_buffer(rscreen, query, buf)) {
503 r600_resource_reference(&buf, NULL);
504 return NULL;
505 }
506
507 return buf;
508 }
509
510 static bool r600_query_hw_prepare_buffer(struct r600_common_screen *rscreen,
511 struct r600_query_hw *query,
512 struct r600_resource *buffer)
513 {
514 /* Callers ensure that the buffer is currently unused by the GPU. */
515 uint32_t *results = rscreen->ws->buffer_map(buffer->buf, NULL,
516 PIPE_TRANSFER_WRITE |
517 PIPE_TRANSFER_UNSYNCHRONIZED);
518 if (!results)
519 return false;
520
521 memset(results, 0, buffer->b.b.width0);
522
523 if (query->b.type == PIPE_QUERY_OCCLUSION_COUNTER ||
524 query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE) {
525 unsigned max_rbs = rscreen->info.num_render_backends;
526 unsigned enabled_rb_mask = rscreen->info.enabled_rb_mask;
527 unsigned num_results;
528 unsigned i, j;
529
530 /* Set top bits for unused backends. */
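		/* A disabled RB never writes its ZPASS_DONE pair, so pre-set the
		 * "result written" bit (bit 63) of both the begin and end values;
		 * the pair then reads back as available with a delta of zero
		 * (see r600_query_read_result). */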
531 num_results = buffer->b.b.width0 / query->result_size;
532 for (j = 0; j < num_results; j++) {
533 for (i = 0; i < max_rbs; i++) {
534 if (!(enabled_rb_mask & (1<<i))) {
535 results[(i * 4)+1] = 0x80000000;
536 results[(i * 4)+3] = 0x80000000;
537 }
538 }
539 results += 4 * max_rbs;
540 }
541 }
542
543 return true;
544 }
545
546 static void r600_query_hw_get_result_resource(struct r600_common_context *rctx,
547 struct r600_query *rquery,
548 bool wait,
549 enum pipe_query_value_type result_type,
550 int index,
551 struct pipe_resource *resource,
552 unsigned offset);
553
554 static struct r600_query_ops query_hw_ops = {
555 .destroy = r600_query_hw_destroy,
556 .begin = r600_query_hw_begin,
557 .end = r600_query_hw_end,
558 .get_result = r600_query_hw_get_result,
559 .get_result_resource = r600_query_hw_get_result_resource,
560 };
561
562 static void r600_query_hw_do_emit_start(struct r600_common_context *ctx,
563 struct r600_query_hw *query,
564 struct r600_resource *buffer,
565 uint64_t va);
566 static void r600_query_hw_do_emit_stop(struct r600_common_context *ctx,
567 struct r600_query_hw *query,
568 struct r600_resource *buffer,
569 uint64_t va);
570 static void r600_query_hw_add_result(struct r600_common_screen *rscreen,
571 struct r600_query_hw *, void *buffer,
572 union pipe_query_result *result);
573 static void r600_query_hw_clear_result(struct r600_query_hw *,
574 union pipe_query_result *);
575
576 static struct r600_query_hw_ops query_hw_default_hw_ops = {
577 .prepare_buffer = r600_query_hw_prepare_buffer,
578 .emit_start = r600_query_hw_do_emit_start,
579 .emit_stop = r600_query_hw_do_emit_stop,
580 .clear_result = r600_query_hw_clear_result,
581 .add_result = r600_query_hw_add_result,
582 };
583
584 bool r600_query_hw_init(struct r600_common_screen *rscreen,
585 struct r600_query_hw *query)
586 {
587 query->buffer.buf = r600_new_query_buffer(rscreen, query);
588 if (!query->buffer.buf)
589 return false;
590
591 return true;
592 }
593
594 static struct pipe_query *r600_query_hw_create(struct r600_common_screen *rscreen,
595 unsigned query_type,
596 unsigned index)
597 {
598 struct r600_query_hw *query = CALLOC_STRUCT(r600_query_hw);
599 if (!query)
600 return NULL;
601
602 query->b.type = query_type;
603 query->b.ops = &query_hw_ops;
604 query->ops = &query_hw_default_hw_ops;
605
606 switch (query_type) {
607 case PIPE_QUERY_OCCLUSION_COUNTER:
608 case PIPE_QUERY_OCCLUSION_PREDICATE:
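		/* Each RB gets a 16-byte slot holding its begin/end pair of
		 * 64-bit ZPASS counters. */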
609 query->result_size = 16 * rscreen->info.num_render_backends;
610 query->result_size += 16; /* for the fence + alignment */
611 query->num_cs_dw_begin = 6;
612 query->num_cs_dw_end = 6 + r600_gfx_write_fence_dwords(rscreen);
613 break;
614 case PIPE_QUERY_TIME_ELAPSED:
615 query->result_size = 24;
616 query->num_cs_dw_begin = 8;
617 query->num_cs_dw_end = 8 + r600_gfx_write_fence_dwords(rscreen);
618 break;
619 case PIPE_QUERY_TIMESTAMP:
620 query->result_size = 16;
621 query->num_cs_dw_end = 8 + r600_gfx_write_fence_dwords(rscreen);
622 query->flags = R600_QUERY_HW_FLAG_NO_START;
623 break;
624 case PIPE_QUERY_PRIMITIVES_EMITTED:
625 case PIPE_QUERY_PRIMITIVES_GENERATED:
626 case PIPE_QUERY_SO_STATISTICS:
627 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
628 /* NumPrimitivesWritten, PrimitiveStorageNeeded. */
629 query->result_size = 32;
630 query->num_cs_dw_begin = 6;
631 query->num_cs_dw_end = 6;
632 query->stream = index;
633 break;
634 case PIPE_QUERY_PIPELINE_STATISTICS:
635 /* 11 values on EG, 8 on R600. */
636 query->result_size = (rscreen->chip_class >= EVERGREEN ? 11 : 8) * 16;
637 query->result_size += 8; /* for the fence + alignment */
638 query->num_cs_dw_begin = 6;
639 query->num_cs_dw_end = 6 + r600_gfx_write_fence_dwords(rscreen);
640 break;
641 default:
642 assert(0);
643 FREE(query);
644 return NULL;
645 }
646
647 if (!r600_query_hw_init(rscreen, query)) {
648 FREE(query);
649 return NULL;
650 }
651
652 return (struct pipe_query *)query;
653 }
654
655 static void r600_update_occlusion_query_state(struct r600_common_context *rctx,
656 unsigned type, int diff)
657 {
658 if (type == PIPE_QUERY_OCCLUSION_COUNTER ||
659 type == PIPE_QUERY_OCCLUSION_PREDICATE) {
660 bool old_enable = rctx->num_occlusion_queries != 0;
661 bool old_perfect_enable =
662 rctx->num_perfect_occlusion_queries != 0;
663 bool enable, perfect_enable;
664
665 rctx->num_occlusion_queries += diff;
666 assert(rctx->num_occlusion_queries >= 0);
667
668 if (type == PIPE_QUERY_OCCLUSION_COUNTER) {
669 rctx->num_perfect_occlusion_queries += diff;
670 assert(rctx->num_perfect_occlusion_queries >= 0);
671 }
672
673 enable = rctx->num_occlusion_queries != 0;
674 perfect_enable = rctx->num_perfect_occlusion_queries != 0;
675
676 if (enable != old_enable || perfect_enable != old_perfect_enable) {
677 rctx->set_occlusion_query_state(&rctx->b, enable);
678 }
679 }
680 }
681
682 static unsigned event_type_for_stream(struct r600_query_hw *query)
683 {
684 switch (query->stream) {
685 default:
686 case 0: return EVENT_TYPE_SAMPLE_STREAMOUTSTATS;
687 case 1: return EVENT_TYPE_SAMPLE_STREAMOUTSTATS1;
688 case 2: return EVENT_TYPE_SAMPLE_STREAMOUTSTATS2;
689 case 3: return EVENT_TYPE_SAMPLE_STREAMOUTSTATS3;
690 }
691 }
692
693 static void r600_query_hw_do_emit_start(struct r600_common_context *ctx,
694 struct r600_query_hw *query,
695 struct r600_resource *buffer,
696 uint64_t va)
697 {
698 struct radeon_winsys_cs *cs = ctx->gfx.cs;
699
700 switch (query->b.type) {
701 case PIPE_QUERY_OCCLUSION_COUNTER:
702 case PIPE_QUERY_OCCLUSION_PREDICATE:
703 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
704 radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1));
705 radeon_emit(cs, va);
706 radeon_emit(cs, va >> 32);
707 break;
708 case PIPE_QUERY_PRIMITIVES_EMITTED:
709 case PIPE_QUERY_PRIMITIVES_GENERATED:
710 case PIPE_QUERY_SO_STATISTICS:
711 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
712 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
713 radeon_emit(cs, EVENT_TYPE(event_type_for_stream(query)) | EVENT_INDEX(3));
714 radeon_emit(cs, va);
715 radeon_emit(cs, va >> 32);
716 break;
717 case PIPE_QUERY_TIME_ELAPSED:
718 if (ctx->chip_class >= SI) {
719 /* Write the timestamp from the CP not waiting for
720 * outstanding draws (top-of-pipe).
721 */
722 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
723 radeon_emit(cs, COPY_DATA_COUNT_SEL |
724 COPY_DATA_SRC_SEL(COPY_DATA_TIMESTAMP) |
725 COPY_DATA_DST_SEL(COPY_DATA_MEM_ASYNC));
726 radeon_emit(cs, 0);
727 radeon_emit(cs, 0);
728 radeon_emit(cs, va);
729 radeon_emit(cs, va >> 32);
730 } else {
731 /* Write the timestamp after the last draw is done.
732 * (bottom-of-pipe)
733 */
734 r600_gfx_write_event_eop(ctx, EVENT_TYPE_BOTTOM_OF_PIPE_TS,
735 0, 3, NULL, va, 0, 0);
736 }
737 break;
738 case PIPE_QUERY_PIPELINE_STATISTICS:
739 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
740 radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
741 radeon_emit(cs, va);
742 radeon_emit(cs, va >> 32);
743 break;
744 default:
745 assert(0);
746 }
747 r600_emit_reloc(ctx, &ctx->gfx, query->buffer.buf, RADEON_USAGE_WRITE,
748 RADEON_PRIO_QUERY);
749 }
750
751 static void r600_query_hw_emit_start(struct r600_common_context *ctx,
752 struct r600_query_hw *query)
753 {
754 uint64_t va;
755
756 if (!query->buffer.buf)
757 return; // previous buffer allocation failure
758
759 r600_update_occlusion_query_state(ctx, query->b.type, 1);
760 r600_update_prims_generated_query_state(ctx, query->b.type, 1);
761
762 ctx->need_gfx_cs_space(&ctx->b, query->num_cs_dw_begin + query->num_cs_dw_end,
763 true);
764
765 /* Get a new query buffer if needed. */
766 if (query->buffer.results_end + query->result_size > query->buffer.buf->b.b.width0) {
767 struct r600_query_buffer *qbuf = MALLOC_STRUCT(r600_query_buffer);
768 *qbuf = query->buffer;
769 query->buffer.results_end = 0;
770 query->buffer.previous = qbuf;
771 query->buffer.buf = r600_new_query_buffer(ctx->screen, query);
772 if (!query->buffer.buf)
773 return;
774 }
775
776 /* emit begin query */
777 va = query->buffer.buf->gpu_address + query->buffer.results_end;
778
779 query->ops->emit_start(ctx, query, query->buffer.buf, va);
780
781 ctx->num_cs_dw_queries_suspend += query->num_cs_dw_end;
782 }
783
784 static void r600_query_hw_do_emit_stop(struct r600_common_context *ctx,
785 struct r600_query_hw *query,
786 struct r600_resource *buffer,
787 uint64_t va)
788 {
789 struct radeon_winsys_cs *cs = ctx->gfx.cs;
790 uint64_t fence_va = 0;
791
792 switch (query->b.type) {
793 case PIPE_QUERY_OCCLUSION_COUNTER:
794 case PIPE_QUERY_OCCLUSION_PREDICATE:
795 va += 8;
796 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
797 radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1));
798 radeon_emit(cs, va);
799 radeon_emit(cs, va >> 32);
800
801 fence_va = va + ctx->screen->info.num_render_backends * 16 - 8;
802 break;
803 case PIPE_QUERY_PRIMITIVES_EMITTED:
804 case PIPE_QUERY_PRIMITIVES_GENERATED:
805 case PIPE_QUERY_SO_STATISTICS:
806 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
807 va += query->result_size/2;
808 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
809 radeon_emit(cs, EVENT_TYPE(event_type_for_stream(query)) | EVENT_INDEX(3));
810 radeon_emit(cs, va);
811 radeon_emit(cs, va >> 32);
812 break;
813 case PIPE_QUERY_TIME_ELAPSED:
814 va += 8;
815 /* fall through */
816 case PIPE_QUERY_TIMESTAMP:
817 r600_gfx_write_event_eop(ctx, EVENT_TYPE_BOTTOM_OF_PIPE_TS,
818 0, 3, NULL, va, 0, 0);
819 fence_va = va + 8;
820 break;
821 case PIPE_QUERY_PIPELINE_STATISTICS: {
822 unsigned sample_size = (query->result_size - 8) / 2;
823
824 va += sample_size;
825 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
826 radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
827 radeon_emit(cs, va);
828 radeon_emit(cs, va >> 32);
829
830 fence_va = va + sample_size;
831 break;
832 }
833 default:
834 assert(0);
835 }
836 r600_emit_reloc(ctx, &ctx->gfx, query->buffer.buf, RADEON_USAGE_WRITE,
837 RADEON_PRIO_QUERY);
838
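	/* Once the stop event has landed, mark the result slot as available by
	 * writing 0x80000000 into the fence dword; readers and the
	 * result-resource shader poll for this value. */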
839 if (fence_va)
840 r600_gfx_write_event_eop(ctx, EVENT_TYPE_BOTTOM_OF_PIPE_TS, 0, 1,
841 query->buffer.buf, fence_va, 0, 0x80000000);
842 }
843
844 static void r600_query_hw_emit_stop(struct r600_common_context *ctx,
845 struct r600_query_hw *query)
846 {
847 uint64_t va;
848
849 if (!query->buffer.buf)
850 return; // previous buffer allocation failure
851
852 	/* Queries with a begin already reserved space for the end packet in begin_query; only NO_START queries need CS space here. */
853 if (query->flags & R600_QUERY_HW_FLAG_NO_START) {
854 ctx->need_gfx_cs_space(&ctx->b, query->num_cs_dw_end, false);
855 }
856
857 /* emit end query */
858 va = query->buffer.buf->gpu_address + query->buffer.results_end;
859
860 query->ops->emit_stop(ctx, query, query->buffer.buf, va);
861
862 query->buffer.results_end += query->result_size;
863
864 if (!(query->flags & R600_QUERY_HW_FLAG_NO_START))
865 ctx->num_cs_dw_queries_suspend -= query->num_cs_dw_end;
866
867 r600_update_occlusion_query_state(ctx, query->b.type, -1);
868 r600_update_prims_generated_query_state(ctx, query->b.type, -1);
869 }
870
871 static void r600_emit_query_predication(struct r600_common_context *ctx,
872 struct r600_atom *atom)
873 {
874 struct radeon_winsys_cs *cs = ctx->gfx.cs;
875 struct r600_query_hw *query = (struct r600_query_hw *)ctx->render_cond;
876 struct r600_query_buffer *qbuf;
877 uint32_t op;
878 bool flag_wait;
879
880 if (!query)
881 return;
882
883 flag_wait = ctx->render_cond_mode == PIPE_RENDER_COND_WAIT ||
884 ctx->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT;
885
886 switch (query->b.type) {
887 case PIPE_QUERY_OCCLUSION_COUNTER:
888 case PIPE_QUERY_OCCLUSION_PREDICATE:
889 op = PRED_OP(PREDICATION_OP_ZPASS);
890 break;
891 case PIPE_QUERY_PRIMITIVES_EMITTED:
892 case PIPE_QUERY_PRIMITIVES_GENERATED:
893 case PIPE_QUERY_SO_STATISTICS:
894 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
895 op = PRED_OP(PREDICATION_OP_PRIMCOUNT);
896 break;
897 default:
898 assert(0);
899 return;
900 }
901
902 /* if true then invert, see GL_ARB_conditional_render_inverted */
903 if (ctx->render_cond_invert)
904 		op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visible/overflow */
905 	else
906 		op |= PREDICATION_DRAW_VISIBLE; /* Draw if visible/overflow */
907
908 op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW;
909
910 /* emit predicate packets for all data blocks */
911 for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
912 unsigned results_base = 0;
913 uint64_t va_base = qbuf->buf->gpu_address;
914
915 while (results_base < qbuf->results_end) {
916 uint64_t va = va_base + results_base;
917
918 if (ctx->chip_class >= GFX9) {
919 radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 2, 0));
920 radeon_emit(cs, op);
921 radeon_emit(cs, va);
922 radeon_emit(cs, va >> 32);
923 } else {
924 radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 1, 0));
925 radeon_emit(cs, va);
926 radeon_emit(cs, op | ((va >> 32) & 0xFF));
927 }
928 r600_emit_reloc(ctx, &ctx->gfx, qbuf->buf, RADEON_USAGE_READ,
929 RADEON_PRIO_QUERY);
930 results_base += query->result_size;
931
932 /* set CONTINUE bit for all packets except the first */
933 op |= PREDICATION_CONTINUE;
934 }
935 }
936 }
937
938 static struct pipe_query *r600_create_query(struct pipe_context *ctx, unsigned query_type, unsigned index)
939 {
940 struct r600_common_screen *rscreen =
941 (struct r600_common_screen *)ctx->screen;
942
943 if (query_type == PIPE_QUERY_TIMESTAMP_DISJOINT ||
944 query_type == PIPE_QUERY_GPU_FINISHED ||
945 query_type >= PIPE_QUERY_DRIVER_SPECIFIC)
946 return r600_query_sw_create(query_type);
947
948 return r600_query_hw_create(rscreen, query_type, index);
949 }
950
951 static void r600_destroy_query(struct pipe_context *ctx, struct pipe_query *query)
952 {
953 struct r600_common_context *rctx = (struct r600_common_context *)ctx;
954 struct r600_query *rquery = (struct r600_query *)query;
955
956 rquery->ops->destroy(rctx->screen, rquery);
957 }
958
959 static boolean r600_begin_query(struct pipe_context *ctx,
960 struct pipe_query *query)
961 {
962 struct r600_common_context *rctx = (struct r600_common_context *)ctx;
963 struct r600_query *rquery = (struct r600_query *)query;
964
965 return rquery->ops->begin(rctx, rquery);
966 }
967
968 void r600_query_hw_reset_buffers(struct r600_common_context *rctx,
969 struct r600_query_hw *query)
970 {
971 struct r600_query_buffer *prev = query->buffer.previous;
972
973 /* Discard the old query buffers. */
974 while (prev) {
975 struct r600_query_buffer *qbuf = prev;
976 prev = prev->previous;
977 r600_resource_reference(&qbuf->buf, NULL);
978 FREE(qbuf);
979 }
980
981 query->buffer.results_end = 0;
982 query->buffer.previous = NULL;
983
984 /* Obtain a new buffer if the current one can't be mapped without a stall. */
985 if (r600_rings_is_buffer_referenced(rctx, query->buffer.buf->buf, RADEON_USAGE_READWRITE) ||
986 !rctx->ws->buffer_wait(query->buffer.buf->buf, 0, RADEON_USAGE_READWRITE)) {
987 r600_resource_reference(&query->buffer.buf, NULL);
988 query->buffer.buf = r600_new_query_buffer(rctx->screen, query);
989 } else {
990 if (!query->ops->prepare_buffer(rctx->screen, query, query->buffer.buf))
991 r600_resource_reference(&query->buffer.buf, NULL);
992 }
993 }
994
995 bool r600_query_hw_begin(struct r600_common_context *rctx,
996 struct r600_query *rquery)
997 {
998 struct r600_query_hw *query = (struct r600_query_hw *)rquery;
999
1000 if (query->flags & R600_QUERY_HW_FLAG_NO_START) {
1001 assert(0);
1002 return false;
1003 }
1004
1005 if (!(query->flags & R600_QUERY_HW_FLAG_BEGIN_RESUMES))
1006 r600_query_hw_reset_buffers(rctx, query);
1007
1008 r600_query_hw_emit_start(rctx, query);
1009 if (!query->buffer.buf)
1010 return false;
1011
1012 LIST_ADDTAIL(&query->list, &rctx->active_queries);
1013 return true;
1014 }
1015
1016 static bool r600_end_query(struct pipe_context *ctx, struct pipe_query *query)
1017 {
1018 struct r600_common_context *rctx = (struct r600_common_context *)ctx;
1019 struct r600_query *rquery = (struct r600_query *)query;
1020
1021 return rquery->ops->end(rctx, rquery);
1022 }
1023
1024 bool r600_query_hw_end(struct r600_common_context *rctx,
1025 struct r600_query *rquery)
1026 {
1027 struct r600_query_hw *query = (struct r600_query_hw *)rquery;
1028
1029 if (query->flags & R600_QUERY_HW_FLAG_NO_START)
1030 r600_query_hw_reset_buffers(rctx, query);
1031
1032 r600_query_hw_emit_stop(rctx, query);
1033
1034 if (!(query->flags & R600_QUERY_HW_FLAG_NO_START))
1035 LIST_DELINIT(&query->list);
1036
1037 if (!query->buffer.buf)
1038 return false;
1039
1040 return true;
1041 }
1042
1043 static void r600_get_hw_query_params(struct r600_common_context *rctx,
1044 struct r600_query_hw *rquery, int index,
1045 struct r600_hw_query_params *params)
1046 {
1047 unsigned max_rbs = rctx->screen->info.num_render_backends;
1048
1049 params->pair_stride = 0;
1050 params->pair_count = 1;
1051
1052 switch (rquery->b.type) {
1053 case PIPE_QUERY_OCCLUSION_COUNTER:
1054 case PIPE_QUERY_OCCLUSION_PREDICATE:
1055 params->start_offset = 0;
1056 params->end_offset = 8;
1057 params->fence_offset = max_rbs * 16;
1058 params->pair_stride = 16;
1059 params->pair_count = max_rbs;
1060 break;
1061 case PIPE_QUERY_TIME_ELAPSED:
1062 params->start_offset = 0;
1063 params->end_offset = 8;
1064 params->fence_offset = 16;
1065 break;
1066 case PIPE_QUERY_TIMESTAMP:
1067 params->start_offset = 0;
1068 params->end_offset = 0;
1069 params->fence_offset = 8;
1070 break;
1071 case PIPE_QUERY_PRIMITIVES_EMITTED:
1072 params->start_offset = 8;
1073 params->end_offset = 24;
1074 params->fence_offset = params->end_offset + 4;
1075 break;
1076 case PIPE_QUERY_PRIMITIVES_GENERATED:
1077 params->start_offset = 0;
1078 params->end_offset = 16;
1079 params->fence_offset = params->end_offset + 4;
1080 break;
1081 case PIPE_QUERY_SO_STATISTICS:
1082 params->start_offset = 8 - index * 8;
1083 params->end_offset = 24 - index * 8;
1084 params->fence_offset = params->end_offset + 4;
1085 break;
1086 case PIPE_QUERY_PIPELINE_STATISTICS:
1087 {
1088 /* Offsets apply to EG+ */
1089 static const unsigned offsets[] = {56, 48, 24, 32, 40, 16, 8, 0, 64, 72, 80};
1090 params->start_offset = offsets[index];
1091 params->end_offset = 88 + offsets[index];
1092 params->fence_offset = 2 * 88;
1093 break;
1094 }
1095 default:
1096 unreachable("r600_get_hw_query_params unsupported");
1097 }
1098 }
1099
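/* Read a begin/end pair of 64-bit values at the given dword offsets and
 * return end - start. With test_status_bit, bit 63 of each value acts as a
 * "result written" flag and 0 is returned until both values have landed. */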
1100 static unsigned r600_query_read_result(void *map, unsigned start_index, unsigned end_index,
1101 bool test_status_bit)
1102 {
1103 uint32_t *current_result = (uint32_t*)map;
1104 uint64_t start, end;
1105
1106 start = (uint64_t)current_result[start_index] |
1107 (uint64_t)current_result[start_index+1] << 32;
1108 end = (uint64_t)current_result[end_index] |
1109 (uint64_t)current_result[end_index+1] << 32;
1110
1111 if (!test_status_bit ||
1112 ((start & 0x8000000000000000UL) && (end & 0x8000000000000000UL))) {
1113 return end - start;
1114 }
1115 return 0;
1116 }
1117
1118 static void r600_query_hw_add_result(struct r600_common_screen *rscreen,
1119 struct r600_query_hw *query,
1120 void *buffer,
1121 union pipe_query_result *result)
1122 {
1123 unsigned max_rbs = rscreen->info.num_render_backends;
1124
1125 switch (query->b.type) {
1126 case PIPE_QUERY_OCCLUSION_COUNTER: {
1127 for (unsigned i = 0; i < max_rbs; ++i) {
1128 unsigned results_base = i * 16;
1129 result->u64 +=
1130 r600_query_read_result(buffer + results_base, 0, 2, true);
1131 }
1132 break;
1133 }
1134 case PIPE_QUERY_OCCLUSION_PREDICATE: {
1135 for (unsigned i = 0; i < max_rbs; ++i) {
1136 unsigned results_base = i * 16;
1137 result->b = result->b ||
1138 r600_query_read_result(buffer + results_base, 0, 2, true) != 0;
1139 }
1140 break;
1141 }
1142 case PIPE_QUERY_TIME_ELAPSED:
1143 result->u64 += r600_query_read_result(buffer, 0, 2, false);
1144 break;
1145 case PIPE_QUERY_TIMESTAMP:
1146 result->u64 = *(uint64_t*)buffer;
1147 break;
1148 case PIPE_QUERY_PRIMITIVES_EMITTED:
1149 /* SAMPLE_STREAMOUTSTATS stores this structure:
1150 * {
1151 * u64 NumPrimitivesWritten;
1152 * u64 PrimitiveStorageNeeded;
1153 * }
1154 * We only need NumPrimitivesWritten here. */
1155 result->u64 += r600_query_read_result(buffer, 2, 6, true);
1156 break;
1157 case PIPE_QUERY_PRIMITIVES_GENERATED:
1158 /* Here we read PrimitiveStorageNeeded. */
1159 result->u64 += r600_query_read_result(buffer, 0, 4, true);
1160 break;
1161 case PIPE_QUERY_SO_STATISTICS:
1162 result->so_statistics.num_primitives_written +=
1163 r600_query_read_result(buffer, 2, 6, true);
1164 result->so_statistics.primitives_storage_needed +=
1165 r600_query_read_result(buffer, 0, 4, true);
1166 break;
1167 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
1168 result->b = result->b ||
1169 r600_query_read_result(buffer, 2, 6, true) !=
1170 r600_query_read_result(buffer, 0, 4, true);
1171 break;
1172 case PIPE_QUERY_PIPELINE_STATISTICS:
1173 if (rscreen->chip_class >= EVERGREEN) {
1174 result->pipeline_statistics.ps_invocations +=
1175 r600_query_read_result(buffer, 0, 22, false);
1176 result->pipeline_statistics.c_primitives +=
1177 r600_query_read_result(buffer, 2, 24, false);
1178 result->pipeline_statistics.c_invocations +=
1179 r600_query_read_result(buffer, 4, 26, false);
1180 result->pipeline_statistics.vs_invocations +=
1181 r600_query_read_result(buffer, 6, 28, false);
1182 result->pipeline_statistics.gs_invocations +=
1183 r600_query_read_result(buffer, 8, 30, false);
1184 result->pipeline_statistics.gs_primitives +=
1185 r600_query_read_result(buffer, 10, 32, false);
1186 result->pipeline_statistics.ia_primitives +=
1187 r600_query_read_result(buffer, 12, 34, false);
1188 result->pipeline_statistics.ia_vertices +=
1189 r600_query_read_result(buffer, 14, 36, false);
1190 result->pipeline_statistics.hs_invocations +=
1191 r600_query_read_result(buffer, 16, 38, false);
1192 result->pipeline_statistics.ds_invocations +=
1193 r600_query_read_result(buffer, 18, 40, false);
1194 result->pipeline_statistics.cs_invocations +=
1195 r600_query_read_result(buffer, 20, 42, false);
1196 } else {
1197 result->pipeline_statistics.ps_invocations +=
1198 r600_query_read_result(buffer, 0, 16, false);
1199 result->pipeline_statistics.c_primitives +=
1200 r600_query_read_result(buffer, 2, 18, false);
1201 result->pipeline_statistics.c_invocations +=
1202 r600_query_read_result(buffer, 4, 20, false);
1203 result->pipeline_statistics.vs_invocations +=
1204 r600_query_read_result(buffer, 6, 22, false);
1205 result->pipeline_statistics.gs_invocations +=
1206 r600_query_read_result(buffer, 8, 24, false);
1207 result->pipeline_statistics.gs_primitives +=
1208 r600_query_read_result(buffer, 10, 26, false);
1209 result->pipeline_statistics.ia_primitives +=
1210 r600_query_read_result(buffer, 12, 28, false);
1211 result->pipeline_statistics.ia_vertices +=
1212 r600_query_read_result(buffer, 14, 30, false);
1213 }
1214 #if 0 /* for testing */
1215 printf("Pipeline stats: IA verts=%llu, IA prims=%llu, VS=%llu, HS=%llu, "
1216 "DS=%llu, GS=%llu, GS prims=%llu, Clipper=%llu, "
1217 "Clipper prims=%llu, PS=%llu, CS=%llu\n",
1218 result->pipeline_statistics.ia_vertices,
1219 result->pipeline_statistics.ia_primitives,
1220 result->pipeline_statistics.vs_invocations,
1221 result->pipeline_statistics.hs_invocations,
1222 result->pipeline_statistics.ds_invocations,
1223 result->pipeline_statistics.gs_invocations,
1224 result->pipeline_statistics.gs_primitives,
1225 result->pipeline_statistics.c_invocations,
1226 result->pipeline_statistics.c_primitives,
1227 result->pipeline_statistics.ps_invocations,
1228 result->pipeline_statistics.cs_invocations);
1229 #endif
1230 break;
1231 default:
1232 assert(0);
1233 }
1234 }
1235
1236 static boolean r600_get_query_result(struct pipe_context *ctx,
1237 struct pipe_query *query, boolean wait,
1238 union pipe_query_result *result)
1239 {
1240 struct r600_common_context *rctx = (struct r600_common_context *)ctx;
1241 struct r600_query *rquery = (struct r600_query *)query;
1242
1243 return rquery->ops->get_result(rctx, rquery, wait, result);
1244 }
1245
1246 static void r600_get_query_result_resource(struct pipe_context *ctx,
1247 struct pipe_query *query,
1248 boolean wait,
1249 enum pipe_query_value_type result_type,
1250 int index,
1251 struct pipe_resource *resource,
1252 unsigned offset)
1253 {
1254 struct r600_common_context *rctx = (struct r600_common_context *)ctx;
1255 struct r600_query *rquery = (struct r600_query *)query;
1256
1257 rquery->ops->get_result_resource(rctx, rquery, wait, result_type, index,
1258 resource, offset);
1259 }
1260
1261 static void r600_query_hw_clear_result(struct r600_query_hw *query,
1262 union pipe_query_result *result)
1263 {
1264 util_query_clear_result(result, query->b.type);
1265 }
1266
1267 bool r600_query_hw_get_result(struct r600_common_context *rctx,
1268 struct r600_query *rquery,
1269 bool wait, union pipe_query_result *result)
1270 {
1271 struct r600_common_screen *rscreen = rctx->screen;
1272 struct r600_query_hw *query = (struct r600_query_hw *)rquery;
1273 struct r600_query_buffer *qbuf;
1274
1275 query->ops->clear_result(query, result);
1276
1277 for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
1278 unsigned usage = PIPE_TRANSFER_READ |
1279 (wait ? 0 : PIPE_TRANSFER_DONTBLOCK);
1280 unsigned results_base = 0;
1281 void *map;
1282
1283 if (rquery->b.flushed)
1284 map = rctx->ws->buffer_map(qbuf->buf->buf, NULL, usage);
1285 else
1286 map = r600_buffer_map_sync_with_rings(rctx, qbuf->buf, usage);
1287
1288 if (!map)
1289 return false;
1290
1291 while (results_base != qbuf->results_end) {
1292 query->ops->add_result(rscreen, query, map + results_base,
1293 result);
1294 results_base += query->result_size;
1295 }
1296 }
1297
1298 /* Convert the time to expected units. */
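	/* clock_crystal_freq is in kHz, so ticks * 1000000 / freq yields
	 * nanoseconds. */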
1299 if (rquery->type == PIPE_QUERY_TIME_ELAPSED ||
1300 rquery->type == PIPE_QUERY_TIMESTAMP) {
1301 result->u64 = (1000000 * result->u64) / rscreen->info.clock_crystal_freq;
1302 }
1303 return true;
1304 }
1305
1306 /* Create the compute shader that is used to collect the results.
1307 *
1308 * One compute grid with a single thread is launched for every query result
1309 * buffer. The thread (optionally) reads a previous summary buffer, then
1310 * accumulates data from the query result buffer, and writes the result either
1311 * to a summary buffer to be consumed by the next grid invocation or to the
1312 * user-supplied buffer.
1313 *
1314 * Data layout:
1315 *
1316 * CONST
1317 * 0.x = end_offset
1318 * 0.y = result_stride
1319 * 0.z = result_count
1320 * 0.w = bit field:
1321 * 1: read previously accumulated values
1322 * 2: write accumulated values for chaining
1323 * 4: write result available
1324 * 8: convert result to boolean (0/1)
1325 * 16: only read one dword and use that as result
1326 * 32: apply timestamp conversion
1327 * 64: store full 64 bits result
1328 * 128: store signed 32 bits result
1329 * 1.x = fence_offset
1330 * 1.y = pair_stride
1331 * 1.z = pair_count
1332 *
1333 * BUFFER[0] = query result buffer
1334 * BUFFER[1] = previous summary buffer
1335 * BUFFER[2] = next summary buffer or user-supplied buffer
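 *
 * Example (mirroring r600_query_hw_get_result_resource below): fetching a
 * PIPE_QUERY_TIMESTAMP as a 64-bit value uses config = 16 | 32 | 64, i.e.
 * read only the last result, apply the timestamp conversion and store all
 * 64 bits.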
1336 */
1337 static void r600_create_query_result_shader(struct r600_common_context *rctx)
1338 {
1339 /* TEMP[0].xy = accumulated result so far
1340 * TEMP[0].z = result not available
1341 *
1342 * TEMP[1].x = current result index
1343 * TEMP[1].y = current pair index
1344 */
1345 static const char text_tmpl[] =
1346 "COMP\n"
1347 "PROPERTY CS_FIXED_BLOCK_WIDTH 1\n"
1348 "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"
1349 "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
1350 "DCL BUFFER[0]\n"
1351 "DCL BUFFER[1]\n"
1352 "DCL BUFFER[2]\n"
1353 "DCL CONST[0..1]\n"
1354 "DCL TEMP[0..5]\n"
1355 "IMM[0] UINT32 {0, 31, 2147483647, 4294967295}\n"
1356 "IMM[1] UINT32 {1, 2, 4, 8}\n"
1357 "IMM[2] UINT32 {16, 32, 64, 128}\n"
1358 "IMM[3] UINT32 {1000000, 0, %u, 0}\n" /* for timestamp conversion */
1359
1360 "AND TEMP[5], CONST[0].wwww, IMM[2].xxxx\n"
1361 "UIF TEMP[5]\n"
1362 /* Check result availability. */
1363 "LOAD TEMP[1].x, BUFFER[0], CONST[1].xxxx\n"
1364 "ISHR TEMP[0].z, TEMP[1].xxxx, IMM[0].yyyy\n"
1365 "MOV TEMP[1], TEMP[0].zzzz\n"
1366 "NOT TEMP[0].z, TEMP[0].zzzz\n"
1367
1368 /* Load result if available. */
1369 "UIF TEMP[1]\n"
1370 "LOAD TEMP[0].xy, BUFFER[0], IMM[0].xxxx\n"
1371 "ENDIF\n"
1372 "ELSE\n"
1373 /* Load previously accumulated result if requested. */
1374 "MOV TEMP[0], IMM[0].xxxx\n"
1375 "AND TEMP[4], CONST[0].wwww, IMM[1].xxxx\n"
1376 "UIF TEMP[4]\n"
1377 "LOAD TEMP[0].xyz, BUFFER[1], IMM[0].xxxx\n"
1378 "ENDIF\n"
1379
1380 "MOV TEMP[1].x, IMM[0].xxxx\n"
1381 "BGNLOOP\n"
1382 /* Break if accumulated result so far is not available. */
1383 "UIF TEMP[0].zzzz\n"
1384 "BRK\n"
1385 "ENDIF\n"
1386
1387 /* Break if result_index >= result_count. */
1388 "USGE TEMP[5], TEMP[1].xxxx, CONST[0].zzzz\n"
1389 "UIF TEMP[5]\n"
1390 "BRK\n"
1391 "ENDIF\n"
1392
1393 /* Load fence and check result availability */
1394 "UMAD TEMP[5].x, TEMP[1].xxxx, CONST[0].yyyy, CONST[1].xxxx\n"
1395 "LOAD TEMP[5].x, BUFFER[0], TEMP[5].xxxx\n"
1396 "ISHR TEMP[0].z, TEMP[5].xxxx, IMM[0].yyyy\n"
1397 "NOT TEMP[0].z, TEMP[0].zzzz\n"
1398 "UIF TEMP[0].zzzz\n"
1399 "BRK\n"
1400 "ENDIF\n"
1401
1402 "MOV TEMP[1].y, IMM[0].xxxx\n"
1403 "BGNLOOP\n"
1404 /* Load start and end. */
1405 "UMUL TEMP[5].x, TEMP[1].xxxx, CONST[0].yyyy\n"
1406 "UMAD TEMP[5].x, TEMP[1].yyyy, CONST[1].yyyy, TEMP[5].xxxx\n"
1407 "LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n"
1408
1409 "UADD TEMP[5].x, TEMP[5].xxxx, CONST[0].xxxx\n"
1410 "LOAD TEMP[3].xy, BUFFER[0], TEMP[5].xxxx\n"
1411
1412 "U64ADD TEMP[3].xy, TEMP[3], -TEMP[2]\n"
1413 "U64ADD TEMP[0].xy, TEMP[0], TEMP[3]\n"
1414
1415 /* Increment pair index */
1416 "UADD TEMP[1].y, TEMP[1].yyyy, IMM[1].xxxx\n"
1417 "USGE TEMP[5], TEMP[1].yyyy, CONST[1].zzzz\n"
1418 "UIF TEMP[5]\n"
1419 "BRK\n"
1420 "ENDIF\n"
1421 "ENDLOOP\n"
1422
1423 /* Increment result index */
1424 "UADD TEMP[1].x, TEMP[1].xxxx, IMM[1].xxxx\n"
1425 "ENDLOOP\n"
1426 "ENDIF\n"
1427
1428 "AND TEMP[4], CONST[0].wwww, IMM[1].yyyy\n"
1429 "UIF TEMP[4]\n"
1430 /* Store accumulated data for chaining. */
1431 "STORE BUFFER[2].xyz, IMM[0].xxxx, TEMP[0]\n"
1432 "ELSE\n"
1433 "AND TEMP[4], CONST[0].wwww, IMM[1].zzzz\n"
1434 "UIF TEMP[4]\n"
1435 /* Store result availability. */
1436 "NOT TEMP[0].z, TEMP[0]\n"
1437 "AND TEMP[0].z, TEMP[0].zzzz, IMM[1].xxxx\n"
1438 "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].zzzz\n"
1439
1440 "AND TEMP[4], CONST[0].wwww, IMM[2].zzzz\n"
1441 "UIF TEMP[4]\n"
1442 "STORE BUFFER[2].y, IMM[0].xxxx, IMM[0].xxxx\n"
1443 "ENDIF\n"
1444 "ELSE\n"
1445 /* Store result if it is available. */
1446 "NOT TEMP[4], TEMP[0].zzzz\n"
1447 "UIF TEMP[4]\n"
1448 /* Apply timestamp conversion */
1449 "AND TEMP[4], CONST[0].wwww, IMM[2].yyyy\n"
1450 "UIF TEMP[4]\n"
1451 "U64MUL TEMP[0].xy, TEMP[0], IMM[3].xyxy\n"
1452 "U64DIV TEMP[0].xy, TEMP[0], IMM[3].zwzw\n"
1453 "ENDIF\n"
1454
1455 /* Convert to boolean */
1456 "AND TEMP[4], CONST[0].wwww, IMM[1].wwww\n"
1457 "UIF TEMP[4]\n"
1458 "U64SNE TEMP[0].x, TEMP[0].xyxy, IMM[0].xxxx\n"
1459 "AND TEMP[0].x, TEMP[0].xxxx, IMM[1].xxxx\n"
1460 "MOV TEMP[0].y, IMM[0].xxxx\n"
1461 "ENDIF\n"
1462
1463 "AND TEMP[4], CONST[0].wwww, IMM[2].zzzz\n"
1464 "UIF TEMP[4]\n"
1465 "STORE BUFFER[2].xy, IMM[0].xxxx, TEMP[0].xyxy\n"
1466 "ELSE\n"
1467 /* Clamping */
1468 "UIF TEMP[0].yyyy\n"
1469 "MOV TEMP[0].x, IMM[0].wwww\n"
1470 "ENDIF\n"
1471
1472 "AND TEMP[4], CONST[0].wwww, IMM[2].wwww\n"
1473 "UIF TEMP[4]\n"
1474 "UMIN TEMP[0].x, TEMP[0].xxxx, IMM[0].zzzz\n"
1475 "ENDIF\n"
1476
1477 "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].xxxx\n"
1478 "ENDIF\n"
1479 "ENDIF\n"
1480 "ENDIF\n"
1481 "ENDIF\n"
1482
1483 "END\n";
1484
1485 char text[sizeof(text_tmpl) + 32];
1486 struct tgsi_token tokens[1024];
1487 struct pipe_compute_state state = {};
1488
1489 /* Hard code the frequency into the shader so that the backend can
1490 * use the full range of optimizations for divide-by-constant.
1491 */
1492 snprintf(text, sizeof(text), text_tmpl,
1493 rctx->screen->info.clock_crystal_freq);
1494
1495 if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
1496 assert(false);
1497 return;
1498 }
1499
1500 state.ir_type = PIPE_SHADER_IR_TGSI;
1501 state.prog = tokens;
1502
1503 rctx->query_result_shader = rctx->b.create_compute_state(&rctx->b, &state);
1504 }
1505
1506 static void r600_restore_qbo_state(struct r600_common_context *rctx,
1507 struct r600_qbo_state *st)
1508 {
1509 rctx->b.bind_compute_state(&rctx->b, st->saved_compute);
1510
1511 rctx->b.set_constant_buffer(&rctx->b, PIPE_SHADER_COMPUTE, 0, &st->saved_const0);
1512 pipe_resource_reference(&st->saved_const0.buffer, NULL);
1513
1514 rctx->b.set_shader_buffers(&rctx->b, PIPE_SHADER_COMPUTE, 0, 3, st->saved_ssbo);
1515 for (unsigned i = 0; i < 3; ++i)
1516 pipe_resource_reference(&st->saved_ssbo[i].buffer, NULL);
1517 }
1518
1519 static void r600_query_hw_get_result_resource(struct r600_common_context *rctx,
1520 struct r600_query *rquery,
1521 bool wait,
1522 enum pipe_query_value_type result_type,
1523 int index,
1524 struct pipe_resource *resource,
1525 unsigned offset)
1526 {
1527 struct r600_query_hw *query = (struct r600_query_hw *)rquery;
1528 struct r600_query_buffer *qbuf;
1529 struct r600_query_buffer *qbuf_prev;
1530 struct pipe_resource *tmp_buffer = NULL;
1531 unsigned tmp_buffer_offset = 0;
1532 struct r600_qbo_state saved_state = {};
1533 struct pipe_grid_info grid = {};
1534 struct pipe_constant_buffer constant_buffer = {};
1535 struct pipe_shader_buffer ssbo[3];
1536 struct r600_hw_query_params params;
1537 struct {
1538 uint32_t end_offset;
1539 uint32_t result_stride;
1540 uint32_t result_count;
1541 uint32_t config;
1542 uint32_t fence_offset;
1543 uint32_t pair_stride;
1544 uint32_t pair_count;
1545 } consts;
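	/* consts mirrors the CONST[0]/CONST[1] layout documented above
	 * r600_create_query_result_shader. */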
1546
1547 if (!rctx->query_result_shader) {
1548 r600_create_query_result_shader(rctx);
1549 if (!rctx->query_result_shader)
1550 return;
1551 }
1552
1553 if (query->buffer.previous) {
1554 u_suballocator_alloc(rctx->allocator_zeroed_memory, 16, 16,
1555 &tmp_buffer_offset, &tmp_buffer);
1556 if (!tmp_buffer)
1557 return;
1558 }
1559
1560 rctx->save_qbo_state(&rctx->b, &saved_state);
1561
1562 r600_get_hw_query_params(rctx, query, index >= 0 ? index : 0, &params);
1563 consts.end_offset = params.end_offset - params.start_offset;
1564 consts.fence_offset = params.fence_offset - params.start_offset;
1565 consts.result_stride = query->result_size;
1566 consts.pair_stride = params.pair_stride;
1567 consts.pair_count = params.pair_count;
1568
1569 constant_buffer.buffer_size = sizeof(consts);
1570 constant_buffer.user_buffer = &consts;
1571
1572 ssbo[1].buffer = tmp_buffer;
1573 ssbo[1].buffer_offset = tmp_buffer_offset;
1574 ssbo[1].buffer_size = 16;
1575
1576 ssbo[2] = ssbo[1];
1577
1578 rctx->b.bind_compute_state(&rctx->b, rctx->query_result_shader);
1579
1580 grid.block[0] = 1;
1581 grid.block[1] = 1;
1582 grid.block[2] = 1;
1583 grid.grid[0] = 1;
1584 grid.grid[1] = 1;
1585 grid.grid[2] = 1;
1586
1587 consts.config = 0;
1588 if (index < 0)
1589 consts.config |= 4;
1590 if (query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE ||
1591 query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE)
1592 consts.config |= 8;
1593 else if (query->b.type == PIPE_QUERY_TIMESTAMP ||
1594 query->b.type == PIPE_QUERY_TIME_ELAPSED)
1595 consts.config |= 32;
1596
1597 switch (result_type) {
1598 case PIPE_QUERY_TYPE_U64:
1599 case PIPE_QUERY_TYPE_I64:
1600 consts.config |= 64;
1601 break;
1602 case PIPE_QUERY_TYPE_I32:
1603 consts.config |= 128;
1604 break;
1605 case PIPE_QUERY_TYPE_U32:
1606 break;
1607 }
1608
1609 rctx->flags |= rctx->screen->barrier_flags.cp_to_L2;
1610
1611 for (qbuf = &query->buffer; qbuf; qbuf = qbuf_prev) {
1612 if (query->b.type != PIPE_QUERY_TIMESTAMP) {
1613 qbuf_prev = qbuf->previous;
1614 consts.result_count = qbuf->results_end / query->result_size;
1615 consts.config &= ~3;
1616 if (qbuf != &query->buffer)
1617 consts.config |= 1;
1618 if (qbuf->previous)
1619 consts.config |= 2;
1620 } else {
1621 /* Only read the last timestamp. */
1622 qbuf_prev = NULL;
1623 consts.result_count = 0;
1624 consts.config |= 16;
1625 params.start_offset += qbuf->results_end - query->result_size;
1626 }
1627
1628 rctx->b.set_constant_buffer(&rctx->b, PIPE_SHADER_COMPUTE, 0, &constant_buffer);
1629
1630 ssbo[0].buffer = &qbuf->buf->b.b;
1631 ssbo[0].buffer_offset = params.start_offset;
1632 ssbo[0].buffer_size = qbuf->results_end - params.start_offset;
1633
1634 if (!qbuf->previous) {
1635 ssbo[2].buffer = resource;
1636 ssbo[2].buffer_offset = offset;
1637 ssbo[2].buffer_size = 8;
1638
1639 ((struct r600_resource *)resource)->TC_L2_dirty = true;
1640 }
1641
1642 rctx->b.set_shader_buffers(&rctx->b, PIPE_SHADER_COMPUTE, 0, 3, ssbo);
1643
1644 if (wait && qbuf == &query->buffer) {
1645 uint64_t va;
1646
1647 /* Wait for result availability. Wait only for readiness
1648 * of the last entry, since the fence writes should be
1649 * serialized in the CP.
1650 */
1651 va = qbuf->buf->gpu_address + qbuf->results_end - query->result_size;
1652 va += params.fence_offset;
1653
1654 r600_gfx_wait_fence(rctx, va, 0x80000000, 0x80000000);
1655 }
1656
1657 rctx->b.launch_grid(&rctx->b, &grid);
1658 rctx->flags |= rctx->screen->barrier_flags.compute_to_L2;
1659 }
1660
1661 r600_restore_qbo_state(rctx, &saved_state);
1662 pipe_resource_reference(&tmp_buffer, NULL);
1663 }
1664
1665 static void r600_render_condition(struct pipe_context *ctx,
1666 struct pipe_query *query,
1667 boolean condition,
1668 enum pipe_render_cond_flag mode)
1669 {
1670 struct r600_common_context *rctx = (struct r600_common_context *)ctx;
1671 struct r600_query_hw *rquery = (struct r600_query_hw *)query;
1672 struct r600_query_buffer *qbuf;
1673 struct r600_atom *atom = &rctx->render_cond_atom;
1674
1675 rctx->render_cond = query;
1676 rctx->render_cond_invert = condition;
1677 rctx->render_cond_mode = mode;
1678
1679 /* Compute the size of SET_PREDICATION packets. */
1680 atom->num_dw = 0;
1681 if (query) {
1682 for (qbuf = &rquery->buffer; qbuf; qbuf = qbuf->previous)
1683 atom->num_dw += (qbuf->results_end / rquery->result_size) * 5;
1684 }
1685
1686 rctx->set_atom_dirty(rctx, atom, query != NULL);
1687 }
1688
1689 void r600_suspend_queries(struct r600_common_context *ctx)
1690 {
1691 struct r600_query_hw *query;
1692
1693 LIST_FOR_EACH_ENTRY(query, &ctx->active_queries, list) {
1694 r600_query_hw_emit_stop(ctx, query);
1695 }
1696 assert(ctx->num_cs_dw_queries_suspend == 0);
1697 }
1698
1699 static unsigned r600_queries_num_cs_dw_for_resuming(struct r600_common_context *ctx,
1700 struct list_head *query_list)
1701 {
1702 struct r600_query_hw *query;
1703 unsigned num_dw = 0;
1704
1705 LIST_FOR_EACH_ENTRY(query, query_list, list) {
1706 /* begin + end */
1707 num_dw += query->num_cs_dw_begin + query->num_cs_dw_end;
1708
1709 /* Workaround for the fact that
1710 * num_cs_dw_nontimer_queries_suspend is incremented for every
1711 * resumed query, which raises the bar in need_cs_space for
1712 * queries about to be resumed.
1713 */
1714 num_dw += query->num_cs_dw_end;
1715 }
1716 /* primitives generated query */
1717 num_dw += ctx->streamout.enable_atom.num_dw;
1718 /* guess for ZPASS enable or PERFECT_ZPASS_COUNT enable updates */
1719 num_dw += 13;
1720
1721 return num_dw;
1722 }
1723
1724 void r600_resume_queries(struct r600_common_context *ctx)
1725 {
1726 struct r600_query_hw *query;
1727 unsigned num_cs_dw = r600_queries_num_cs_dw_for_resuming(ctx, &ctx->active_queries);
1728
1729 assert(ctx->num_cs_dw_queries_suspend == 0);
1730
1731 /* Check CS space here. Resuming must not be interrupted by flushes. */
1732 ctx->need_gfx_cs_space(&ctx->b, num_cs_dw, true);
1733
1734 LIST_FOR_EACH_ENTRY(query, &ctx->active_queries, list) {
1735 r600_query_hw_emit_start(ctx, query);
1736 }
1737 }
1738
1739 /* Fix radeon_info::enabled_rb_mask for R600, R700, EVERGREEN, NI. */
1740 void r600_query_fix_enabled_rb_mask(struct r600_common_screen *rscreen)
1741 {
1742 struct r600_common_context *ctx =
1743 (struct r600_common_context*)rscreen->aux_context;
1744 struct radeon_winsys_cs *cs = ctx->gfx.cs;
1745 struct r600_resource *buffer;
1746 uint32_t *results;
1747 unsigned i, mask = 0;
1748 unsigned max_rbs = ctx->screen->info.num_render_backends;
1749
1750 assert(rscreen->chip_class <= CAYMAN);
1751
1752 /* If the kernel reports a valid backend map, derive the RB mask from it directly. */
1753 if (rscreen->info.r600_gb_backend_map_valid) {
1754 unsigned num_tile_pipes = rscreen->info.num_tile_pipes;
1755 unsigned backend_map = rscreen->info.r600_gb_backend_map;
1756 unsigned item_width, item_mask;
1757
1758 if (ctx->chip_class >= EVERGREEN) {
1759 item_width = 4;
1760 item_mask = 0x7;
1761 } else {
1762 item_width = 2;
1763 item_mask = 0x3;
1764 }
1765
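/* backend_map packs, for each tile pipe, the index of the RB serving it
 * (item_width bits per entry); OR those indices into a mask of enabled RBs. */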
1766 while (num_tile_pipes--) {
1767 i = backend_map & item_mask;
1768 mask |= (1<<i);
1769 backend_map >>= item_width;
1770 }
1771 if (mask != 0) {
1772 rscreen->info.enabled_rb_mask = mask;
1773 return;
1774 }
1775 }
1776
1777 /* Otherwise, fall back to detecting the enabled RBs with a ZPASS_DONE event (older kernels). */
1778
1779 /* create buffer for event data */
1780 buffer = (struct r600_resource*)
1781 pipe_buffer_create(ctx->b.screen, 0,
1782 PIPE_USAGE_STAGING, max_rbs * 16);
1783 if (!buffer)
1784 return;
1785
1786 /* initialize buffer with zeroes */
1787 results = r600_buffer_map_sync_with_rings(ctx, buffer, PIPE_TRANSFER_WRITE);
1788 if (results) {
1789 memset(results, 0, max_rbs * 4 * 4);
1790
1791 /* emit EVENT_WRITE for ZPASS_DONE */
1792 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
1793 radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1));
1794 radeon_emit(cs, buffer->gpu_address);
1795 radeon_emit(cs, buffer->gpu_address >> 32);
1796
1797 r600_emit_reloc(ctx, &ctx->gfx, buffer,
1798 RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);
1799
1800 /* analyze results */
1801 results = r600_buffer_map_sync_with_rings(ctx, buffer, PIPE_TRANSFER_READ);
1802 if (results) {
1803 for (i = 0; i < max_rbs; i++) {
1804 /* At least the highest bit will be set if the backend is used. */
1805 if (results[i*4 + 1])
1806 mask |= (1<<i);
1807 }
1808 }
1809 }
1810
1811 r600_resource_reference(&buffer, NULL);
1812
1813 if (mask)
1814 rscreen->info.enabled_rb_mask = mask;
1815 }
1816
1817 #define XFULL(name_, query_type_, type_, result_type_, group_id_) \
1818 { \
1819 .name = name_, \
1820 .query_type = R600_QUERY_##query_type_, \
1821 .type = PIPE_DRIVER_QUERY_TYPE_##type_, \
1822 .result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_##result_type_, \
1823 .group_id = group_id_ \
1824 }
1825
1826 #define X(name_, query_type_, type_, result_type_) \
1827 XFULL(name_, query_type_, type_, result_type_, ~(unsigned)0)
1828
1829 #define XG(group_, name_, query_type_, type_, result_type_) \
1830 XFULL(name_, query_type_, type_, result_type_, R600_QUERY_GROUP_##group_)
1831
1832 static struct pipe_driver_query_info r600_driver_query_list[] = {
1833 X("num-compilations", NUM_COMPILATIONS, UINT64, CUMULATIVE),
1834 X("num-shaders-created", NUM_SHADERS_CREATED, UINT64, CUMULATIVE),
1835 X("num-shader-cache-hits", NUM_SHADER_CACHE_HITS, UINT64, CUMULATIVE),
1836 X("draw-calls", DRAW_CALLS, UINT64, AVERAGE),
1837 X("prim-restart-calls", PRIM_RESTART_CALLS, UINT64, AVERAGE),
1838 X("spill-draw-calls", SPILL_DRAW_CALLS, UINT64, AVERAGE),
1839 X("compute-calls", COMPUTE_CALLS, UINT64, AVERAGE),
1840 X("spill-compute-calls", SPILL_COMPUTE_CALLS, UINT64, AVERAGE),
1841 X("dma-calls", DMA_CALLS, UINT64, AVERAGE),
1842 X("cp-dma-calls", CP_DMA_CALLS, UINT64, AVERAGE),
1843 X("num-vs-flushes", NUM_VS_FLUSHES, UINT64, AVERAGE),
1844 X("num-ps-flushes", NUM_PS_FLUSHES, UINT64, AVERAGE),
1845 X("num-cs-flushes", NUM_CS_FLUSHES, UINT64, AVERAGE),
1846 X("num-CB-cache-flushes", NUM_CB_CACHE_FLUSHES, UINT64, AVERAGE),
1847 X("num-DB-cache-flushes", NUM_DB_CACHE_FLUSHES, UINT64, AVERAGE),
1848 X("num-L2-invalidates", NUM_L2_INVALIDATES, UINT64, AVERAGE),
1849 X("num-L2-writebacks", NUM_L2_WRITEBACKS, UINT64, AVERAGE),
1850 X("num-resident-handles", NUM_RESIDENT_HANDLES, UINT64, AVERAGE),
1851 X("tc-offloaded-slots", TC_OFFLOADED_SLOTS, UINT64, AVERAGE),
1852 X("tc-direct-slots", TC_DIRECT_SLOTS, UINT64, AVERAGE),
1853 X("tc-num-syncs", TC_NUM_SYNCS, UINT64, AVERAGE),
1854 X("CS-thread-busy", CS_THREAD_BUSY, UINT64, AVERAGE),
1855 X("gallium-thread-busy", GALLIUM_THREAD_BUSY, UINT64, AVERAGE),
1856 X("requested-VRAM", REQUESTED_VRAM, BYTES, AVERAGE),
1857 X("requested-GTT", REQUESTED_GTT, BYTES, AVERAGE),
1858 X("mapped-VRAM", MAPPED_VRAM, BYTES, AVERAGE),
1859 X("mapped-GTT", MAPPED_GTT, BYTES, AVERAGE),
1860 X("buffer-wait-time", BUFFER_WAIT_TIME, MICROSECONDS, CUMULATIVE),
1861 X("num-mapped-buffers", NUM_MAPPED_BUFFERS, UINT64, AVERAGE),
1862 X("num-GFX-IBs", NUM_GFX_IBS, UINT64, AVERAGE),
1863 X("num-SDMA-IBs", NUM_SDMA_IBS, UINT64, AVERAGE),
1864 X("num-bytes-moved", NUM_BYTES_MOVED, BYTES, CUMULATIVE),
1865 X("num-evictions", NUM_EVICTIONS, UINT64, CUMULATIVE),
1866 X("VRAM-CPU-page-faults", NUM_VRAM_CPU_PAGE_FAULTS, UINT64, CUMULATIVE),
1867 X("VRAM-usage", VRAM_USAGE, BYTES, AVERAGE),
1868 X("VRAM-vis-usage", VRAM_VIS_USAGE, BYTES, AVERAGE),
1869 X("GTT-usage", GTT_USAGE, BYTES, AVERAGE),
1870 X("back-buffer-ps-draw-ratio", BACK_BUFFER_PS_DRAW_RATIO, UINT64, AVERAGE),
1871
1872 /* GPIN queries are for the benefit of old versions of GPUPerfStudio,
1873 * which use them as a fallback path to detect the GPU type.
1874 *
1875 * Note: The names of these queries are significant for GPUPerfStudio
1876 * (and possibly their order as well). */
1877 XG(GPIN, "GPIN_000", GPIN_ASIC_ID, UINT, AVERAGE),
1878 XG(GPIN, "GPIN_001", GPIN_NUM_SIMD, UINT, AVERAGE),
1879 XG(GPIN, "GPIN_002", GPIN_NUM_RB, UINT, AVERAGE),
1880 XG(GPIN, "GPIN_003", GPIN_NUM_SPI, UINT, AVERAGE),
1881 XG(GPIN, "GPIN_004", GPIN_NUM_SE, UINT, AVERAGE),
1882
1883 X("temperature", GPU_TEMPERATURE, UINT64, AVERAGE),
1884 X("shader-clock", CURRENT_GPU_SCLK, HZ, AVERAGE),
1885 X("memory-clock", CURRENT_GPU_MCLK, HZ, AVERAGE),
1886
1887 /* The following queries must be at the end of the list because their
1888 * availability is adjusted dynamically based on the DRM version. */
1889 X("GPU-load", GPU_LOAD, UINT64, AVERAGE),
1890 X("GPU-shaders-busy", GPU_SHADERS_BUSY, UINT64, AVERAGE),
1891 X("GPU-ta-busy", GPU_TA_BUSY, UINT64, AVERAGE),
1892 X("GPU-gds-busy", GPU_GDS_BUSY, UINT64, AVERAGE),
1893 X("GPU-vgt-busy", GPU_VGT_BUSY, UINT64, AVERAGE),
1894 X("GPU-ia-busy", GPU_IA_BUSY, UINT64, AVERAGE),
1895 X("GPU-sx-busy", GPU_SX_BUSY, UINT64, AVERAGE),
1896 X("GPU-wd-busy", GPU_WD_BUSY, UINT64, AVERAGE),
1897 X("GPU-bci-busy", GPU_BCI_BUSY, UINT64, AVERAGE),
1898 X("GPU-sc-busy", GPU_SC_BUSY, UINT64, AVERAGE),
1899 X("GPU-pa-busy", GPU_PA_BUSY, UINT64, AVERAGE),
1900 X("GPU-db-busy", GPU_DB_BUSY, UINT64, AVERAGE),
1901 X("GPU-cp-busy", GPU_CP_BUSY, UINT64, AVERAGE),
1902 X("GPU-cb-busy", GPU_CB_BUSY, UINT64, AVERAGE),
1903 X("GPU-sdma-busy", GPU_SDMA_BUSY, UINT64, AVERAGE),
1904 X("GPU-pfp-busy", GPU_PFP_BUSY, UINT64, AVERAGE),
1905 X("GPU-meq-busy", GPU_MEQ_BUSY, UINT64, AVERAGE),
1906 X("GPU-me-busy", GPU_ME_BUSY, UINT64, AVERAGE),
1907 X("GPU-surf-sync-busy", GPU_SURF_SYNC_BUSY, UINT64, AVERAGE),
1908 X("GPU-dma-busy", GPU_DMA_BUSY, UINT64, AVERAGE),
1909 X("GPU-scratch-ram-busy", GPU_SCRATCH_RAM_BUSY, UINT64, AVERAGE),
1910 X("GPU-ce-busy", GPU_CE_BUSY, UINT64, AVERAGE),
1911 };
1912
1913 #undef X
1914 #undef XG
1915 #undef XFULL
1916
1917 static unsigned r600_get_num_queries(struct r600_common_screen *rscreen)
1918 {
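/* The tail of r600_driver_query_list is trimmed when the kernel cannot
 * provide those counters (see the comment above the GPU-load entries). */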
1919 if (rscreen->info.drm_major == 2 && rscreen->info.drm_minor >= 42)
1920 return ARRAY_SIZE(r600_driver_query_list);
1921 else if (rscreen->info.drm_major == 3) {
1922 if (rscreen->chip_class >= VI)
1923 return ARRAY_SIZE(r600_driver_query_list);
1924 else
1925 return ARRAY_SIZE(r600_driver_query_list) - 7;
1926 }
1927 else
1928 return ARRAY_SIZE(r600_driver_query_list) - 25;
1929 }
1930
1931 static int r600_get_driver_query_info(struct pipe_screen *screen,
1932 unsigned index,
1933 struct pipe_driver_query_info *info)
1934 {
1935 struct r600_common_screen *rscreen = (struct r600_common_screen*)screen;
1936 unsigned num_queries = r600_get_num_queries(rscreen);
1937
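/* Following the Gallium convention, a NULL info pointer means: only return
 * the total number of queries (driver queries plus perfcounters). */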
1938 if (!info) {
1939 unsigned num_perfcounters =
1940 r600_get_perfcounter_info(rscreen, 0, NULL);
1941
1942 return num_queries + num_perfcounters;
1943 }
1944
1945 if (index >= num_queries)
1946 return r600_get_perfcounter_info(rscreen, index - num_queries, info);
1947
1948 *info = r600_driver_query_list[index];
1949
1950 switch (info->query_type) {
1951 case R600_QUERY_REQUESTED_VRAM:
1952 case R600_QUERY_VRAM_USAGE:
1953 case R600_QUERY_MAPPED_VRAM:
1954 info->max_value.u64 = rscreen->info.vram_size;
1955 break;
1956 case R600_QUERY_REQUESTED_GTT:
1957 case R600_QUERY_GTT_USAGE:
1958 case R600_QUERY_MAPPED_GTT:
1959 info->max_value.u64 = rscreen->info.gart_size;
1960 break;
1961 case R600_QUERY_GPU_TEMPERATURE:
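/* The temperature is reported in degrees Celsius. */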
1962 info->max_value.u64 = 125;
1963 break;
1964 case R600_QUERY_VRAM_VIS_USAGE:
1965 info->max_value.u64 = rscreen->info.vram_vis_size;
1966 break;
1967 }
1968
1969 if (info->group_id != ~(unsigned)0 && rscreen->perfcounters)
1970 info->group_id += rscreen->perfcounters->num_groups;
1971
1972 return 1;
1973 }
1974
1975 /* Note: Unfortunately, GPUPerfStudio hardcodes the order of hardware
1976 * performance counter groups, so be careful when changing this and related
1977 * functions.
1978 */
1979 static int r600_get_driver_query_group_info(struct pipe_screen *screen,
1980 unsigned index,
1981 struct pipe_driver_query_group_info *info)
1982 {
1983 struct r600_common_screen *rscreen = (struct r600_common_screen *)screen;
1984 unsigned num_pc_groups = 0;
1985
1986 if (rscreen->perfcounters)
1987 num_pc_groups = rscreen->perfcounters->num_groups;
1988
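/* Same convention as above: with a NULL info pointer, only return the number
 * of groups (perfcounter groups plus the software query groups). */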
1989 if (!info)
1990 return num_pc_groups + R600_NUM_SW_QUERY_GROUPS;
1991
1992 if (index < num_pc_groups)
1993 return r600_get_perfcounter_group_info(rscreen, index, info);
1994
1995 index -= num_pc_groups;
1996 if (index >= R600_NUM_SW_QUERY_GROUPS)
1997 return 0;
1998
1999 info->name = "GPIN";
2000 info->max_active_queries = 5;
2001 info->num_queries = 5;
2002 return 1;
2003 }
2004
2005 void r600_query_init(struct r600_common_context *rctx)
2006 {
2007 rctx->b.create_query = r600_create_query;
2008 rctx->b.create_batch_query = r600_create_batch_query;
2009 rctx->b.destroy_query = r600_destroy_query;
2010 rctx->b.begin_query = r600_begin_query;
2011 rctx->b.end_query = r600_end_query;
2012 rctx->b.get_query_result = r600_get_query_result;
2013 rctx->b.get_query_result_resource = r600_get_query_result_resource;
2014 rctx->render_cond_atom.emit = r600_emit_query_predication;
2015
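/* Only expose render_condition when there is at least one render backend,
 * likely because the ZPASS-based predication path needs RB counters. */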
2016 if (((struct r600_common_screen*)rctx->b.screen)->info.num_render_backends > 0)
2017 rctx->b.render_condition = r600_render_condition;
2018
2019 LIST_INITHEAD(&rctx->active_queries);
2020 }
2021
2022 void r600_init_screen_query_functions(struct r600_common_screen *rscreen)
2023 {
2024 rscreen->b.get_driver_query_info = r600_get_driver_query_info;
2025 rscreen->b.get_driver_query_group_info = r600_get_driver_query_group_info;
2026 }