radeonsi: import r600_streamout from drivers/radeon
[mesa.git] / src / gallium / drivers / radeon / r600_query.c
1 /*
2 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3 * Copyright 2014 Marek Olšák <marek.olsak@amd.com>
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
23 */
24
25 #include "r600_query.h"
26 #include "r600_cs.h"
27 #include "util/u_memory.h"
28 #include "util/u_upload_mgr.h"
29 #include "os/os_time.h"
30 #include "tgsi/tgsi_text.h"
31
32 /* TODO: remove this: */
33 void si_update_prims_generated_query_state(struct r600_common_context *rctx,
34 unsigned type, int diff);
35
36 #define R600_MAX_STREAMS 4
37
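/* Describes where the begin/end values and the fence live inside one result
 * slot of a hardware query buffer, and how begin/end pairs repeat; consumed
 * by the compute-shader readback path (r600_query_hw_get_result_resource). */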
38 struct r600_hw_query_params {
39 unsigned start_offset;
40 unsigned end_offset;
41 unsigned fence_offset;
42 unsigned pair_stride;
43 unsigned pair_count;
44 };
45
46 /* Queries without buffer handling or suspend/resume. */
47 struct r600_query_sw {
48 struct r600_query b;
49
50 uint64_t begin_result;
51 uint64_t end_result;
52
53 uint64_t begin_time;
54 uint64_t end_time;
55
56 /* Fence for GPU_FINISHED. */
57 struct pipe_fence_handle *fence;
58 };
59
60 static void r600_query_sw_destroy(struct r600_common_screen *rscreen,
61 struct r600_query *rquery)
62 {
63 struct r600_query_sw *query = (struct r600_query_sw *)rquery;
64
65 rscreen->b.fence_reference(&rscreen->b, &query->fence, NULL);
66 FREE(query);
67 }
68
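/* Map a driver-specific query type to the matching winsys counter. */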
69 static enum radeon_value_id winsys_id_from_type(unsigned type)
70 {
71 switch (type) {
72 case R600_QUERY_REQUESTED_VRAM: return RADEON_REQUESTED_VRAM_MEMORY;
73 case R600_QUERY_REQUESTED_GTT: return RADEON_REQUESTED_GTT_MEMORY;
74 case R600_QUERY_MAPPED_VRAM: return RADEON_MAPPED_VRAM;
75 case R600_QUERY_MAPPED_GTT: return RADEON_MAPPED_GTT;
76 case R600_QUERY_BUFFER_WAIT_TIME: return RADEON_BUFFER_WAIT_TIME_NS;
77 case R600_QUERY_NUM_MAPPED_BUFFERS: return RADEON_NUM_MAPPED_BUFFERS;
78 case R600_QUERY_NUM_GFX_IBS: return RADEON_NUM_GFX_IBS;
79 case R600_QUERY_NUM_SDMA_IBS: return RADEON_NUM_SDMA_IBS;
80 case R600_QUERY_GFX_BO_LIST_SIZE: return RADEON_GFX_BO_LIST_COUNTER;
81 case R600_QUERY_NUM_BYTES_MOVED: return RADEON_NUM_BYTES_MOVED;
82 case R600_QUERY_NUM_EVICTIONS: return RADEON_NUM_EVICTIONS;
83 case R600_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: return RADEON_NUM_VRAM_CPU_PAGE_FAULTS;
84 case R600_QUERY_VRAM_USAGE: return RADEON_VRAM_USAGE;
85 case R600_QUERY_VRAM_VIS_USAGE: return RADEON_VRAM_VIS_USAGE;
86 case R600_QUERY_GTT_USAGE: return RADEON_GTT_USAGE;
87 case R600_QUERY_GPU_TEMPERATURE: return RADEON_GPU_TEMPERATURE;
88 case R600_QUERY_CURRENT_GPU_SCLK: return RADEON_CURRENT_SCLK;
89 case R600_QUERY_CURRENT_GPU_MCLK: return RADEON_CURRENT_MCLK;
90 case R600_QUERY_CS_THREAD_BUSY: return RADEON_CS_THREAD_TIME;
91 default: unreachable("query type does not correspond to winsys id");
92 }
93 }
94
95 static bool r600_query_sw_begin(struct r600_common_context *rctx,
96 struct r600_query *rquery)
97 {
98 struct r600_query_sw *query = (struct r600_query_sw *)rquery;
99 enum radeon_value_id ws_id;
100
101 switch(query->b.type) {
102 case PIPE_QUERY_TIMESTAMP_DISJOINT:
103 case PIPE_QUERY_GPU_FINISHED:
104 break;
105 case R600_QUERY_DRAW_CALLS:
106 query->begin_result = rctx->num_draw_calls;
107 break;
108 case R600_QUERY_DECOMPRESS_CALLS:
109 query->begin_result = rctx->num_decompress_calls;
110 break;
111 case R600_QUERY_MRT_DRAW_CALLS:
112 query->begin_result = rctx->num_mrt_draw_calls;
113 break;
114 case R600_QUERY_PRIM_RESTART_CALLS:
115 query->begin_result = rctx->num_prim_restart_calls;
116 break;
117 case R600_QUERY_SPILL_DRAW_CALLS:
118 query->begin_result = rctx->num_spill_draw_calls;
119 break;
120 case R600_QUERY_COMPUTE_CALLS:
121 query->begin_result = rctx->num_compute_calls;
122 break;
123 case R600_QUERY_SPILL_COMPUTE_CALLS:
124 query->begin_result = rctx->num_spill_compute_calls;
125 break;
126 case R600_QUERY_DMA_CALLS:
127 query->begin_result = rctx->num_dma_calls;
128 break;
129 case R600_QUERY_CP_DMA_CALLS:
130 query->begin_result = rctx->num_cp_dma_calls;
131 break;
132 case R600_QUERY_NUM_VS_FLUSHES:
133 query->begin_result = rctx->num_vs_flushes;
134 break;
135 case R600_QUERY_NUM_PS_FLUSHES:
136 query->begin_result = rctx->num_ps_flushes;
137 break;
138 case R600_QUERY_NUM_CS_FLUSHES:
139 query->begin_result = rctx->num_cs_flushes;
140 break;
141 case R600_QUERY_NUM_CB_CACHE_FLUSHES:
142 query->begin_result = rctx->num_cb_cache_flushes;
143 break;
144 case R600_QUERY_NUM_DB_CACHE_FLUSHES:
145 query->begin_result = rctx->num_db_cache_flushes;
146 break;
147 case R600_QUERY_NUM_L2_INVALIDATES:
148 query->begin_result = rctx->num_L2_invalidates;
149 break;
150 case R600_QUERY_NUM_L2_WRITEBACKS:
151 query->begin_result = rctx->num_L2_writebacks;
152 break;
153 case R600_QUERY_NUM_RESIDENT_HANDLES:
154 query->begin_result = rctx->num_resident_handles;
155 break;
156 case R600_QUERY_TC_OFFLOADED_SLOTS:
157 query->begin_result = rctx->tc ? rctx->tc->num_offloaded_slots : 0;
158 break;
159 case R600_QUERY_TC_DIRECT_SLOTS:
160 query->begin_result = rctx->tc ? rctx->tc->num_direct_slots : 0;
161 break;
162 case R600_QUERY_TC_NUM_SYNCS:
163 query->begin_result = rctx->tc ? rctx->tc->num_syncs : 0;
164 break;
165 case R600_QUERY_REQUESTED_VRAM:
166 case R600_QUERY_REQUESTED_GTT:
167 case R600_QUERY_MAPPED_VRAM:
168 case R600_QUERY_MAPPED_GTT:
169 case R600_QUERY_VRAM_USAGE:
170 case R600_QUERY_VRAM_VIS_USAGE:
171 case R600_QUERY_GTT_USAGE:
172 case R600_QUERY_GPU_TEMPERATURE:
173 case R600_QUERY_CURRENT_GPU_SCLK:
174 case R600_QUERY_CURRENT_GPU_MCLK:
175 case R600_QUERY_BACK_BUFFER_PS_DRAW_RATIO:
176 case R600_QUERY_NUM_MAPPED_BUFFERS:
177 query->begin_result = 0;
178 break;
179 case R600_QUERY_BUFFER_WAIT_TIME:
180 case R600_QUERY_NUM_GFX_IBS:
181 case R600_QUERY_NUM_SDMA_IBS:
182 case R600_QUERY_NUM_BYTES_MOVED:
183 case R600_QUERY_NUM_EVICTIONS:
184 case R600_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: {
185 enum radeon_value_id ws_id = winsys_id_from_type(query->b.type);
186 query->begin_result = rctx->ws->query_value(rctx->ws, ws_id);
187 break;
188 }
189 case R600_QUERY_GFX_BO_LIST_SIZE:
190 ws_id = winsys_id_from_type(query->b.type);
191 query->begin_result = rctx->ws->query_value(rctx->ws, ws_id);
192 query->begin_time = rctx->ws->query_value(rctx->ws,
193 RADEON_NUM_GFX_IBS);
194 break;
195 case R600_QUERY_CS_THREAD_BUSY:
196 ws_id = winsys_id_from_type(query->b.type);
197 query->begin_result = rctx->ws->query_value(rctx->ws, ws_id);
198 query->begin_time = os_time_get_nano();
199 break;
200 case R600_QUERY_GALLIUM_THREAD_BUSY:
201 query->begin_result =
202 rctx->tc ? util_queue_get_thread_time_nano(&rctx->tc->queue, 0) : 0;
203 query->begin_time = os_time_get_nano();
204 break;
205 case R600_QUERY_GPU_LOAD:
206 case R600_QUERY_GPU_SHADERS_BUSY:
207 case R600_QUERY_GPU_TA_BUSY:
208 case R600_QUERY_GPU_GDS_BUSY:
209 case R600_QUERY_GPU_VGT_BUSY:
210 case R600_QUERY_GPU_IA_BUSY:
211 case R600_QUERY_GPU_SX_BUSY:
212 case R600_QUERY_GPU_WD_BUSY:
213 case R600_QUERY_GPU_BCI_BUSY:
214 case R600_QUERY_GPU_SC_BUSY:
215 case R600_QUERY_GPU_PA_BUSY:
216 case R600_QUERY_GPU_DB_BUSY:
217 case R600_QUERY_GPU_CP_BUSY:
218 case R600_QUERY_GPU_CB_BUSY:
219 case R600_QUERY_GPU_SDMA_BUSY:
220 case R600_QUERY_GPU_PFP_BUSY:
221 case R600_QUERY_GPU_MEQ_BUSY:
222 case R600_QUERY_GPU_ME_BUSY:
223 case R600_QUERY_GPU_SURF_SYNC_BUSY:
224 case R600_QUERY_GPU_CP_DMA_BUSY:
225 case R600_QUERY_GPU_SCRATCH_RAM_BUSY:
226 query->begin_result = si_begin_counter(rctx->screen,
227 query->b.type);
228 break;
229 case R600_QUERY_NUM_COMPILATIONS:
230 query->begin_result = p_atomic_read(&rctx->screen->num_compilations);
231 break;
232 case R600_QUERY_NUM_SHADERS_CREATED:
233 query->begin_result = p_atomic_read(&rctx->screen->num_shaders_created);
234 break;
235 case R600_QUERY_NUM_SHADER_CACHE_HITS:
236 query->begin_result =
237 p_atomic_read(&rctx->screen->num_shader_cache_hits);
238 break;
239 case R600_QUERY_GPIN_ASIC_ID:
240 case R600_QUERY_GPIN_NUM_SIMD:
241 case R600_QUERY_GPIN_NUM_RB:
242 case R600_QUERY_GPIN_NUM_SPI:
243 case R600_QUERY_GPIN_NUM_SE:
244 break;
245 default:
246 unreachable("r600_query_sw_begin: bad query type");
247 }
248
249 return true;
250 }
251
252 static bool r600_query_sw_end(struct r600_common_context *rctx,
253 struct r600_query *rquery)
254 {
255 struct r600_query_sw *query = (struct r600_query_sw *)rquery;
256 enum radeon_value_id ws_id;
257
258 switch(query->b.type) {
259 case PIPE_QUERY_TIMESTAMP_DISJOINT:
260 break;
261 case PIPE_QUERY_GPU_FINISHED:
262 rctx->b.flush(&rctx->b, &query->fence, PIPE_FLUSH_DEFERRED);
263 break;
264 case R600_QUERY_DRAW_CALLS:
265 query->end_result = rctx->num_draw_calls;
266 break;
267 case R600_QUERY_DECOMPRESS_CALLS:
268 query->end_result = rctx->num_decompress_calls;
269 break;
270 case R600_QUERY_MRT_DRAW_CALLS:
271 query->end_result = rctx->num_mrt_draw_calls;
272 break;
273 case R600_QUERY_PRIM_RESTART_CALLS:
274 query->end_result = rctx->num_prim_restart_calls;
275 break;
276 case R600_QUERY_SPILL_DRAW_CALLS:
277 query->end_result = rctx->num_spill_draw_calls;
278 break;
279 case R600_QUERY_COMPUTE_CALLS:
280 query->end_result = rctx->num_compute_calls;
281 break;
282 case R600_QUERY_SPILL_COMPUTE_CALLS:
283 query->end_result = rctx->num_spill_compute_calls;
284 break;
285 case R600_QUERY_DMA_CALLS:
286 query->end_result = rctx->num_dma_calls;
287 break;
288 case R600_QUERY_CP_DMA_CALLS:
289 query->end_result = rctx->num_cp_dma_calls;
290 break;
291 case R600_QUERY_NUM_VS_FLUSHES:
292 query->end_result = rctx->num_vs_flushes;
293 break;
294 case R600_QUERY_NUM_PS_FLUSHES:
295 query->end_result = rctx->num_ps_flushes;
296 break;
297 case R600_QUERY_NUM_CS_FLUSHES:
298 query->end_result = rctx->num_cs_flushes;
299 break;
300 case R600_QUERY_NUM_CB_CACHE_FLUSHES:
301 query->end_result = rctx->num_cb_cache_flushes;
302 break;
303 case R600_QUERY_NUM_DB_CACHE_FLUSHES:
304 query->end_result = rctx->num_db_cache_flushes;
305 break;
306 case R600_QUERY_NUM_L2_INVALIDATES:
307 query->end_result = rctx->num_L2_invalidates;
308 break;
309 case R600_QUERY_NUM_L2_WRITEBACKS:
310 query->end_result = rctx->num_L2_writebacks;
311 break;
312 case R600_QUERY_NUM_RESIDENT_HANDLES:
313 query->end_result = rctx->num_resident_handles;
314 break;
315 case R600_QUERY_TC_OFFLOADED_SLOTS:
316 query->end_result = rctx->tc ? rctx->tc->num_offloaded_slots : 0;
317 break;
318 case R600_QUERY_TC_DIRECT_SLOTS:
319 query->end_result = rctx->tc ? rctx->tc->num_direct_slots : 0;
320 break;
321 case R600_QUERY_TC_NUM_SYNCS:
322 query->end_result = rctx->tc ? rctx->tc->num_syncs : 0;
323 break;
324 case R600_QUERY_REQUESTED_VRAM:
325 case R600_QUERY_REQUESTED_GTT:
326 case R600_QUERY_MAPPED_VRAM:
327 case R600_QUERY_MAPPED_GTT:
328 case R600_QUERY_VRAM_USAGE:
329 case R600_QUERY_VRAM_VIS_USAGE:
330 case R600_QUERY_GTT_USAGE:
331 case R600_QUERY_GPU_TEMPERATURE:
332 case R600_QUERY_CURRENT_GPU_SCLK:
333 case R600_QUERY_CURRENT_GPU_MCLK:
334 case R600_QUERY_BUFFER_WAIT_TIME:
335 case R600_QUERY_NUM_MAPPED_BUFFERS:
336 case R600_QUERY_NUM_GFX_IBS:
337 case R600_QUERY_NUM_SDMA_IBS:
338 case R600_QUERY_NUM_BYTES_MOVED:
339 case R600_QUERY_NUM_EVICTIONS:
340 case R600_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: {
341 enum radeon_value_id ws_id = winsys_id_from_type(query->b.type);
342 query->end_result = rctx->ws->query_value(rctx->ws, ws_id);
343 break;
344 }
345 case R600_QUERY_GFX_BO_LIST_SIZE:
346 ws_id = winsys_id_from_type(query->b.type);
347 query->end_result = rctx->ws->query_value(rctx->ws, ws_id);
348 query->end_time = rctx->ws->query_value(rctx->ws,
349 RADEON_NUM_GFX_IBS);
350 break;
351 case R600_QUERY_CS_THREAD_BUSY:
352 ws_id = winsys_id_from_type(query->b.type);
353 query->end_result = rctx->ws->query_value(rctx->ws, ws_id);
354 query->end_time = os_time_get_nano();
355 break;
356 case R600_QUERY_GALLIUM_THREAD_BUSY:
357 query->end_result =
358 rctx->tc ? util_queue_get_thread_time_nano(&rctx->tc->queue, 0) : 0;
359 query->end_time = os_time_get_nano();
360 break;
361 case R600_QUERY_GPU_LOAD:
362 case R600_QUERY_GPU_SHADERS_BUSY:
363 case R600_QUERY_GPU_TA_BUSY:
364 case R600_QUERY_GPU_GDS_BUSY:
365 case R600_QUERY_GPU_VGT_BUSY:
366 case R600_QUERY_GPU_IA_BUSY:
367 case R600_QUERY_GPU_SX_BUSY:
368 case R600_QUERY_GPU_WD_BUSY:
369 case R600_QUERY_GPU_BCI_BUSY:
370 case R600_QUERY_GPU_SC_BUSY:
371 case R600_QUERY_GPU_PA_BUSY:
372 case R600_QUERY_GPU_DB_BUSY:
373 case R600_QUERY_GPU_CP_BUSY:
374 case R600_QUERY_GPU_CB_BUSY:
375 case R600_QUERY_GPU_SDMA_BUSY:
376 case R600_QUERY_GPU_PFP_BUSY:
377 case R600_QUERY_GPU_MEQ_BUSY:
378 case R600_QUERY_GPU_ME_BUSY:
379 case R600_QUERY_GPU_SURF_SYNC_BUSY:
380 case R600_QUERY_GPU_CP_DMA_BUSY:
381 case R600_QUERY_GPU_SCRATCH_RAM_BUSY:
382 query->end_result = si_end_counter(rctx->screen,
383 query->b.type,
384 query->begin_result);
385 query->begin_result = 0;
386 break;
387 case R600_QUERY_NUM_COMPILATIONS:
388 query->end_result = p_atomic_read(&rctx->screen->num_compilations);
389 break;
390 case R600_QUERY_NUM_SHADERS_CREATED:
391 query->end_result = p_atomic_read(&rctx->screen->num_shaders_created);
392 break;
393 case R600_QUERY_BACK_BUFFER_PS_DRAW_RATIO:
394 query->end_result = rctx->last_tex_ps_draw_ratio;
395 break;
396 case R600_QUERY_NUM_SHADER_CACHE_HITS:
397 query->end_result =
398 p_atomic_read(&rctx->screen->num_shader_cache_hits);
399 break;
400 case R600_QUERY_GPIN_ASIC_ID:
401 case R600_QUERY_GPIN_NUM_SIMD:
402 case R600_QUERY_GPIN_NUM_RB:
403 case R600_QUERY_GPIN_NUM_SPI:
404 case R600_QUERY_GPIN_NUM_SE:
405 break;
406 default:
407 unreachable("r600_query_sw_end: bad query type");
408 }
409
410 return true;
411 }
412
413 static bool r600_query_sw_get_result(struct r600_common_context *rctx,
414 struct r600_query *rquery,
415 bool wait,
416 union pipe_query_result *result)
417 {
418 struct r600_query_sw *query = (struct r600_query_sw *)rquery;
419
420 switch (query->b.type) {
421 case PIPE_QUERY_TIMESTAMP_DISJOINT:
422 /* Convert from cycles per millisecond to cycles per second (Hz). */
423 result->timestamp_disjoint.frequency =
424 (uint64_t)rctx->screen->info.clock_crystal_freq * 1000;
425 result->timestamp_disjoint.disjoint = false;
426 return true;
427 case PIPE_QUERY_GPU_FINISHED: {
428 struct pipe_screen *screen = rctx->b.screen;
429 struct pipe_context *ctx = rquery->b.flushed ? NULL : &rctx->b;
430
431 result->b = screen->fence_finish(screen, ctx, query->fence,
432 wait ? PIPE_TIMEOUT_INFINITE : 0);
433 return result->b;
434 }
435
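/* Derived results: GFX_BO_LIST_SIZE is the average BO list size per gfx IB;
 * the *_THREAD_BUSY queries are busy time as a percentage of wall-clock time. */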
436 case R600_QUERY_GFX_BO_LIST_SIZE:
437 result->u64 = (query->end_result - query->begin_result) /
438 (query->end_time - query->begin_time);
439 return true;
440 case R600_QUERY_CS_THREAD_BUSY:
441 case R600_QUERY_GALLIUM_THREAD_BUSY:
442 result->u64 = (query->end_result - query->begin_result) * 100 /
443 (query->end_time - query->begin_time);
444 return true;
445 case R600_QUERY_GPIN_ASIC_ID:
446 result->u32 = 0;
447 return true;
448 case R600_QUERY_GPIN_NUM_SIMD:
449 result->u32 = rctx->screen->info.num_good_compute_units;
450 return true;
451 case R600_QUERY_GPIN_NUM_RB:
452 result->u32 = rctx->screen->info.num_render_backends;
453 return true;
454 case R600_QUERY_GPIN_NUM_SPI:
455 result->u32 = 1; /* all supported chips have one SPI per SE */
456 return true;
457 case R600_QUERY_GPIN_NUM_SE:
458 result->u32 = rctx->screen->info.max_se;
459 return true;
460 }
461
462 result->u64 = query->end_result - query->begin_result;
463
464 switch (query->b.type) {
465 case R600_QUERY_BUFFER_WAIT_TIME:
466 case R600_QUERY_GPU_TEMPERATURE:
467 result->u64 /= 1000;
468 break;
469 case R600_QUERY_CURRENT_GPU_SCLK:
470 case R600_QUERY_CURRENT_GPU_MCLK:
471 result->u64 *= 1000000;
472 break;
473 }
474
475 return true;
476 }
477
478
479 static struct r600_query_ops sw_query_ops = {
480 .destroy = r600_query_sw_destroy,
481 .begin = r600_query_sw_begin,
482 .end = r600_query_sw_end,
483 .get_result = r600_query_sw_get_result,
484 .get_result_resource = NULL
485 };
486
487 static struct pipe_query *r600_query_sw_create(unsigned query_type)
488 {
489 struct r600_query_sw *query;
490
491 query = CALLOC_STRUCT(r600_query_sw);
492 if (!query)
493 return NULL;
494
495 query->b.type = query_type;
496 query->b.ops = &sw_query_ops;
497
498 return (struct pipe_query *)query;
499 }
500
501 void si_query_hw_destroy(struct r600_common_screen *rscreen,
502 struct r600_query *rquery)
503 {
504 struct r600_query_hw *query = (struct r600_query_hw *)rquery;
505 struct r600_query_buffer *prev = query->buffer.previous;
506
507 /* Release all query buffers. */
508 while (prev) {
509 struct r600_query_buffer *qbuf = prev;
510 prev = prev->previous;
511 r600_resource_reference(&qbuf->buf, NULL);
512 FREE(qbuf);
513 }
514
515 r600_resource_reference(&query->buffer.buf, NULL);
516 r600_resource_reference(&query->workaround_buf, NULL);
517 FREE(rquery);
518 }
519
520 static struct r600_resource *r600_new_query_buffer(struct r600_common_screen *rscreen,
521 struct r600_query_hw *query)
522 {
523 unsigned buf_size = MAX2(query->result_size,
524 rscreen->info.min_alloc_size);
525
526 /* Queries are normally read by the CPU after
527 * being written by the GPU, so a staging usage
528 * pattern is probably a good fit.
529 */
530 struct r600_resource *buf = (struct r600_resource*)
531 pipe_buffer_create(&rscreen->b, 0,
532 PIPE_USAGE_STAGING, buf_size);
533 if (!buf)
534 return NULL;
535
536 if (!query->ops->prepare_buffer(rscreen, query, buf)) {
537 r600_resource_reference(&buf, NULL);
538 return NULL;
539 }
540
541 return buf;
542 }
543
544 static bool r600_query_hw_prepare_buffer(struct r600_common_screen *rscreen,
545 struct r600_query_hw *query,
546 struct r600_resource *buffer)
547 {
548 /* Callers ensure that the buffer is currently unused by the GPU. */
549 uint32_t *results = rscreen->ws->buffer_map(buffer->buf, NULL,
550 PIPE_TRANSFER_WRITE |
551 PIPE_TRANSFER_UNSYNCHRONIZED);
552 if (!results)
553 return false;
554
555 memset(results, 0, buffer->b.b.width0);
556
557 if (query->b.type == PIPE_QUERY_OCCLUSION_COUNTER ||
558 query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE ||
559 query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
560 unsigned max_rbs = rscreen->info.num_render_backends;
561 unsigned enabled_rb_mask = rscreen->info.enabled_rb_mask;
562 unsigned num_results;
563 unsigned i, j;
564
565 /* Set top bits for unused backends. */
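/* The top bit of the high dword marks a value as written (bit 63 of the
 * 64-bit count, tested by r600_query_read_result); disabled RBs never
 * write, so pre-mark their begin and end slots. */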
566 num_results = buffer->b.b.width0 / query->result_size;
567 for (j = 0; j < num_results; j++) {
568 for (i = 0; i < max_rbs; i++) {
569 if (!(enabled_rb_mask & (1<<i))) {
570 results[(i * 4)+1] = 0x80000000;
571 results[(i * 4)+3] = 0x80000000;
572 }
573 }
574 results += 4 * max_rbs;
575 }
576 }
577
578 return true;
579 }
580
581 static void r600_query_hw_get_result_resource(struct r600_common_context *rctx,
582 struct r600_query *rquery,
583 bool wait,
584 enum pipe_query_value_type result_type,
585 int index,
586 struct pipe_resource *resource,
587 unsigned offset);
588
589 static struct r600_query_ops query_hw_ops = {
590 .destroy = si_query_hw_destroy,
591 .begin = si_query_hw_begin,
592 .end = si_query_hw_end,
593 .get_result = si_query_hw_get_result,
594 .get_result_resource = r600_query_hw_get_result_resource,
595 };
596
597 static void r600_query_hw_do_emit_start(struct r600_common_context *ctx,
598 struct r600_query_hw *query,
599 struct r600_resource *buffer,
600 uint64_t va);
601 static void r600_query_hw_do_emit_stop(struct r600_common_context *ctx,
602 struct r600_query_hw *query,
603 struct r600_resource *buffer,
604 uint64_t va);
605 static void r600_query_hw_add_result(struct r600_common_screen *rscreen,
606 struct r600_query_hw *, void *buffer,
607 union pipe_query_result *result);
608 static void r600_query_hw_clear_result(struct r600_query_hw *,
609 union pipe_query_result *);
610
611 static struct r600_query_hw_ops query_hw_default_hw_ops = {
612 .prepare_buffer = r600_query_hw_prepare_buffer,
613 .emit_start = r600_query_hw_do_emit_start,
614 .emit_stop = r600_query_hw_do_emit_stop,
615 .clear_result = r600_query_hw_clear_result,
616 .add_result = r600_query_hw_add_result,
617 };
618
619 bool si_query_hw_init(struct r600_common_screen *rscreen,
620 struct r600_query_hw *query)
621 {
622 query->buffer.buf = r600_new_query_buffer(rscreen, query);
623 if (!query->buffer.buf)
624 return false;
625
626 return true;
627 }
628
629 static struct pipe_query *r600_query_hw_create(struct r600_common_screen *rscreen,
630 unsigned query_type,
631 unsigned index)
632 {
633 struct r600_query_hw *query = CALLOC_STRUCT(r600_query_hw);
634 if (!query)
635 return NULL;
636
637 query->b.type = query_type;
638 query->b.ops = &query_hw_ops;
639 query->ops = &query_hw_default_hw_ops;
640
641 switch (query_type) {
642 case PIPE_QUERY_OCCLUSION_COUNTER:
643 case PIPE_QUERY_OCCLUSION_PREDICATE:
644 case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
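/* Each enabled RB writes a begin/end pair of 64-bit ZPASS counts,
 * i.e. 16 bytes per RB. */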
645 query->result_size = 16 * rscreen->info.num_render_backends;
646 query->result_size += 16; /* for the fence + alignment */
647 query->num_cs_dw_begin = 6;
648 query->num_cs_dw_end = 6 + si_gfx_write_fence_dwords(rscreen);
649 break;
650 case PIPE_QUERY_TIME_ELAPSED:
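/* Begin timestamp, end timestamp and fence: 8 bytes each. */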
651 query->result_size = 24;
652 query->num_cs_dw_begin = 8;
653 query->num_cs_dw_end = 8 + si_gfx_write_fence_dwords(rscreen);
654 break;
655 case PIPE_QUERY_TIMESTAMP:
656 query->result_size = 16;
657 query->num_cs_dw_end = 8 + si_gfx_write_fence_dwords(rscreen);
658 query->flags = R600_QUERY_HW_FLAG_NO_START;
659 break;
660 case PIPE_QUERY_PRIMITIVES_EMITTED:
661 case PIPE_QUERY_PRIMITIVES_GENERATED:
662 case PIPE_QUERY_SO_STATISTICS:
663 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
664 /* NumPrimitivesWritten, PrimitiveStorageNeeded. */
665 query->result_size = 32;
666 query->num_cs_dw_begin = 6;
667 query->num_cs_dw_end = 6;
668 query->stream = index;
669 break;
670 case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
671 /* NumPrimitivesWritten, PrimitiveStorageNeeded. */
672 query->result_size = 32 * R600_MAX_STREAMS;
673 query->num_cs_dw_begin = 6 * R600_MAX_STREAMS;
674 query->num_cs_dw_end = 6 * R600_MAX_STREAMS;
675 break;
676 case PIPE_QUERY_PIPELINE_STATISTICS:
677 /* 11 values on GCN. */
678 query->result_size = 11 * 16;
679 query->result_size += 8; /* for the fence + alignment */
680 query->num_cs_dw_begin = 6;
681 query->num_cs_dw_end = 6 + si_gfx_write_fence_dwords(rscreen);
682 break;
683 default:
684 assert(0);
685 FREE(query);
686 return NULL;
687 }
688
689 if (!si_query_hw_init(rscreen, query)) {
690 FREE(query);
691 return NULL;
692 }
693
694 return (struct pipe_query *)query;
695 }
696
697 static void r600_update_occlusion_query_state(struct r600_common_context *rctx,
698 unsigned type, int diff)
699 {
700 if (type == PIPE_QUERY_OCCLUSION_COUNTER ||
701 type == PIPE_QUERY_OCCLUSION_PREDICATE ||
702 type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
703 bool old_enable = rctx->num_occlusion_queries != 0;
704 bool old_perfect_enable =
705 rctx->num_perfect_occlusion_queries != 0;
706 bool enable, perfect_enable;
707
708 rctx->num_occlusion_queries += diff;
709 assert(rctx->num_occlusion_queries >= 0);
710
711 if (type != PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) {
712 rctx->num_perfect_occlusion_queries += diff;
713 assert(rctx->num_perfect_occlusion_queries >= 0);
714 }
715
716 enable = rctx->num_occlusion_queries != 0;
717 perfect_enable = rctx->num_perfect_occlusion_queries != 0;
718
719 if (enable != old_enable || perfect_enable != old_perfect_enable) {
720 rctx->set_occlusion_query_state(&rctx->b, old_enable,
721 old_perfect_enable);
722 }
723 }
724 }
725
726 static unsigned event_type_for_stream(unsigned stream)
727 {
728 switch (stream) {
729 default:
730 case 0: return EVENT_TYPE_SAMPLE_STREAMOUTSTATS;
731 case 1: return EVENT_TYPE_SAMPLE_STREAMOUTSTATS1;
732 case 2: return EVENT_TYPE_SAMPLE_STREAMOUTSTATS2;
733 case 3: return EVENT_TYPE_SAMPLE_STREAMOUTSTATS3;
734 }
735 }
736
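/* Ask the CP to sample the streamout statistics of one stream and write
 * them to memory at va. */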
737 static void emit_sample_streamout(struct radeon_winsys_cs *cs, uint64_t va,
738 unsigned stream)
739 {
740 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
741 radeon_emit(cs, EVENT_TYPE(event_type_for_stream(stream)) | EVENT_INDEX(3));
742 radeon_emit(cs, va);
743 radeon_emit(cs, va >> 32);
744 }
745
746 static void r600_query_hw_do_emit_start(struct r600_common_context *ctx,
747 struct r600_query_hw *query,
748 struct r600_resource *buffer,
749 uint64_t va)
750 {
751 struct radeon_winsys_cs *cs = ctx->gfx.cs;
752
753 switch (query->b.type) {
754 case PIPE_QUERY_OCCLUSION_COUNTER:
755 case PIPE_QUERY_OCCLUSION_PREDICATE:
756 case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
757 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
758 radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1));
759 radeon_emit(cs, va);
760 radeon_emit(cs, va >> 32);
761 break;
762 case PIPE_QUERY_PRIMITIVES_EMITTED:
763 case PIPE_QUERY_PRIMITIVES_GENERATED:
764 case PIPE_QUERY_SO_STATISTICS:
765 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
766 emit_sample_streamout(cs, va, query->stream);
767 break;
768 case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
769 for (unsigned stream = 0; stream < R600_MAX_STREAMS; ++stream)
770 emit_sample_streamout(cs, va + 32 * stream, stream);
771 break;
772 case PIPE_QUERY_TIME_ELAPSED:
773 /* Write the timestamp from the CP without waiting for
774 * outstanding draws (top-of-pipe).
775 */
776 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
777 radeon_emit(cs, COPY_DATA_COUNT_SEL |
778 COPY_DATA_SRC_SEL(COPY_DATA_TIMESTAMP) |
779 COPY_DATA_DST_SEL(COPY_DATA_MEM_ASYNC));
780 radeon_emit(cs, 0);
781 radeon_emit(cs, 0);
782 radeon_emit(cs, va);
783 radeon_emit(cs, va >> 32);
784 break;
785 case PIPE_QUERY_PIPELINE_STATISTICS:
786 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
787 radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
788 radeon_emit(cs, va);
789 radeon_emit(cs, va >> 32);
790 break;
791 default:
792 assert(0);
793 }
794 r600_emit_reloc(ctx, &ctx->gfx, query->buffer.buf, RADEON_USAGE_WRITE,
795 RADEON_PRIO_QUERY);
796 }
797
798 static void r600_query_hw_emit_start(struct r600_common_context *ctx,
799 struct r600_query_hw *query)
800 {
801 uint64_t va;
802
803 if (!query->buffer.buf)
804 return; // previous buffer allocation failure
805
806 r600_update_occlusion_query_state(ctx, query->b.type, 1);
807 si_update_prims_generated_query_state(ctx, query->b.type, 1);
808
809 ctx->need_gfx_cs_space(&ctx->b, query->num_cs_dw_begin + query->num_cs_dw_end,
810 true);
811
812 /* Get a new query buffer if needed. */
813 if (query->buffer.results_end + query->result_size > query->buffer.buf->b.b.width0) {
814 struct r600_query_buffer *qbuf = MALLOC_STRUCT(r600_query_buffer);
815 *qbuf = query->buffer;
816 query->buffer.results_end = 0;
817 query->buffer.previous = qbuf;
818 query->buffer.buf = r600_new_query_buffer(ctx->screen, query);
819 if (!query->buffer.buf)
820 return;
821 }
822
823 /* emit begin query */
824 va = query->buffer.buf->gpu_address + query->buffer.results_end;
825
826 query->ops->emit_start(ctx, query, query->buffer.buf, va);
827
828 ctx->num_cs_dw_queries_suspend += query->num_cs_dw_end;
829 }
830
831 static void r600_query_hw_do_emit_stop(struct r600_common_context *ctx,
832 struct r600_query_hw *query,
833 struct r600_resource *buffer,
834 uint64_t va)
835 {
836 struct radeon_winsys_cs *cs = ctx->gfx.cs;
837 uint64_t fence_va = 0;
838
839 switch (query->b.type) {
840 case PIPE_QUERY_OCCLUSION_COUNTER:
841 case PIPE_QUERY_OCCLUSION_PREDICATE:
842 case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
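/* The end ZPASS counts are written 8 bytes after the begin counts
 * within each RB's 16-byte pair. */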
843 va += 8;
844 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
845 radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1));
846 radeon_emit(cs, va);
847 radeon_emit(cs, va >> 32);
848
849 fence_va = va + ctx->screen->info.num_render_backends * 16 - 8;
850 break;
851 case PIPE_QUERY_PRIMITIVES_EMITTED:
852 case PIPE_QUERY_PRIMITIVES_GENERATED:
853 case PIPE_QUERY_SO_STATISTICS:
854 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
855 va += 16;
856 emit_sample_streamout(cs, va, query->stream);
857 break;
858 case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
859 va += 16;
860 for (unsigned stream = 0; stream < R600_MAX_STREAMS; ++stream)
861 emit_sample_streamout(cs, va + 32 * stream, stream);
862 break;
863 case PIPE_QUERY_TIME_ELAPSED:
864 va += 8;
865 /* fall through */
866 case PIPE_QUERY_TIMESTAMP:
867 si_gfx_write_event_eop(ctx, EVENT_TYPE_BOTTOM_OF_PIPE_TS,
868 0, EOP_DATA_SEL_TIMESTAMP, NULL, va,
869 0, query->b.type);
870 fence_va = va + 8;
871 break;
872 case PIPE_QUERY_PIPELINE_STATISTICS: {
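/* One snapshot of the 11 pipeline-statistics counters is 88 bytes;
 * the end snapshot follows the begin snapshot, and the fence goes
 * after both. */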
873 unsigned sample_size = (query->result_size - 8) / 2;
874
875 va += sample_size;
876 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
877 radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
878 radeon_emit(cs, va);
879 radeon_emit(cs, va >> 32);
880
881 fence_va = va + sample_size;
882 break;
883 }
884 default:
885 assert(0);
886 }
887 r600_emit_reloc(ctx, &ctx->gfx, query->buffer.buf, RADEON_USAGE_WRITE,
888 RADEON_PRIO_QUERY);
889
890 if (fence_va)
891 si_gfx_write_event_eop(ctx, EVENT_TYPE_BOTTOM_OF_PIPE_TS, 0,
892 EOP_DATA_SEL_VALUE_32BIT,
893 query->buffer.buf, fence_va, 0x80000000,
894 query->b.type);
895 }
896
897 static void r600_query_hw_emit_stop(struct r600_common_context *ctx,
898 struct r600_query_hw *query)
899 {
900 uint64_t va;
901
902 if (!query->buffer.buf)
903 return; // previous buffer allocation failure
904
905 /* Queries with a begin already reserved CS space in begin_query; only NO_START queries need to reserve it here. */
906 if (query->flags & R600_QUERY_HW_FLAG_NO_START) {
907 ctx->need_gfx_cs_space(&ctx->b, query->num_cs_dw_end, false);
908 }
909
910 /* emit end query */
911 va = query->buffer.buf->gpu_address + query->buffer.results_end;
912
913 query->ops->emit_stop(ctx, query, query->buffer.buf, va);
914
915 query->buffer.results_end += query->result_size;
916
917 if (!(query->flags & R600_QUERY_HW_FLAG_NO_START))
918 ctx->num_cs_dw_queries_suspend -= query->num_cs_dw_end;
919
920 r600_update_occlusion_query_state(ctx, query->b.type, -1);
921 si_update_prims_generated_query_state(ctx, query->b.type, -1);
922 }
923
924 static void emit_set_predicate(struct r600_common_context *ctx,
925 struct r600_resource *buf, uint64_t va,
926 uint32_t op)
927 {
928 struct radeon_winsys_cs *cs = ctx->gfx.cs;
929
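/* GFX9 carries the predication operation in its own dword (3 payload
 * dwords); older ASICs use 2 dwords and pack the operation together with
 * the high address bits. */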
930 if (ctx->chip_class >= GFX9) {
931 radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 2, 0));
932 radeon_emit(cs, op);
933 radeon_emit(cs, va);
934 radeon_emit(cs, va >> 32);
935 } else {
936 radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 1, 0));
937 radeon_emit(cs, va);
938 radeon_emit(cs, op | ((va >> 32) & 0xFF));
939 }
940 r600_emit_reloc(ctx, &ctx->gfx, buf, RADEON_USAGE_READ,
941 RADEON_PRIO_QUERY);
942 }
943
944 static void r600_emit_query_predication(struct r600_common_context *ctx,
945 struct r600_atom *atom)
946 {
947 struct r600_query_hw *query = (struct r600_query_hw *)ctx->render_cond;
948 struct r600_query_buffer *qbuf;
949 uint32_t op;
950 bool flag_wait, invert;
951
952 if (!query)
953 return;
954
955 invert = ctx->render_cond_invert;
956 flag_wait = ctx->render_cond_mode == PIPE_RENDER_COND_WAIT ||
957 ctx->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT;
958
959 if (query->workaround_buf) {
960 op = PRED_OP(PREDICATION_OP_BOOL64);
961 } else {
962 switch (query->b.type) {
963 case PIPE_QUERY_OCCLUSION_COUNTER:
964 case PIPE_QUERY_OCCLUSION_PREDICATE:
965 case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
966 op = PRED_OP(PREDICATION_OP_ZPASS);
967 break;
968 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
969 case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
970 op = PRED_OP(PREDICATION_OP_PRIMCOUNT);
971 invert = !invert;
972 break;
973 default:
974 assert(0);
975 return;
976 }
977 }
978
979 /* if true then invert, see GL_ARB_conditional_render_inverted */
980 if (invert)
981 op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visible or overflow */
982 else
983 op |= PREDICATION_DRAW_VISIBLE; /* Draw if visible or no overflow */
984
985 /* Use the value written by the compute shader as a workaround. Note that
986 * the wait flag does not apply in this predication mode.
987 *
988 * The shader outputs the result value to L2. Workarounds only affect VI
989 * and later, where the CP reads data from L2, so we don't need an
990 * additional flush.
991 */
992 if (query->workaround_buf) {
993 uint64_t va = query->workaround_buf->gpu_address + query->workaround_offset;
994 emit_set_predicate(ctx, query->workaround_buf, va, op);
995 return;
996 }
997
998 op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW;
999
1000 /* emit predicate packets for all data blocks */
1001 for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
1002 unsigned results_base = 0;
1003 uint64_t va_base = qbuf->buf->gpu_address;
1004
1005 while (results_base < qbuf->results_end) {
1006 uint64_t va = va_base + results_base;
1007
1008 if (query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE) {
1009 for (unsigned stream = 0; stream < R600_MAX_STREAMS; ++stream) {
1010 emit_set_predicate(ctx, qbuf->buf, va + 32 * stream, op);
1011
1012 /* set CONTINUE bit for all packets except the first */
1013 op |= PREDICATION_CONTINUE;
1014 }
1015 } else {
1016 emit_set_predicate(ctx, qbuf->buf, va, op);
1017 op |= PREDICATION_CONTINUE;
1018 }
1019
1020 results_base += query->result_size;
1021 }
1022 }
1023 }
1024
1025 static struct pipe_query *r600_create_query(struct pipe_context *ctx, unsigned query_type, unsigned index)
1026 {
1027 struct r600_common_screen *rscreen =
1028 (struct r600_common_screen *)ctx->screen;
1029
1030 if (query_type == PIPE_QUERY_TIMESTAMP_DISJOINT ||
1031 query_type == PIPE_QUERY_GPU_FINISHED ||
1032 query_type >= PIPE_QUERY_DRIVER_SPECIFIC)
1033 return r600_query_sw_create(query_type);
1034
1035 return r600_query_hw_create(rscreen, query_type, index);
1036 }
1037
1038 static void r600_destroy_query(struct pipe_context *ctx, struct pipe_query *query)
1039 {
1040 struct r600_common_context *rctx = (struct r600_common_context *)ctx;
1041 struct r600_query *rquery = (struct r600_query *)query;
1042
1043 rquery->ops->destroy(rctx->screen, rquery);
1044 }
1045
1046 static boolean r600_begin_query(struct pipe_context *ctx,
1047 struct pipe_query *query)
1048 {
1049 struct r600_common_context *rctx = (struct r600_common_context *)ctx;
1050 struct r600_query *rquery = (struct r600_query *)query;
1051
1052 return rquery->ops->begin(rctx, rquery);
1053 }
1054
1055 void si_query_hw_reset_buffers(struct r600_common_context *rctx,
1056 struct r600_query_hw *query)
1057 {
1058 struct r600_query_buffer *prev = query->buffer.previous;
1059
1060 /* Discard the old query buffers. */
1061 while (prev) {
1062 struct r600_query_buffer *qbuf = prev;
1063 prev = prev->previous;
1064 r600_resource_reference(&qbuf->buf, NULL);
1065 FREE(qbuf);
1066 }
1067
1068 query->buffer.results_end = 0;
1069 query->buffer.previous = NULL;
1070
1071 /* Obtain a new buffer if the current one can't be mapped without a stall. */
1072 if (si_rings_is_buffer_referenced(rctx, query->buffer.buf->buf, RADEON_USAGE_READWRITE) ||
1073 !rctx->ws->buffer_wait(query->buffer.buf->buf, 0, RADEON_USAGE_READWRITE)) {
1074 r600_resource_reference(&query->buffer.buf, NULL);
1075 query->buffer.buf = r600_new_query_buffer(rctx->screen, query);
1076 } else {
1077 if (!query->ops->prepare_buffer(rctx->screen, query, query->buffer.buf))
1078 r600_resource_reference(&query->buffer.buf, NULL);
1079 }
1080 }
1081
1082 bool si_query_hw_begin(struct r600_common_context *rctx,
1083 struct r600_query *rquery)
1084 {
1085 struct r600_query_hw *query = (struct r600_query_hw *)rquery;
1086
1087 if (query->flags & R600_QUERY_HW_FLAG_NO_START) {
1088 assert(0);
1089 return false;
1090 }
1091
1092 if (!(query->flags & R600_QUERY_HW_FLAG_BEGIN_RESUMES))
1093 si_query_hw_reset_buffers(rctx, query);
1094
1095 r600_resource_reference(&query->workaround_buf, NULL);
1096
1097 r600_query_hw_emit_start(rctx, query);
1098 if (!query->buffer.buf)
1099 return false;
1100
1101 LIST_ADDTAIL(&query->list, &rctx->active_queries);
1102 return true;
1103 }
1104
1105 static bool r600_end_query(struct pipe_context *ctx, struct pipe_query *query)
1106 {
1107 struct r600_common_context *rctx = (struct r600_common_context *)ctx;
1108 struct r600_query *rquery = (struct r600_query *)query;
1109
1110 return rquery->ops->end(rctx, rquery);
1111 }
1112
1113 bool si_query_hw_end(struct r600_common_context *rctx,
1114 struct r600_query *rquery)
1115 {
1116 struct r600_query_hw *query = (struct r600_query_hw *)rquery;
1117
1118 if (query->flags & R600_QUERY_HW_FLAG_NO_START)
1119 si_query_hw_reset_buffers(rctx, query);
1120
1121 r600_query_hw_emit_stop(rctx, query);
1122
1123 if (!(query->flags & R600_QUERY_HW_FLAG_NO_START))
1124 LIST_DELINIT(&query->list);
1125
1126 if (!query->buffer.buf)
1127 return false;
1128
1129 return true;
1130 }
1131
1132 static void r600_get_hw_query_params(struct r600_common_context *rctx,
1133 struct r600_query_hw *rquery, int index,
1134 struct r600_hw_query_params *params)
1135 {
1136 unsigned max_rbs = rctx->screen->info.num_render_backends;
1137
1138 params->pair_stride = 0;
1139 params->pair_count = 1;
1140
1141 switch (rquery->b.type) {
1142 case PIPE_QUERY_OCCLUSION_COUNTER:
1143 case PIPE_QUERY_OCCLUSION_PREDICATE:
1144 case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
1145 params->start_offset = 0;
1146 params->end_offset = 8;
1147 params->fence_offset = max_rbs * 16;
1148 params->pair_stride = 16;
1149 params->pair_count = max_rbs;
1150 break;
1151 case PIPE_QUERY_TIME_ELAPSED:
1152 params->start_offset = 0;
1153 params->end_offset = 8;
1154 params->fence_offset = 16;
1155 break;
1156 case PIPE_QUERY_TIMESTAMP:
1157 params->start_offset = 0;
1158 params->end_offset = 0;
1159 params->fence_offset = 8;
1160 break;
1161 case PIPE_QUERY_PRIMITIVES_EMITTED:
1162 params->start_offset = 8;
1163 params->end_offset = 24;
1164 params->fence_offset = params->end_offset + 4;
1165 break;
1166 case PIPE_QUERY_PRIMITIVES_GENERATED:
1167 params->start_offset = 0;
1168 params->end_offset = 16;
1169 params->fence_offset = params->end_offset + 4;
1170 break;
1171 case PIPE_QUERY_SO_STATISTICS:
1172 params->start_offset = 8 - index * 8;
1173 params->end_offset = 24 - index * 8;
1174 params->fence_offset = params->end_offset + 4;
1175 break;
1176 case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
1177 params->pair_count = R600_MAX_STREAMS;
1178 params->pair_stride = 32;
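/* fall through */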
1179 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
1180 params->start_offset = 0;
1181 params->end_offset = 16;
1182
1183 /* We can re-use the high dword of the last 64-bit value as a
1184 * fence: it is initialized as 0, and the high bit is set by
1185 * the write of the streamout stats event.
1186 */
1187 params->fence_offset = rquery->result_size - 4;
1188 break;
1189 case PIPE_QUERY_PIPELINE_STATISTICS:
1190 {
1191 /* Offsets apply to EG+ */
1192 static const unsigned offsets[] = {56, 48, 24, 32, 40, 16, 8, 0, 64, 72, 80};
1193 params->start_offset = offsets[index];
1194 params->end_offset = 88 + offsets[index];
1195 params->fence_offset = 2 * 88;
1196 break;
1197 }
1198 default:
1199 unreachable("r600_get_hw_query_params unsupported");
1200 }
1201 }
1202
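/* Read the 64-bit begin/end values at the given dword indices and return
 * their difference. With test_status_bit, the difference only counts when
 * bit 63 is set in both values, i.e. they were actually written; otherwise
 * 0 is returned. */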
1203 static unsigned r600_query_read_result(void *map, unsigned start_index, unsigned end_index,
1204 bool test_status_bit)
1205 {
1206 uint32_t *current_result = (uint32_t*)map;
1207 uint64_t start, end;
1208
1209 start = (uint64_t)current_result[start_index] |
1210 (uint64_t)current_result[start_index+1] << 32;
1211 end = (uint64_t)current_result[end_index] |
1212 (uint64_t)current_result[end_index+1] << 32;
1213
1214 if (!test_status_bit ||
1215 ((start & 0x8000000000000000UL) && (end & 0x8000000000000000UL))) {
1216 return end - start;
1217 }
1218 return 0;
1219 }
1220
1221 static void r600_query_hw_add_result(struct r600_common_screen *rscreen,
1222 struct r600_query_hw *query,
1223 void *buffer,
1224 union pipe_query_result *result)
1225 {
1226 unsigned max_rbs = rscreen->info.num_render_backends;
1227
1228 switch (query->b.type) {
1229 case PIPE_QUERY_OCCLUSION_COUNTER: {
1230 for (unsigned i = 0; i < max_rbs; ++i) {
1231 unsigned results_base = i * 16;
1232 result->u64 +=
1233 r600_query_read_result(buffer + results_base, 0, 2, true);
1234 }
1235 break;
1236 }
1237 case PIPE_QUERY_OCCLUSION_PREDICATE:
1238 case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: {
1239 for (unsigned i = 0; i < max_rbs; ++i) {
1240 unsigned results_base = i * 16;
1241 result->b = result->b ||
1242 r600_query_read_result(buffer + results_base, 0, 2, true) != 0;
1243 }
1244 break;
1245 }
1246 case PIPE_QUERY_TIME_ELAPSED:
1247 result->u64 += r600_query_read_result(buffer, 0, 2, false);
1248 break;
1249 case PIPE_QUERY_TIMESTAMP:
1250 result->u64 = *(uint64_t*)buffer;
1251 break;
1252 case PIPE_QUERY_PRIMITIVES_EMITTED:
1253 /* SAMPLE_STREAMOUTSTATS stores this structure:
1254 * {
1255 * u64 NumPrimitivesWritten;
1256 * u64 PrimitiveStorageNeeded;
1257 * }
1258 * We only need NumPrimitivesWritten here. */
1259 result->u64 += r600_query_read_result(buffer, 2, 6, true);
1260 break;
1261 case PIPE_QUERY_PRIMITIVES_GENERATED:
1262 /* Here we read PrimitiveStorageNeeded. */
1263 result->u64 += r600_query_read_result(buffer, 0, 4, true);
1264 break;
1265 case PIPE_QUERY_SO_STATISTICS:
1266 result->so_statistics.num_primitives_written +=
1267 r600_query_read_result(buffer, 2, 6, true);
1268 result->so_statistics.primitives_storage_needed +=
1269 r600_query_read_result(buffer, 0, 4, true);
1270 break;
1271 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
1272 result->b = result->b ||
1273 r600_query_read_result(buffer, 2, 6, true) !=
1274 r600_query_read_result(buffer, 0, 4, true);
1275 break;
1276 case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
1277 for (unsigned stream = 0; stream < R600_MAX_STREAMS; ++stream) {
1278 result->b = result->b ||
1279 r600_query_read_result(buffer, 2, 6, true) !=
1280 r600_query_read_result(buffer, 0, 4, true);
1281 buffer = (char *)buffer + 32;
1282 }
1283 break;
1284 case PIPE_QUERY_PIPELINE_STATISTICS:
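/* The begin and end snapshots are 88 bytes (22 dwords) apart, hence the
 * paired dword indices below. */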
1285 result->pipeline_statistics.ps_invocations +=
1286 r600_query_read_result(buffer, 0, 22, false);
1287 result->pipeline_statistics.c_primitives +=
1288 r600_query_read_result(buffer, 2, 24, false);
1289 result->pipeline_statistics.c_invocations +=
1290 r600_query_read_result(buffer, 4, 26, false);
1291 result->pipeline_statistics.vs_invocations +=
1292 r600_query_read_result(buffer, 6, 28, false);
1293 result->pipeline_statistics.gs_invocations +=
1294 r600_query_read_result(buffer, 8, 30, false);
1295 result->pipeline_statistics.gs_primitives +=
1296 r600_query_read_result(buffer, 10, 32, false);
1297 result->pipeline_statistics.ia_primitives +=
1298 r600_query_read_result(buffer, 12, 34, false);
1299 result->pipeline_statistics.ia_vertices +=
1300 r600_query_read_result(buffer, 14, 36, false);
1301 result->pipeline_statistics.hs_invocations +=
1302 r600_query_read_result(buffer, 16, 38, false);
1303 result->pipeline_statistics.ds_invocations +=
1304 r600_query_read_result(buffer, 18, 40, false);
1305 result->pipeline_statistics.cs_invocations +=
1306 r600_query_read_result(buffer, 20, 42, false);
1307 #if 0 /* for testing */
1308 printf("Pipeline stats: IA verts=%llu, IA prims=%llu, VS=%llu, HS=%llu, "
1309 "DS=%llu, GS=%llu, GS prims=%llu, Clipper=%llu, "
1310 "Clipper prims=%llu, PS=%llu, CS=%llu\n",
1311 result->pipeline_statistics.ia_vertices,
1312 result->pipeline_statistics.ia_primitives,
1313 result->pipeline_statistics.vs_invocations,
1314 result->pipeline_statistics.hs_invocations,
1315 result->pipeline_statistics.ds_invocations,
1316 result->pipeline_statistics.gs_invocations,
1317 result->pipeline_statistics.gs_primitives,
1318 result->pipeline_statistics.c_invocations,
1319 result->pipeline_statistics.c_primitives,
1320 result->pipeline_statistics.ps_invocations,
1321 result->pipeline_statistics.cs_invocations);
1322 #endif
1323 break;
1324 default:
1325 assert(0);
1326 }
1327 }
1328
1329 static boolean r600_get_query_result(struct pipe_context *ctx,
1330 struct pipe_query *query, boolean wait,
1331 union pipe_query_result *result)
1332 {
1333 struct r600_common_context *rctx = (struct r600_common_context *)ctx;
1334 struct r600_query *rquery = (struct r600_query *)query;
1335
1336 return rquery->ops->get_result(rctx, rquery, wait, result);
1337 }
1338
1339 static void r600_get_query_result_resource(struct pipe_context *ctx,
1340 struct pipe_query *query,
1341 boolean wait,
1342 enum pipe_query_value_type result_type,
1343 int index,
1344 struct pipe_resource *resource,
1345 unsigned offset)
1346 {
1347 struct r600_common_context *rctx = (struct r600_common_context *)ctx;
1348 struct r600_query *rquery = (struct r600_query *)query;
1349
1350 rquery->ops->get_result_resource(rctx, rquery, wait, result_type, index,
1351 resource, offset);
1352 }
1353
1354 static void r600_query_hw_clear_result(struct r600_query_hw *query,
1355 union pipe_query_result *result)
1356 {
1357 util_query_clear_result(result, query->b.type);
1358 }
1359
1360 bool si_query_hw_get_result(struct r600_common_context *rctx,
1361 struct r600_query *rquery,
1362 bool wait, union pipe_query_result *result)
1363 {
1364 struct r600_common_screen *rscreen = rctx->screen;
1365 struct r600_query_hw *query = (struct r600_query_hw *)rquery;
1366 struct r600_query_buffer *qbuf;
1367
1368 query->ops->clear_result(query, result);
1369
1370 for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
1371 unsigned usage = PIPE_TRANSFER_READ |
1372 (wait ? 0 : PIPE_TRANSFER_DONTBLOCK);
1373 unsigned results_base = 0;
1374 void *map;
1375
1376 if (rquery->b.flushed)
1377 map = rctx->ws->buffer_map(qbuf->buf->buf, NULL, usage);
1378 else
1379 map = si_buffer_map_sync_with_rings(rctx, qbuf->buf, usage);
1380
1381 if (!map)
1382 return false;
1383
1384 while (results_base != qbuf->results_end) {
1385 query->ops->add_result(rscreen, query, map + results_base,
1386 result);
1387 results_base += query->result_size;
1388 }
1389 }
1390
1391 /* Convert the time to expected units. */
1392 if (rquery->type == PIPE_QUERY_TIME_ELAPSED ||
1393 rquery->type == PIPE_QUERY_TIMESTAMP) {
1394 result->u64 = (1000000 * result->u64) / rscreen->info.clock_crystal_freq;
1395 }
1396 return true;
1397 }
1398
1399 /* Create the compute shader that is used to collect the results.
1400 *
1401 * One compute grid with a single thread is launched for every query result
1402 * buffer. The thread (optionally) reads a previous summary buffer, then
1403 * accumulates data from the query result buffer, and writes the result either
1404 * to a summary buffer to be consumed by the next grid invocation or to the
1405 * user-supplied buffer.
1406 *
1407 * Data layout:
1408 *
1409 * CONST
1410 * 0.x = end_offset
1411 * 0.y = result_stride
1412 * 0.z = result_count
1413 * 0.w = bit field:
1414 * 1: read previously accumulated values
1415 * 2: write accumulated values for chaining
1416 * 4: write result available
1417 * 8: convert result to boolean (0/1)
1418 * 16: only read one dword and use that as result
1419 * 32: apply timestamp conversion
1420 * 64: store full 64 bits result
1421 * 128: store signed 32 bits result
1422 * 256: SO_OVERFLOW mode: take the difference of two successive half-pairs
1423 * 1.x = fence_offset
1424 * 1.y = pair_stride
1425 * 1.z = pair_count
1426 *
1427 * BUFFER[0] = query result buffer
1428 * BUFFER[1] = previous summary buffer
1429 * BUFFER[2] = next summary buffer or user-supplied buffer
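*
* For example, reading a TIMESTAMP back as a 64-bit value from a single
* result buffer sets bits 16 | 32 | 64 (see the switch in
* r600_query_hw_get_result_resource); bit 4 is added when only result
* availability (index < 0) is requested.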
1430 */
1431 static void r600_create_query_result_shader(struct r600_common_context *rctx)
1432 {
1433 /* TEMP[0].xy = accumulated result so far
1434 * TEMP[0].z = result not available
1435 *
1436 * TEMP[1].x = current result index
1437 * TEMP[1].y = current pair index
1438 */
1439 static const char text_tmpl[] =
1440 "COMP\n"
1441 "PROPERTY CS_FIXED_BLOCK_WIDTH 1\n"
1442 "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"
1443 "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
1444 "DCL BUFFER[0]\n"
1445 "DCL BUFFER[1]\n"
1446 "DCL BUFFER[2]\n"
1447 "DCL CONST[0][0..1]\n"
1448 "DCL TEMP[0..5]\n"
1449 "IMM[0] UINT32 {0, 31, 2147483647, 4294967295}\n"
1450 "IMM[1] UINT32 {1, 2, 4, 8}\n"
1451 "IMM[2] UINT32 {16, 32, 64, 128}\n"
1452 "IMM[3] UINT32 {1000000, 0, %u, 0}\n" /* for timestamp conversion */
1453 "IMM[4] UINT32 {256, 0, 0, 0}\n"
1454
1455 "AND TEMP[5], CONST[0][0].wwww, IMM[2].xxxx\n"
1456 "UIF TEMP[5]\n"
1457 /* Check result availability. */
1458 "LOAD TEMP[1].x, BUFFER[0], CONST[0][1].xxxx\n"
1459 "ISHR TEMP[0].z, TEMP[1].xxxx, IMM[0].yyyy\n"
1460 "MOV TEMP[1], TEMP[0].zzzz\n"
1461 "NOT TEMP[0].z, TEMP[0].zzzz\n"
1462
1463 /* Load result if available. */
1464 "UIF TEMP[1]\n"
1465 "LOAD TEMP[0].xy, BUFFER[0], IMM[0].xxxx\n"
1466 "ENDIF\n"
1467 "ELSE\n"
1468 /* Load previously accumulated result if requested. */
1469 "MOV TEMP[0], IMM[0].xxxx\n"
1470 "AND TEMP[4], CONST[0][0].wwww, IMM[1].xxxx\n"
1471 "UIF TEMP[4]\n"
1472 "LOAD TEMP[0].xyz, BUFFER[1], IMM[0].xxxx\n"
1473 "ENDIF\n"
1474
1475 "MOV TEMP[1].x, IMM[0].xxxx\n"
1476 "BGNLOOP\n"
1477 /* Break if accumulated result so far is not available. */
1478 "UIF TEMP[0].zzzz\n"
1479 "BRK\n"
1480 "ENDIF\n"
1481
1482 /* Break if result_index >= result_count. */
1483 "USGE TEMP[5], TEMP[1].xxxx, CONST[0][0].zzzz\n"
1484 "UIF TEMP[5]\n"
1485 "BRK\n"
1486 "ENDIF\n"
1487
1488 /* Load fence and check result availability */
1489 "UMAD TEMP[5].x, TEMP[1].xxxx, CONST[0][0].yyyy, CONST[0][1].xxxx\n"
1490 "LOAD TEMP[5].x, BUFFER[0], TEMP[5].xxxx\n"
1491 "ISHR TEMP[0].z, TEMP[5].xxxx, IMM[0].yyyy\n"
1492 "NOT TEMP[0].z, TEMP[0].zzzz\n"
1493 "UIF TEMP[0].zzzz\n"
1494 "BRK\n"
1495 "ENDIF\n"
1496
1497 "MOV TEMP[1].y, IMM[0].xxxx\n"
1498 "BGNLOOP\n"
1499 /* Load start and end. */
1500 "UMUL TEMP[5].x, TEMP[1].xxxx, CONST[0][0].yyyy\n"
1501 "UMAD TEMP[5].x, TEMP[1].yyyy, CONST[0][1].yyyy, TEMP[5].xxxx\n"
1502 "LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n"
1503
1504 "UADD TEMP[5].y, TEMP[5].xxxx, CONST[0][0].xxxx\n"
1505 "LOAD TEMP[3].xy, BUFFER[0], TEMP[5].yyyy\n"
1506
1507 "U64ADD TEMP[4].xy, TEMP[3], -TEMP[2]\n"
1508
1509 "AND TEMP[5].z, CONST[0][0].wwww, IMM[4].xxxx\n"
1510 "UIF TEMP[5].zzzz\n"
1511 /* Load second start/end half-pair and
1512 * take the difference
1513 */
1514 "UADD TEMP[5].xy, TEMP[5], IMM[1].wwww\n"
1515 "LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n"
1516 "LOAD TEMP[3].xy, BUFFER[0], TEMP[5].yyyy\n"
1517
1518 "U64ADD TEMP[3].xy, TEMP[3], -TEMP[2]\n"
1519 "U64ADD TEMP[4].xy, TEMP[4], -TEMP[3]\n"
1520 "ENDIF\n"
1521
1522 "U64ADD TEMP[0].xy, TEMP[0], TEMP[4]\n"
1523
1524 /* Increment pair index */
1525 "UADD TEMP[1].y, TEMP[1].yyyy, IMM[1].xxxx\n"
1526 "USGE TEMP[5], TEMP[1].yyyy, CONST[0][1].zzzz\n"
1527 "UIF TEMP[5]\n"
1528 "BRK\n"
1529 "ENDIF\n"
1530 "ENDLOOP\n"
1531
1532 /* Increment result index */
1533 "UADD TEMP[1].x, TEMP[1].xxxx, IMM[1].xxxx\n"
1534 "ENDLOOP\n"
1535 "ENDIF\n"
1536
1537 "AND TEMP[4], CONST[0][0].wwww, IMM[1].yyyy\n"
1538 "UIF TEMP[4]\n"
1539 /* Store accumulated data for chaining. */
1540 "STORE BUFFER[2].xyz, IMM[0].xxxx, TEMP[0]\n"
1541 "ELSE\n"
1542 "AND TEMP[4], CONST[0][0].wwww, IMM[1].zzzz\n"
1543 "UIF TEMP[4]\n"
1544 /* Store result availability. */
1545 "NOT TEMP[0].z, TEMP[0]\n"
1546 "AND TEMP[0].z, TEMP[0].zzzz, IMM[1].xxxx\n"
1547 "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].zzzz\n"
1548
1549 "AND TEMP[4], CONST[0][0].wwww, IMM[2].zzzz\n"
1550 "UIF TEMP[4]\n"
1551 "STORE BUFFER[2].y, IMM[0].xxxx, IMM[0].xxxx\n"
1552 "ENDIF\n"
1553 "ELSE\n"
1554 /* Store result if it is available. */
1555 "NOT TEMP[4], TEMP[0].zzzz\n"
1556 "UIF TEMP[4]\n"
1557 /* Apply timestamp conversion */
1558 "AND TEMP[4], CONST[0][0].wwww, IMM[2].yyyy\n"
1559 "UIF TEMP[4]\n"
1560 "U64MUL TEMP[0].xy, TEMP[0], IMM[3].xyxy\n"
1561 "U64DIV TEMP[0].xy, TEMP[0], IMM[3].zwzw\n"
1562 "ENDIF\n"
1563
1564 /* Convert to boolean */
1565 "AND TEMP[4], CONST[0][0].wwww, IMM[1].wwww\n"
1566 "UIF TEMP[4]\n"
1567 "U64SNE TEMP[0].x, TEMP[0].xyxy, IMM[4].zwzw\n"
1568 "AND TEMP[0].x, TEMP[0].xxxx, IMM[1].xxxx\n"
1569 "MOV TEMP[0].y, IMM[0].xxxx\n"
1570 "ENDIF\n"
1571
1572 "AND TEMP[4], CONST[0][0].wwww, IMM[2].zzzz\n"
1573 "UIF TEMP[4]\n"
1574 "STORE BUFFER[2].xy, IMM[0].xxxx, TEMP[0].xyxy\n"
1575 "ELSE\n"
1576 /* Clamping */
1577 "UIF TEMP[0].yyyy\n"
1578 "MOV TEMP[0].x, IMM[0].wwww\n"
1579 "ENDIF\n"
1580
1581 "AND TEMP[4], CONST[0][0].wwww, IMM[2].wwww\n"
1582 "UIF TEMP[4]\n"
1583 "UMIN TEMP[0].x, TEMP[0].xxxx, IMM[0].zzzz\n"
1584 "ENDIF\n"
1585
1586 "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].xxxx\n"
1587 "ENDIF\n"
1588 "ENDIF\n"
1589 "ENDIF\n"
1590 "ENDIF\n"
1591
1592 "END\n";
1593
1594 char text[sizeof(text_tmpl) + 32];
1595 struct tgsi_token tokens[1024];
1596 struct pipe_compute_state state = {};
1597
1598 /* Hard code the frequency into the shader so that the backend can
1599 * use the full range of optimizations for divide-by-constant.
1600 */
1601 snprintf(text, sizeof(text), text_tmpl,
1602 rctx->screen->info.clock_crystal_freq);
1603
1604 if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
1605 assert(false);
1606 return;
1607 }
1608
1609 state.ir_type = PIPE_SHADER_IR_TGSI;
1610 state.prog = tokens;
1611
1612 rctx->query_result_shader = rctx->b.create_compute_state(&rctx->b, &state);
1613 }
1614
1615 static void r600_restore_qbo_state(struct r600_common_context *rctx,
1616 struct r600_qbo_state *st)
1617 {
1618 rctx->b.bind_compute_state(&rctx->b, st->saved_compute);
1619
1620 rctx->b.set_constant_buffer(&rctx->b, PIPE_SHADER_COMPUTE, 0, &st->saved_const0);
1621 pipe_resource_reference(&st->saved_const0.buffer, NULL);
1622
1623 rctx->b.set_shader_buffers(&rctx->b, PIPE_SHADER_COMPUTE, 0, 3, st->saved_ssbo);
1624 for (unsigned i = 0; i < 3; ++i)
1625 pipe_resource_reference(&st->saved_ssbo[i].buffer, NULL);
1626 }
1627
1628 static void r600_query_hw_get_result_resource(struct r600_common_context *rctx,
1629 struct r600_query *rquery,
1630 bool wait,
1631 enum pipe_query_value_type result_type,
1632 int index,
1633 struct pipe_resource *resource,
1634 unsigned offset)
1635 {
1636 struct r600_query_hw *query = (struct r600_query_hw *)rquery;
1637 struct r600_query_buffer *qbuf;
1638 struct r600_query_buffer *qbuf_prev;
1639 struct pipe_resource *tmp_buffer = NULL;
1640 unsigned tmp_buffer_offset = 0;
1641 struct r600_qbo_state saved_state = {};
1642 struct pipe_grid_info grid = {};
1643 struct pipe_constant_buffer constant_buffer = {};
1644 struct pipe_shader_buffer ssbo[3];
1645 struct r600_hw_query_params params;
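/* Mirrors the CONST[0][0..1] layout documented above
 * r600_create_query_result_shader: seven consecutive uint32 values. */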
1646 struct {
1647 uint32_t end_offset;
1648 uint32_t result_stride;
1649 uint32_t result_count;
1650 uint32_t config;
1651 uint32_t fence_offset;
1652 uint32_t pair_stride;
1653 uint32_t pair_count;
1654 } consts;
1655
1656 if (!rctx->query_result_shader) {
1657 r600_create_query_result_shader(rctx);
1658 if (!rctx->query_result_shader)
1659 return;
1660 }
1661
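/* When results span several chained buffers, partial sums are carried
 * between grid launches through a small scratch allocation (BUFFER[1]
 * and BUFFER[2] in the shader). */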
1662 if (query->buffer.previous) {
1663 u_suballocator_alloc(rctx->allocator_zeroed_memory, 16, 16,
1664 &tmp_buffer_offset, &tmp_buffer);
1665 if (!tmp_buffer)
1666 return;
1667 }
1668
1669 rctx->save_qbo_state(&rctx->b, &saved_state);
1670
1671 r600_get_hw_query_params(rctx, query, index >= 0 ? index : 0, &params);
1672 consts.end_offset = params.end_offset - params.start_offset;
1673 consts.fence_offset = params.fence_offset - params.start_offset;
1674 consts.result_stride = query->result_size;
1675 consts.pair_stride = params.pair_stride;
1676 consts.pair_count = params.pair_count;
1677
1678 constant_buffer.buffer_size = sizeof(consts);
1679 constant_buffer.user_buffer = &consts;
1680
1681 ssbo[1].buffer = tmp_buffer;
1682 ssbo[1].buffer_offset = tmp_buffer_offset;
1683 ssbo[1].buffer_size = 16;
1684
1685 ssbo[2] = ssbo[1];
1686
1687 rctx->b.bind_compute_state(&rctx->b, rctx->query_result_shader);
1688
1689 grid.block[0] = 1;
1690 grid.block[1] = 1;
1691 grid.block[2] = 1;
1692 grid.grid[0] = 1;
1693 grid.grid[1] = 1;
1694 grid.grid[2] = 1;
1695
1696 consts.config = 0;
1697 if (index < 0)
1698 consts.config |= 4;
1699 if (query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE ||
1700 query->b.type == PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE)
1701 consts.config |= 8;
1702 else if (query->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE ||
1703 query->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE)
1704 consts.config |= 8 | 256;
1705 else if (query->b.type == PIPE_QUERY_TIMESTAMP ||
1706 query->b.type == PIPE_QUERY_TIME_ELAPSED)
1707 consts.config |= 32;
1708
1709 switch (result_type) {
1710 case PIPE_QUERY_TYPE_U64:
1711 case PIPE_QUERY_TYPE_I64:
1712 consts.config |= 64;
1713 break;
1714 case PIPE_QUERY_TYPE_I32:
1715 consts.config |= 128;
1716 break;
1717 case PIPE_QUERY_TYPE_U32:
1718 break;
1719 }
1720
1721 rctx->flags |= rctx->screen->barrier_flags.cp_to_L2;
1722
1723 for (qbuf = &query->buffer; qbuf; qbuf = qbuf_prev) {
1724 if (query->b.type != PIPE_QUERY_TIMESTAMP) {
1725 qbuf_prev = qbuf->previous;
1726 consts.result_count = qbuf->results_end / query->result_size;
1727 consts.config &= ~3;
1728 if (qbuf != &query->buffer)
1729 consts.config |= 1;
1730 if (qbuf->previous)
1731 consts.config |= 2;
1732 } else {
1733 /* Only read the last timestamp. */
1734 qbuf_prev = NULL;
1735 consts.result_count = 0;
1736 consts.config |= 16;
1737 params.start_offset += qbuf->results_end - query->result_size;
1738 }
1739
1740 rctx->b.set_constant_buffer(&rctx->b, PIPE_SHADER_COMPUTE, 0, &constant_buffer);
1741
1742 ssbo[0].buffer = &qbuf->buf->b.b;
1743 ssbo[0].buffer_offset = params.start_offset;
1744 ssbo[0].buffer_size = qbuf->results_end - params.start_offset;
1745
1746 if (!qbuf->previous) {
1747 ssbo[2].buffer = resource;
1748 ssbo[2].buffer_offset = offset;
1749 ssbo[2].buffer_size = 8;
1750
1751 ((struct r600_resource *)resource)->TC_L2_dirty = true;
1752 }
1753
1754 rctx->b.set_shader_buffers(&rctx->b, PIPE_SHADER_COMPUTE, 0, 3, ssbo);
1755
1756 if (wait && qbuf == &query->buffer) {
1757 uint64_t va;
1758
1759 /* Wait for result availability. Wait only for readiness
1760 * of the last entry, since the fence writes should be
1761 * serialized in the CP.
1762 */
1763 va = qbuf->buf->gpu_address + qbuf->results_end - query->result_size;
1764 va += params.fence_offset;
1765
1766 si_gfx_wait_fence(rctx, va, 0x80000000, 0x80000000);
1767 }
1768
1769 rctx->b.launch_grid(&rctx->b, &grid);
1770 rctx->flags |= rctx->screen->barrier_flags.compute_to_L2;
1771 }
1772
1773 r600_restore_qbo_state(rctx, &saved_state);
1774 pipe_resource_reference(&tmp_buffer, NULL);
1775 }
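/* Illustrative only: how a caller might use this hook to resolve a query
 * result straight into a GPU buffer; the function and variable names below
 * are placeholders, not part of this driver.
 */
#if 0
static void example_resolve_to_buffer(struct pipe_context *ctx,
				      struct pipe_query *q,
				      struct pipe_resource *dst)
{
	/* Write the 64-bit result at offset 0 of dst. wait=true makes the GPU
	 * wait on the query fence before running the resolve shader. */
	ctx->get_query_result_resource(ctx, q, true, PIPE_QUERY_TYPE_U64,
				       0, dst, 0);
}
#endif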
1776
1777 static void r600_render_condition(struct pipe_context *ctx,
1778 struct pipe_query *query,
1779 boolean condition,
1780 enum pipe_render_cond_flag mode)
1781 {
1782 struct r600_common_context *rctx = (struct r600_common_context *)ctx;
1783 struct r600_query_hw *rquery = (struct r600_query_hw *)query;
1784 struct r600_atom *atom = &rctx->render_cond_atom;
1785
1786 if (query) {
1787 bool needs_workaround = false;
1788
1789 /* There is a firmware regression in VI which causes successive
1790 * SET_PREDICATION packets to give the wrong answer for
1791 * non-inverted stream overflow predication.
1792 */
1793 if (((rctx->chip_class == VI && rctx->screen->info.pfp_fw_feature < 49) ||
1794 (rctx->chip_class == GFX9 && rctx->screen->info.pfp_fw_feature < 38)) &&
1795 !condition &&
1796 (rquery->b.type == PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE ||
1797 (rquery->b.type == PIPE_QUERY_SO_OVERFLOW_PREDICATE &&
1798 (rquery->buffer.previous ||
1799 rquery->buffer.results_end > rquery->result_size)))) {
1800 needs_workaround = true;
1801 }
1802
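/* The workaround resolves the predicate once, through the compute path
 * above, into an 8-byte zero-initialized suballocation; predication can
 * then reference that single boolean instead of the raw streamout
 * counter pairs.
 */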
1803 if (needs_workaround && !rquery->workaround_buf) {
1804 bool old_force_off = rctx->render_cond_force_off;
1805 rctx->render_cond_force_off = true;
1806
1807 u_suballocator_alloc(
1808 rctx->allocator_zeroed_memory, 8, 8,
1809 &rquery->workaround_offset,
1810 (struct pipe_resource **)&rquery->workaround_buf);
1811
1812 /* Reset to NULL to avoid a redundant SET_PREDICATION
1813 * from launching the compute grid.
1814 */
1815 rctx->render_cond = NULL;
1816
1817 ctx->get_query_result_resource(
1818 ctx, query, true, PIPE_QUERY_TYPE_U64, 0,
1819 &rquery->workaround_buf->b.b, rquery->workaround_offset);
1820
1821 /* Setting this in the render cond atom is too late,
1822 * so set it here. */
1823 rctx->flags |= rctx->screen->barrier_flags.L2_to_cp |
1824 R600_CONTEXT_FLUSH_FOR_RENDER_COND;
1825
1826 rctx->render_cond_force_off = old_force_off;
1827 }
1828 }
1829
1830 rctx->render_cond = query;
1831 rctx->render_cond_invert = condition;
1832 rctx->render_cond_mode = mode;
1833
1834 rctx->set_atom_dirty(rctx, atom, query != NULL);
1835 }
1836
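/* Suspend/resume bracket command-stream flushes: each active query gets a
 * stop event before the current IB is submitted and a matching start event
 * in the next one, so its result stays consistent across IBs.
 */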
1837 void si_suspend_queries(struct r600_common_context *ctx)
1838 {
1839 struct r600_query_hw *query;
1840
1841 LIST_FOR_EACH_ENTRY(query, &ctx->active_queries, list) {
1842 r600_query_hw_emit_stop(ctx, query);
1843 }
1844 assert(ctx->num_cs_dw_queries_suspend == 0);
1845 }
1846
1847 static unsigned r600_queries_num_cs_dw_for_resuming(struct r600_common_context *ctx,
1848 struct list_head *query_list)
1849 {
1850 struct r600_query_hw *query;
1851 unsigned num_dw = 0;
1852
1853 LIST_FOR_EACH_ENTRY(query, query_list, list) {
1854 /* begin + end */
1855 num_dw += query->num_cs_dw_begin + query->num_cs_dw_end;
1856
1857 /* Workaround for the fact that
1858 * num_cs_dw_queries_suspend is incremented for every
1859 * resumed query, which raises the bar in need_cs_space for
1860 * queries about to be resumed.
1861 */
1862 num_dw += query->num_cs_dw_end;
1863 }
1864 /* guess for ZPASS enable or PERFECT_ZPASS_COUNT enable updates */
1865 num_dw += 13;
1866
1867 return num_dw;
1868 }
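/* Hypothetical example: two active queries, each with num_cs_dw_begin ==
 * num_cs_dw_end == 5, budget 2 * (5 + 5 + 5) + 13 = 43 dwords (the end cost
 * is counted twice, per the workaround above).
 */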
1869
1870 void si_resume_queries(struct r600_common_context *ctx)
1871 {
1872 struct r600_query_hw *query;
1873 unsigned num_cs_dw = r600_queries_num_cs_dw_for_resuming(ctx, &ctx->active_queries);
1874
1875 assert(ctx->num_cs_dw_queries_suspend == 0);
1876
1877 /* Check CS space here. Resuming must not be interrupted by flushes. */
1878 ctx->need_gfx_cs_space(&ctx->b, num_cs_dw, true);
1879
1880 LIST_FOR_EACH_ENTRY(query, &ctx->active_queries, list) {
1881 r600_query_hw_emit_start(ctx, query);
1882 }
1883 }
1884
1885 #define XFULL(name_, query_type_, type_, result_type_, group_id_) \
1886 { \
1887 .name = name_, \
1888 .query_type = R600_QUERY_##query_type_, \
1889 .type = PIPE_DRIVER_QUERY_TYPE_##type_, \
1890 .result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_##result_type_, \
1891 .group_id = group_id_ \
1892 }
1893
1894 #define X(name_, query_type_, type_, result_type_) \
1895 XFULL(name_, query_type_, type_, result_type_, ~(unsigned)0)
1896
1897 #define XG(group_, name_, query_type_, type_, result_type_) \
1898 XFULL(name_, query_type_, type_, result_type_, R600_QUERY_GROUP_##group_)
1899
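/* For reference, X("draw-calls", DRAW_CALLS, UINT64, AVERAGE) expands to:
 *
 *   { .name = "draw-calls",
 *     .query_type = R600_QUERY_DRAW_CALLS,
 *     .type = PIPE_DRIVER_QUERY_TYPE_UINT64,
 *     .result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_AVERAGE,
 *     .group_id = ~(unsigned)0 }
 */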
1900 static struct pipe_driver_query_info r600_driver_query_list[] = {
1901 X("num-compilations", NUM_COMPILATIONS, UINT64, CUMULATIVE),
1902 X("num-shaders-created", NUM_SHADERS_CREATED, UINT64, CUMULATIVE),
1903 X("num-shader-cache-hits", NUM_SHADER_CACHE_HITS, UINT64, CUMULATIVE),
1904 X("draw-calls", DRAW_CALLS, UINT64, AVERAGE),
1905 X("decompress-calls", DECOMPRESS_CALLS, UINT64, AVERAGE),
1906 X("MRT-draw-calls", MRT_DRAW_CALLS, UINT64, AVERAGE),
1907 X("prim-restart-calls", PRIM_RESTART_CALLS, UINT64, AVERAGE),
1908 X("spill-draw-calls", SPILL_DRAW_CALLS, UINT64, AVERAGE),
1909 X("compute-calls", COMPUTE_CALLS, UINT64, AVERAGE),
1910 X("spill-compute-calls", SPILL_COMPUTE_CALLS, UINT64, AVERAGE),
1911 X("dma-calls", DMA_CALLS, UINT64, AVERAGE),
1912 X("cp-dma-calls", CP_DMA_CALLS, UINT64, AVERAGE),
1913 X("num-vs-flushes", NUM_VS_FLUSHES, UINT64, AVERAGE),
1914 X("num-ps-flushes", NUM_PS_FLUSHES, UINT64, AVERAGE),
1915 X("num-cs-flushes", NUM_CS_FLUSHES, UINT64, AVERAGE),
1916 X("num-CB-cache-flushes", NUM_CB_CACHE_FLUSHES, UINT64, AVERAGE),
1917 X("num-DB-cache-flushes", NUM_DB_CACHE_FLUSHES, UINT64, AVERAGE),
1918 X("num-L2-invalidates", NUM_L2_INVALIDATES, UINT64, AVERAGE),
1919 X("num-L2-writebacks", NUM_L2_WRITEBACKS, UINT64, AVERAGE),
1920 X("num-resident-handles", NUM_RESIDENT_HANDLES, UINT64, AVERAGE),
1921 X("tc-offloaded-slots", TC_OFFLOADED_SLOTS, UINT64, AVERAGE),
1922 X("tc-direct-slots", TC_DIRECT_SLOTS, UINT64, AVERAGE),
1923 X("tc-num-syncs", TC_NUM_SYNCS, UINT64, AVERAGE),
1924 X("CS-thread-busy", CS_THREAD_BUSY, UINT64, AVERAGE),
1925 X("gallium-thread-busy", GALLIUM_THREAD_BUSY, UINT64, AVERAGE),
1926 X("requested-VRAM", REQUESTED_VRAM, BYTES, AVERAGE),
1927 X("requested-GTT", REQUESTED_GTT, BYTES, AVERAGE),
1928 X("mapped-VRAM", MAPPED_VRAM, BYTES, AVERAGE),
1929 X("mapped-GTT", MAPPED_GTT, BYTES, AVERAGE),
1930 X("buffer-wait-time", BUFFER_WAIT_TIME, MICROSECONDS, CUMULATIVE),
1931 X("num-mapped-buffers", NUM_MAPPED_BUFFERS, UINT64, AVERAGE),
1932 X("num-GFX-IBs", NUM_GFX_IBS, UINT64, AVERAGE),
1933 X("num-SDMA-IBs", NUM_SDMA_IBS, UINT64, AVERAGE),
1934 X("GFX-BO-list-size", GFX_BO_LIST_SIZE, UINT64, AVERAGE),
1935 X("num-bytes-moved", NUM_BYTES_MOVED, BYTES, CUMULATIVE),
1936 X("num-evictions", NUM_EVICTIONS, UINT64, CUMULATIVE),
1937 X("VRAM-CPU-page-faults", NUM_VRAM_CPU_PAGE_FAULTS, UINT64, CUMULATIVE),
1938 X("VRAM-usage", VRAM_USAGE, BYTES, AVERAGE),
1939 X("VRAM-vis-usage", VRAM_VIS_USAGE, BYTES, AVERAGE),
1940 X("GTT-usage", GTT_USAGE, BYTES, AVERAGE),
1941 X("back-buffer-ps-draw-ratio", BACK_BUFFER_PS_DRAW_RATIO, UINT64, AVERAGE),
1942
1943 /* GPIN queries are for the benefit of old versions of GPUPerfStudio,
1944 * which use them as a fallback path to detect the GPU type.
1945 *
1946 * Note: The names of these queries are significant for GPUPerfStudio
1947 * (and possibly their order as well). */
1948 XG(GPIN, "GPIN_000", GPIN_ASIC_ID, UINT, AVERAGE),
1949 XG(GPIN, "GPIN_001", GPIN_NUM_SIMD, UINT, AVERAGE),
1950 XG(GPIN, "GPIN_002", GPIN_NUM_RB, UINT, AVERAGE),
1951 XG(GPIN, "GPIN_003", GPIN_NUM_SPI, UINT, AVERAGE),
1952 XG(GPIN, "GPIN_004", GPIN_NUM_SE, UINT, AVERAGE),
1953
1954 X("temperature", GPU_TEMPERATURE, UINT64, AVERAGE),
1955 X("shader-clock", CURRENT_GPU_SCLK, HZ, AVERAGE),
1956 X("memory-clock", CURRENT_GPU_MCLK, HZ, AVERAGE),
1957
1958 /* The following queries must be at the end of the list because their
1959 * availability is adjusted dynamically based on the DRM version. */
1960 X("GPU-load", GPU_LOAD, UINT64, AVERAGE),
1961 X("GPU-shaders-busy", GPU_SHADERS_BUSY, UINT64, AVERAGE),
1962 X("GPU-ta-busy", GPU_TA_BUSY, UINT64, AVERAGE),
1963 X("GPU-gds-busy", GPU_GDS_BUSY, UINT64, AVERAGE),
1964 X("GPU-vgt-busy", GPU_VGT_BUSY, UINT64, AVERAGE),
1965 X("GPU-ia-busy", GPU_IA_BUSY, UINT64, AVERAGE),
1966 X("GPU-sx-busy", GPU_SX_BUSY, UINT64, AVERAGE),
1967 X("GPU-wd-busy", GPU_WD_BUSY, UINT64, AVERAGE),
1968 X("GPU-bci-busy", GPU_BCI_BUSY, UINT64, AVERAGE),
1969 X("GPU-sc-busy", GPU_SC_BUSY, UINT64, AVERAGE),
1970 X("GPU-pa-busy", GPU_PA_BUSY, UINT64, AVERAGE),
1971 X("GPU-db-busy", GPU_DB_BUSY, UINT64, AVERAGE),
1972 X("GPU-cp-busy", GPU_CP_BUSY, UINT64, AVERAGE),
1973 X("GPU-cb-busy", GPU_CB_BUSY, UINT64, AVERAGE),
1974 X("GPU-sdma-busy", GPU_SDMA_BUSY, UINT64, AVERAGE),
1975 X("GPU-pfp-busy", GPU_PFP_BUSY, UINT64, AVERAGE),
1976 X("GPU-meq-busy", GPU_MEQ_BUSY, UINT64, AVERAGE),
1977 X("GPU-me-busy", GPU_ME_BUSY, UINT64, AVERAGE),
1978 X("GPU-surf-sync-busy", GPU_SURF_SYNC_BUSY, UINT64, AVERAGE),
1979 X("GPU-cp-dma-busy", GPU_CP_DMA_BUSY, UINT64, AVERAGE),
1980 X("GPU-scratch-ram-busy", GPU_SCRATCH_RAM_BUSY, UINT64, AVERAGE),
1981 };
1982
1983 #undef X
1984 #undef XG
1985 #undef XFULL
1986
1987 static unsigned r600_get_num_queries(struct r600_common_screen *rscreen)
1988 {
1989 if (rscreen->info.drm_major == 2 && rscreen->info.drm_minor >= 42)
1990 return ARRAY_SIZE(r600_driver_query_list);
1991 else if (rscreen->info.drm_major == 3) {
1992 if (rscreen->chip_class >= VI)
1993 return ARRAY_SIZE(r600_driver_query_list);
1994 else
1995 return ARRAY_SIZE(r600_driver_query_list) - 7;
1996 }
1997 else
1998 return ARRAY_SIZE(r600_driver_query_list) - 25;
1999 }
2000
2001 static int r600_get_driver_query_info(struct pipe_screen *screen,
2002 unsigned index,
2003 struct pipe_driver_query_info *info)
2004 {
2005 struct r600_common_screen *rscreen = (struct r600_common_screen*)screen;
2006 unsigned num_queries = r600_get_num_queries(rscreen);
2007
2008 if (!info) {
2009 unsigned num_perfcounters =
2010 si_get_perfcounter_info(rscreen, 0, NULL);
2011
2012 return num_queries + num_perfcounters;
2013 }
2014
2015 if (index >= num_queries)
2016 return si_get_perfcounter_info(rscreen, index - num_queries, info);
2017
2018 *info = r600_driver_query_list[index];
2019
2020 switch (info->query_type) {
2021 case R600_QUERY_REQUESTED_VRAM:
2022 case R600_QUERY_VRAM_USAGE:
2023 case R600_QUERY_MAPPED_VRAM:
2024 info->max_value.u64 = rscreen->info.vram_size;
2025 break;
2026 case R600_QUERY_REQUESTED_GTT:
2027 case R600_QUERY_GTT_USAGE:
2028 case R600_QUERY_MAPPED_GTT:
2029 info->max_value.u64 = rscreen->info.gart_size;
2030 break;
2031 case R600_QUERY_GPU_TEMPERATURE:
2032 info->max_value.u64 = 125;
2033 break;
2034 case R600_QUERY_VRAM_VIS_USAGE:
2035 info->max_value.u64 = rscreen->info.vram_vis_size;
2036 break;
2037 }
2038
2039 if (info->group_id != ~(unsigned)0 && rscreen->perfcounters)
2040 info->group_id += rscreen->perfcounters->num_groups;
2041
2042 return 1;
2043 }
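/* Illustrative only: the two-step enumeration pattern callers (such as the
 * HUD) use against this hook; the function name below is a placeholder.
 */
#if 0
static void example_list_driver_queries(struct pipe_screen *screen)
{
	/* A NULL info pointer returns the total number of queries. */
	int n = screen->get_driver_query_info(screen, 0, NULL);

	for (int i = 0; i < n; i++) {
		struct pipe_driver_query_info info;
		if (screen->get_driver_query_info(screen, i, &info))
			printf("%s\n", info.name); /* needs <stdio.h> */
	}
}
#endif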
2044
2045 /* Note: Unfortunately, GPUPerfStudio hardcodes the order of hardware
2046 * performance counter groups, so be careful when changing this and related
2047 * functions.
2048 */
2049 static int r600_get_driver_query_group_info(struct pipe_screen *screen,
2050 unsigned index,
2051 struct pipe_driver_query_group_info *info)
2052 {
2053 struct r600_common_screen *rscreen = (struct r600_common_screen *)screen;
2054 unsigned num_pc_groups = 0;
2055
2056 if (rscreen->perfcounters)
2057 num_pc_groups = rscreen->perfcounters->num_groups;
2058
2059 if (!info)
2060 return num_pc_groups + R600_NUM_SW_QUERY_GROUPS;
2061
2062 if (index < num_pc_groups)
2063 return si_get_perfcounter_group_info(rscreen, index, info);
2064
2065 index -= num_pc_groups;
2066 if (index >= R600_NUM_SW_QUERY_GROUPS)
2067 return 0;
2068
2069 info->name = "GPIN";
2070 info->max_active_queries = 5;
2071 info->num_queries = 5;
2072 return 1;
2073 }
2074
2075 void si_init_query_functions(struct r600_common_context *rctx)
2076 {
2077 rctx->b.create_query = r600_create_query;
2078 rctx->b.create_batch_query = si_create_batch_query;
2079 rctx->b.destroy_query = r600_destroy_query;
2080 rctx->b.begin_query = r600_begin_query;
2081 rctx->b.end_query = r600_end_query;
2082 rctx->b.get_query_result = r600_get_query_result;
2083 rctx->b.get_query_result_resource = r600_get_query_result_resource;
2084 rctx->render_cond_atom.emit = r600_emit_query_predication;
2085
2086 if (((struct r600_common_screen*)rctx->b.screen)->info.num_render_backends > 0)
2087 rctx->b.render_condition = r600_render_condition;
2088
2089 LIST_INITHEAD(&rctx->active_queries);
2090 }
2091
2092 void si_init_screen_query_functions(struct r600_common_screen *rscreen)
2093 {
2094 rscreen->b.get_driver_query_info = r600_get_driver_query_info;
2095 rscreen->b.get_driver_query_group_info = r600_get_driver_query_group_info;
2096 }