radeonsi: separate and disable streamout for now
mesa.git: src/gallium/drivers/radeonsi/r600_hw_context.c
1 /*
2 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Jerome Glisse
25 */
26 #include "r600_hw_context_priv.h"
27 #include "radeonsi_pm4.h"
28 #include "radeonsi_pipe.h"
29 #include "sid.h"
30 #include "util/u_memory.h"
31 #include <errno.h>
32
33 #define GROUP_FORCE_NEW_BLOCK 0
34
35 /* Get the mask of enabled backends */
36 void r600_get_backend_mask(struct r600_context *ctx)
37 {
38 struct radeon_winsys_cs *cs = ctx->cs;
39 struct si_resource *buffer;
40 uint32_t *results;
41 unsigned num_backends = ctx->screen->info.r600_num_backends;
42 unsigned i, mask = 0;
43
44 /* if the backend_map query is supported by the kernel */
45 if (ctx->screen->info.r600_backend_map_valid) {
46 unsigned num_tile_pipes = ctx->screen->info.r600_num_tile_pipes;
47 unsigned backend_map = ctx->screen->info.r600_backend_map;
48 unsigned item_width, item_mask;
49
50 if (ctx->chip_class >= CAYMAN) {
51 item_width = 4;
52 item_mask = 0x7;
53 }
54
55 while(num_tile_pipes--) {
56 i = backend_map & item_mask;
57 mask |= (1<<i);
58 backend_map >>= item_width;
59 }
60 if (mask != 0) {
61 ctx->backend_mask = mask;
62 return;
63 }
64 }
65
66 /* otherwise, use the backup path for older kernels */
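/* The idea: emit a ZPASS_DONE event into a scratch buffer; every enabled
 * DB backend writes a result pair into its slot, while disabled backends
 * leave their slot zeroed, which is what the loop further below tests. */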
67
68 /* create buffer for event data */
69 buffer = si_resource_create_custom(&ctx->screen->screen,
70 PIPE_USAGE_STAGING,
71 ctx->max_db*16);
72 if (!buffer)
73 goto err;
74
75 /* initialize buffer with zeroes */
76 results = ctx->ws->buffer_map(buffer->cs_buf, ctx->cs, PIPE_TRANSFER_WRITE);
77 if (results) {
78 uint64_t va = 0;
79
80 memset(results, 0, ctx->max_db * 4 * 4);
81 ctx->ws->buffer_unmap(buffer->cs_buf);
82
83 /* emit EVENT_WRITE for ZPASS_DONE */
84 va = r600_resource_va(&ctx->screen->screen, (void *)buffer);
85 cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
86 cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1);
87 cs->buf[cs->cdw++] = va;
88 cs->buf[cs->cdw++] = va >> 32;
89
90 cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
91 cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, buffer, RADEON_USAGE_WRITE);
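/* The NOP packet carries the relocation entry for the result buffer so the
 * kernel CS checker knows it is referenced by the EVENT_WRITE above. */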
92
93 /* analyze results */
94 results = ctx->ws->buffer_map(buffer->cs_buf, ctx->cs, PIPE_TRANSFER_READ);
95 if (results) {
96 for(i = 0; i < ctx->max_db; i++) {
97 /* at least the highest bit will be set if the backend is used */
98 if (results[i*4 + 1])
99 mask |= (1<<i);
100 }
101 ctx->ws->buffer_unmap(buffer->cs_buf);
102 }
103 }
104
105 si_resource_reference(&buffer, NULL);
106
107 if (mask != 0) {
108 ctx->backend_mask = mask;
109 return;
110 }
111
112 err:
113 /* fall back to the old method: set the lowest num_backends bits to 1 */
114 ctx->backend_mask = (~((uint32_t)0))>>(32-num_backends);
115 return;
116 }
117
118 /* Make sure the CS has room for at least num_dw more dwords, flushing it if necessary; count_draw_in also reserves space for the dirty state and one draw command. */
119 void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw,
120 boolean count_draw_in)
121 {
122 struct r600_atom *state;
123
124 /* The number of dwords we already used in the CS so far. */
125 num_dw += ctx->cs->cdw;
126
127 if (count_draw_in) {
128 /* The number of dwords all the dirty states would take. */
129 LIST_FOR_EACH_ENTRY(state, &ctx->dirty_states, head) {
130 num_dw += state->num_dw;
131 }
132
133 num_dw += ctx->pm4_dirty_cdwords;
134
135 /* The upper-bound of how much a draw command would take. */
136 num_dw += SI_MAX_DRAW_CS_DWORDS;
137 }
138
139 /* Count in queries_suspend. */
140 num_dw += ctx->num_cs_dw_queries_suspend;
141
142 /* Count in streamout_end at the end of CS. */
143 num_dw += ctx->num_cs_dw_streamout_end;
144
145 /* Count in render_condition(NULL) at the end of CS. */
146 if (ctx->predicate_drawing) {
147 num_dw += 3;
148 }
149
150 /* Count in framebuffer cache flushes at the end of CS. */
151 num_dw += 7; /* one SURFACE_SYNC and one CACHE_FLUSH_AND_INV event */
152
153 /* Save 16 dwords for the fence mechanism. */
154 num_dw += 16;
155
156 /* Flush if there's not enough space. */
157 if (num_dw > RADEON_MAX_CMDBUF_DWORDS) {
158 radeonsi_flush(&ctx->context, NULL, RADEON_FLUSH_ASYNC);
159 }
160 }
161
162 static void r600_flush_framebuffer(struct r600_context *ctx, bool flush_now)
163 {
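/* Accumulate the CB/DB flush bits in the surface_sync atom and either emit
 * it right away (end of CS) or mark it dirty so it goes out with the next
 * draw. */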
164 if (!(ctx->flags & R600_CONTEXT_DST_CACHES_DIRTY))
165 return;
166
167 ctx->atom_surface_sync.flush_flags |=
168 r600_get_cb_flush_flags(ctx) |
169 (ctx->framebuffer.zsbuf ? S_0085F0_DB_ACTION_ENA(1) | S_0085F0_DB_DEST_BASE_ENA(1) : 0);
170
171 if (flush_now) {
172 r600_emit_atom(ctx, &ctx->atom_surface_sync.atom);
173 } else {
174 r600_atom_dirty(ctx, &ctx->atom_surface_sync.atom);
175 }
176
177 ctx->flags &= ~R600_CONTEXT_DST_CACHES_DIRTY;
178 }
179
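/* Flush the current command stream: suspend active queries, flush the
 * framebuffer caches, emit a partial flush (see below), submit the CS,
 * and mark all state dirty so it is re-emitted into the next CS. */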
180 void r600_context_flush(struct r600_context *ctx, unsigned flags)
181 {
182 struct radeon_winsys_cs *cs = ctx->cs;
183 struct r600_block *enable_block = NULL;
184 bool queries_suspended = false;
185
186 #if 0
187 bool streamout_suspended = false;
188 #endif
189
190 if (!cs->cdw)
191 return;
192
193 /* suspend queries */
194 if (ctx->num_cs_dw_queries_suspend) {
195 r600_context_queries_suspend(ctx);
196 queries_suspended = true;
197 }
198
199 #if 0
200 if (ctx->num_cs_dw_streamout_end) {
201 r600_context_streamout_end(ctx);
202 streamout_suspended = true;
203 }
204 #endif
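/* Streamout suspend/resume around the flush is disabled for now, along
 * with the rest of the streamout support (see the commit title). */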
205
206 r600_flush_framebuffer(ctx, true);
207
208 /* partial flush is needed to avoid lockups on some chips with user fences */
209 cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
210 cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4);
211
212 /* always keep the tiling flags */
213 flags |= RADEON_FLUSH_KEEP_TILING_FLAGS;
214
215 /* Flush the CS. */
216 ctx->ws->cs_flush(ctx->cs, flags);
217
218 ctx->pm4_dirty_cdwords = 0;
219 ctx->flags = 0;
220
221 #if 0
222 if (streamout_suspended) {
223 ctx->streamout_start = TRUE;
224 ctx->streamout_append_bitmask = ~0;
225 }
226 #endif
227
228 /* resume queries */
229 if (queries_suspended) {
230 r600_context_queries_resume(ctx);
231 }
232
233 /* set all valid groups as dirty so they get re-emitted on the
234 * next draw command
235 */
236 si_pm4_reset_emitted(ctx);
237 }
238
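/* Ask the CP to write 'value' into fence_bo at dword offset 'offset' once
 * all prior rendering has reached the end of the pipe (EVENT_WRITE_EOP). */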
239 void r600_context_emit_fence(struct r600_context *ctx, struct si_resource *fence_bo, unsigned offset, unsigned value)
240 {
241 struct radeon_winsys_cs *cs = ctx->cs;
242 uint64_t va;
243
244 r600_need_cs_space(ctx, 10, FALSE);
245
246 va = r600_resource_va(&ctx->screen->screen, (void*)fence_bo);
247 va = va + (offset << 2);
248
249 cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
250 cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4);
251 cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
252 cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
253 cs->buf[cs->cdw++] = va & 0xFFFFFFFFUL; /* ADDRESS_LO */
254 /* DATA_SEL | INT_EN | ADDRESS_HI */
255 cs->buf[cs->cdw++] = (1 << 29) | (0 << 24) | ((va >> 32UL) & 0xFF);
256 cs->buf[cs->cdw++] = value; /* DATA_LO */
257 cs->buf[cs->cdw++] = 0; /* DATA_HI */
258 cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
259 cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, fence_bo, RADEON_USAGE_WRITE);
260 }
261
262 static unsigned r600_query_read_result(char *map, unsigned start_index, unsigned end_index,
263 bool test_status_bit)
264 {
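/* Each result is a (start, end) pair of 64-bit counters; when
 * test_status_bit is set, only pairs whose top bits are set (i.e. slots
 * the hardware actually wrote) contribute to the returned difference. */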
265 uint32_t *current_result = (uint32_t*)map;
266 uint64_t start, end;
267
268 start = (uint64_t)current_result[start_index] |
269 (uint64_t)current_result[start_index+1] << 32;
270 end = (uint64_t)current_result[end_index] |
271 (uint64_t)current_result[end_index+1] << 32;
272
273 if (!test_status_bit ||
274 ((start & 0x8000000000000000UL) && (end & 0x8000000000000000UL))) {
275 return end - start;
276 }
277 return 0;
278 }
279
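/* Accumulate all completed result blocks from the query buffer into
 * query->result.  Returns FALSE if the buffer could not be mapped, e.g.
 * when it is still busy and wait is FALSE. */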
280 static boolean r600_query_result(struct r600_context *ctx, struct r600_query *query, boolean wait)
281 {
282 unsigned results_base = query->results_start;
283 char *map;
284
285 map = ctx->ws->buffer_map(query->buffer->cs_buf, ctx->cs,
286 PIPE_TRANSFER_READ |
287 (wait ? 0 : PIPE_TRANSFER_DONTBLOCK));
288 if (!map)
289 return FALSE;
290
291 /* count all results across all data blocks */
292 switch (query->type) {
293 case PIPE_QUERY_OCCLUSION_COUNTER:
294 while (results_base != query->results_end) {
295 query->result.u64 +=
296 r600_query_read_result(map + results_base, 0, 2, true);
297 results_base = (results_base + 16) % query->buffer->b.b.width0;
298 }
299 break;
300 case PIPE_QUERY_OCCLUSION_PREDICATE:
301 while (results_base != query->results_end) {
302 query->result.b = query->result.b ||
303 r600_query_read_result(map + results_base, 0, 2, true) != 0;
304 results_base = (results_base + 16) % query->buffer->b.b.width0;
305 }
306 break;
307 case PIPE_QUERY_TIME_ELAPSED:
308 while (results_base != query->results_end) {
309 query->result.u64 +=
310 r600_query_read_result(map + results_base, 0, 2, false);
311 results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
312 }
313 break;
314 case PIPE_QUERY_PRIMITIVES_EMITTED:
315 /* SAMPLE_STREAMOUTSTATS stores this structure:
316 * {
317 * u64 NumPrimitivesWritten;
318 * u64 PrimitiveStorageNeeded;
319 * }
320 * We only need NumPrimitivesWritten here. */
321 while (results_base != query->results_end) {
322 query->result.u64 +=
323 r600_query_read_result(map + results_base, 2, 6, true);
324 results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
325 }
326 break;
327 case PIPE_QUERY_PRIMITIVES_GENERATED:
328 /* Here we read PrimitiveStorageNeeded. */
329 while (results_base != query->results_end) {
330 query->result.u64 +=
331 r600_query_read_result(map + results_base, 0, 4, true);
332 results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
333 }
334 break;
335 case PIPE_QUERY_SO_STATISTICS:
336 while (results_base != query->results_end) {
337 query->result.so.num_primitives_written +=
338 r600_query_read_result(map + results_base, 2, 6, true);
339 query->result.so.primitives_storage_needed +=
340 r600_query_read_result(map + results_base, 0, 4, true);
341 results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
342 }
343 break;
344 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
345 while (results_base != query->results_end) {
346 query->result.b = query->result.b ||
347 r600_query_read_result(map + results_base, 2, 6, true) !=
348 r600_query_read_result(map + results_base, 0, 4, true);
349 results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
350 }
351 break;
352 default:
353 assert(0);
354 }
355
356 query->results_start = query->results_end;
357 ctx->ws->buffer_unmap(query->buffer->cs_buf);
358 return TRUE;
359 }
360
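/* Start a query: make sure there is CS space for both the begin and end
 * packets, prepare the next result slot in the query buffer, and emit the
 * begin event pointing at that slot. */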
361 void r600_query_begin(struct r600_context *ctx, struct r600_query *query)
362 {
363 struct radeon_winsys_cs *cs = ctx->cs;
364 unsigned new_results_end, i;
365 uint32_t *results;
366 uint64_t va;
367
368 r600_need_cs_space(ctx, query->num_cs_dw * 2, TRUE);
369
370 new_results_end = (query->results_end + query->result_size) % query->buffer->b.b.width0;
371
372 /* collect current results if query buffer is full */
373 if (new_results_end == query->results_start) {
374 r600_query_result(ctx, query, TRUE);
375 }
376
377 switch (query->type) {
378 case PIPE_QUERY_OCCLUSION_COUNTER:
379 case PIPE_QUERY_OCCLUSION_PREDICATE:
380 results = ctx->ws->buffer_map(query->buffer->cs_buf, ctx->cs, PIPE_TRANSFER_WRITE);
381 if (results) {
382 results = (uint32_t*)((char*)results + query->results_end);
383 memset(results, 0, query->result_size);
384
385 /* Set top bits for unused backends */
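/* (r600_query_read_result() then treats those slots as valid zero results
 * instead of results that were never written.) */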
386 for (i = 0; i < ctx->max_db; i++) {
387 if (!(ctx->backend_mask & (1<<i))) {
388 results[(i * 4)+1] = 0x80000000;
389 results[(i * 4)+3] = 0x80000000;
390 }
391 }
392 ctx->ws->buffer_unmap(query->buffer->cs_buf);
393 }
394 break;
395 case PIPE_QUERY_TIME_ELAPSED:
396 break;
397 case PIPE_QUERY_PRIMITIVES_EMITTED:
398 case PIPE_QUERY_PRIMITIVES_GENERATED:
399 case PIPE_QUERY_SO_STATISTICS:
400 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
401 results = ctx->ws->buffer_map(query->buffer->cs_buf, ctx->cs, PIPE_TRANSFER_WRITE);
402 results = (uint32_t*)((char*)results + query->results_end);
403 memset(results, 0, query->result_size);
404 ctx->ws->buffer_unmap(query->buffer->cs_buf);
405 break;
406 default:
407 assert(0);
408 }
409
410 /* emit begin query */
411 va = r600_resource_va(&ctx->screen->screen, (void*)query->buffer);
412 va += query->results_end;
413
414 switch (query->type) {
415 case PIPE_QUERY_OCCLUSION_COUNTER:
416 case PIPE_QUERY_OCCLUSION_PREDICATE:
417 cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
418 cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1);
419 cs->buf[cs->cdw++] = va;
420 cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
421 break;
422 case PIPE_QUERY_PRIMITIVES_EMITTED:
423 case PIPE_QUERY_PRIMITIVES_GENERATED:
424 case PIPE_QUERY_SO_STATISTICS:
425 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
426 cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
427 cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_SAMPLE_STREAMOUTSTATS) | EVENT_INDEX(3);
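/* Note: this path writes the buffer-relative offset here, while the ZPASS
 * path above uses the virtual address of the result slot. */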
428 cs->buf[cs->cdw++] = query->results_end;
429 cs->buf[cs->cdw++] = 0;
430 break;
431 case PIPE_QUERY_TIME_ELAPSED:
432 cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
433 cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
434 cs->buf[cs->cdw++] = va;
435 cs->buf[cs->cdw++] = (3 << 29) | ((va >> 32UL) & 0xFF);
436 cs->buf[cs->cdw++] = 0;
437 cs->buf[cs->cdw++] = 0;
438 break;
439 default:
440 assert(0);
441 }
442 cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
443 cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, query->buffer, RADEON_USAGE_WRITE);
444
445 ctx->num_cs_dw_queries_suspend += query->num_cs_dw;
446 }
447
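/* Emit the end-of-query event so the hardware writes the 'end' values of
 * the current result slot, then advance results_end to the next slot. */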
448 void r600_query_end(struct r600_context *ctx, struct r600_query *query)
449 {
450 struct radeon_winsys_cs *cs = ctx->cs;
451 uint64_t va;
452
453 va = r600_resource_va(&ctx->screen->screen, (void*)query->buffer);
454 /* emit end query */
455 switch (query->type) {
456 case PIPE_QUERY_OCCLUSION_COUNTER:
457 case PIPE_QUERY_OCCLUSION_PREDICATE:
458 va += query->results_end + 8;
459 cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
460 cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1);
461 cs->buf[cs->cdw++] = va;
462 cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
463 break;
464 case PIPE_QUERY_PRIMITIVES_EMITTED:
465 case PIPE_QUERY_PRIMITIVES_GENERATED:
466 case PIPE_QUERY_SO_STATISTICS:
467 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
468 cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
469 cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_SAMPLE_STREAMOUTSTATS) | EVENT_INDEX(3);
470 cs->buf[cs->cdw++] = query->results_end + query->result_size/2;
471 cs->buf[cs->cdw++] = 0;
472 break;
473 case PIPE_QUERY_TIME_ELAPSED:
474 va += query->results_end + query->result_size/2;
475 cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
476 cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
477 cs->buf[cs->cdw++] = va;
478 cs->buf[cs->cdw++] = (3 << 29) | ((va >> 32UL) & 0xFF);
479 cs->buf[cs->cdw++] = 0;
480 cs->buf[cs->cdw++] = 0;
481 break;
482 default:
483 assert(0);
484 }
485 cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
486 cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, query->buffer, RADEON_USAGE_WRITE);
487
488 query->results_end = (query->results_end + query->result_size) % query->buffer->b.b.width0;
489 ctx->num_cs_dw_queries_suspend -= query->num_cs_dw;
490 }
491
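/* Emit SET_PREDICATION packets so that subsequent draws are kept or
 * discarded based on the accumulated query results (render condition);
 * PREDICATION_OP_CLEAR disables predication again. */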
492 void r600_query_predication(struct r600_context *ctx, struct r600_query *query, int operation,
493 int flag_wait)
494 {
495 struct radeon_winsys_cs *cs = ctx->cs;
496 uint64_t va;
497
498 if (operation == PREDICATION_OP_CLEAR) {
499 r600_need_cs_space(ctx, 3, FALSE);
500
501 cs->buf[cs->cdw++] = PKT3(PKT3_SET_PREDICATION, 1, 0);
502 cs->buf[cs->cdw++] = 0;
503 cs->buf[cs->cdw++] = PRED_OP(PREDICATION_OP_CLEAR);
504 } else {
505 unsigned results_base = query->results_start;
506 unsigned count;
507 uint32_t op;
508
509 /* find the number of query data blocks */
510 count = (query->buffer->b.b.width0 + query->results_end - query->results_start) % query->buffer->b.b.width0;
511 count /= query->result_size;
512
513 r600_need_cs_space(ctx, 5 * count, TRUE);
514
515 op = PRED_OP(operation) | PREDICATION_DRAW_VISIBLE |
516 (flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW);
517 va = r600_resource_va(&ctx->screen->screen, (void*)query->buffer);
518
519 /* emit predicate packets for all data blocks */
520 while (results_base != query->results_end) {
521 cs->buf[cs->cdw++] = PKT3(PKT3_SET_PREDICATION, 1, 0);
522 cs->buf[cs->cdw++] = (va + results_base) & 0xFFFFFFFFUL;
523 cs->buf[cs->cdw++] = op | (((va + results_base) >> 32UL) & 0xFF);
524 cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
525 cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, query->buffer,
526 RADEON_USAGE_READ);
527 results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
528
529 /* set CONTINUE bit for all packets except the first */
530 op |= PREDICATION_CONTINUE;
531 }
532 }
533 }
534
535 struct r600_query *r600_context_query_create(struct r600_context *ctx, unsigned query_type)
536 {
537 struct r600_query *query;
538 unsigned buffer_size = 4096;
539
540 query = CALLOC_STRUCT(r600_query);
541 if (query == NULL)
542 return NULL;
543
544 query->type = query_type;
545
546 switch (query_type) {
547 case PIPE_QUERY_OCCLUSION_COUNTER:
548 case PIPE_QUERY_OCCLUSION_PREDICATE:
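/* One 16-byte (begin, end) counter pair per DB. */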
549 query->result_size = 16 * ctx->max_db;
550 query->num_cs_dw = 6;
551 break;
552 case PIPE_QUERY_TIME_ELAPSED:
553 query->result_size = 16;
554 query->num_cs_dw = 8;
555 break;
556 case PIPE_QUERY_PRIMITIVES_EMITTED:
557 case PIPE_QUERY_PRIMITIVES_GENERATED:
558 case PIPE_QUERY_SO_STATISTICS:
559 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
560 /* NumPrimitivesWritten, PrimitiveStorageNeeded. */
561 query->result_size = 32;
562 query->num_cs_dw = 6;
563 break;
564 default:
565 assert(0);
566 FREE(query);
567 return NULL;
568 }
569
570 /* adjust the buffer size to simplify the offset wrapping math */
571 buffer_size -= buffer_size % query->result_size;
572
573 /* Queries are normally read by the CPU after
574 * being written by the GPU, hence staging is probably a good
575 * usage pattern.
576 */
577 query->buffer = si_resource_create_custom(&ctx->screen->screen,
578 PIPE_USAGE_STAGING,
579 buffer_size);
580 if (!query->buffer) {
581 FREE(query);
582 return NULL;
583 }
584 return query;
585 }
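/* Typical lifecycle (sketch): a query created above is bracketed by
 * r600_query_begin()/r600_query_end() around the commands of interest,
 * read back with r600_context_query_result(), and finally released with
 * r600_context_query_destroy(). */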
586
587 void r600_context_query_destroy(struct r600_context *ctx, struct r600_query *query)
588 {
589 si_resource_reference(&query->buffer, NULL);
590 free(query);
591 }
592
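/* Copy the accumulated result into *vresult in the form expected for the
 * query type; TIME_ELAPSED ticks are converted to nanoseconds using the
 * crystal clock frequency reported by the kernel. */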
593 boolean r600_context_query_result(struct r600_context *ctx,
594 struct r600_query *query,
595 boolean wait, void *vresult)
596 {
597 boolean *result_b = (boolean*)vresult;
598 uint64_t *result_u64 = (uint64_t*)vresult;
599 struct pipe_query_data_so_statistics *result_so =
600 (struct pipe_query_data_so_statistics*)vresult;
601
602 if (!r600_query_result(ctx, query, wait))
603 return FALSE;
604
605 switch (query->type) {
606 case PIPE_QUERY_OCCLUSION_COUNTER:
607 case PIPE_QUERY_PRIMITIVES_EMITTED:
608 case PIPE_QUERY_PRIMITIVES_GENERATED:
609 *result_u64 = query->result.u64;
610 break;
611 case PIPE_QUERY_OCCLUSION_PREDICATE:
612 case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
613 *result_b = query->result.b;
614 break;
615 case PIPE_QUERY_TIME_ELAPSED:
616 *result_u64 = (1000000 * query->result.u64) / ctx->screen->info.r600_clock_crystal_freq;
617 break;
618 case PIPE_QUERY_SO_STATISTICS:
619 *result_so = query->result.so;
620 break;
621 default:
622 assert(0);
623 }
624 return TRUE;
625 }
626
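/* End all active queries before a CS flush so their results land in the
 * current CS; r600_context_queries_resume() restarts them in the new one. */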
627 void r600_context_queries_suspend(struct r600_context *ctx)
628 {
629 struct r600_query *query;
630
631 LIST_FOR_EACH_ENTRY(query, &ctx->active_query_list, list) {
632 r600_query_end(ctx, query);
633 }
634 assert(ctx->num_cs_dw_queries_suspend == 0);
635 }
636
637 void r600_context_queries_resume(struct r600_context *ctx)
638 {
639 struct r600_query *query;
640
641 assert(ctx->num_cs_dw_queries_suspend == 0);
642
643 LIST_FOR_EACH_ENTRY(query, &ctx->active_query_list, list) {
644 r600_query_begin(ctx, query);
645 }
646 }
647
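/* Program the VGT DRAW_OPAQUE registers for a draw that takes its vertex
 * count from a streamout buffer's filled size; most of the copy and
 * synchronization machinery below is disabled along with the rest of
 * streamout. */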
648 void r600_context_draw_opaque_count(struct r600_context *ctx, struct r600_so_target *t)
649 {
650 struct radeon_winsys_cs *cs = ctx->cs;
651 r600_need_cs_space(ctx, 14 + 21, TRUE);
652
653 cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
654 cs->buf[cs->cdw++] = (R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET - SI_CONTEXT_REG_OFFSET) >> 2;
655 cs->buf[cs->cdw++] = 0;
656
657 cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
658 cs->buf[cs->cdw++] = (R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE - SI_CONTEXT_REG_OFFSET) >> 2;
659 cs->buf[cs->cdw++] = t->stride >> 2;
660
661 #if 0
662 cs->buf[cs->cdw++] = PKT3(PKT3_COPY_DW, 4, 0);
663 cs->buf[cs->cdw++] = COPY_DW_SRC_IS_MEM | COPY_DW_DST_IS_REG;
664 cs->buf[cs->cdw++] = 0; /* src address lo */
665 cs->buf[cs->cdw++] = 0; /* src address hi */
666 cs->buf[cs->cdw++] = R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2; /* dst register */
667 cs->buf[cs->cdw++] = 0; /* unused */
668 #endif
669
670 cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
671 cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, t->filled_size, RADEON_USAGE_READ);
672
673 #if 0 /* I have not found this useful yet. */
674 cs->buf[cs->cdw++] = PKT3(PKT3_COPY_DW, 4, 0);
675 cs->buf[cs->cdw++] = COPY_DW_SRC_IS_REG | COPY_DW_DST_IS_REG;
676 cs->buf[cs->cdw++] = R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2; /* src register */
677 cs->buf[cs->cdw++] = 0; /* unused */
678 cs->buf[cs->cdw++] = R_0085F4_CP_COHER_SIZE >> 2; /* dst register */
679 cs->buf[cs->cdw++] = 0; /* unused */
680
681 cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONFIG_REG, 1, 0);
682 cs->buf[cs->cdw++] = (R_0085F0_CP_COHER_CNTL - SI_CONFIG_REG_OFFSET) >> 2;
683 cs->buf[cs->cdw++] = S_0085F0_SO0_DEST_BASE_ENA(1) << t->so_index;
684
685 cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONFIG_REG, 1, 0);
686 cs->buf[cs->cdw++] = (R_0085F8_CP_COHER_BASE - SI_CONFIG_REG_OFFSET) >> 2;
687 cs->buf[cs->cdw++] = t->b.buffer_offset >> 2;
688
689 cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
690 cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, (struct si_resource*)t->b.buffer,
691 RADEON_USAGE_WRITE);
692
693 cs->buf[cs->cdw++] = PKT3(PKT3_WAIT_REG_MEM, 5, 0);
694 cs->buf[cs->cdw++] = WAIT_REG_MEM_EQUAL; /* wait until the register is equal to the reference value */
695 cs->buf[cs->cdw++] = R_0085FC_CP_COHER_STATUS >> 2; /* register */
696 cs->buf[cs->cdw++] = 0;
697 cs->buf[cs->cdw++] = 0; /* reference value */
698 cs->buf[cs->cdw++] = 0xffffffff; /* mask */
699 cs->buf[cs->cdw++] = 4; /* poll interval */
700 #endif
701 }