gallium: add condition parameter to render_condition
[mesa.git] / src / gallium / drivers / radeonsi / r600_hw_context.c
/*
 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *      Jerome Glisse
 */
#include "r600_hw_context_priv.h"
#include "radeonsi_pm4.h"
#include "radeonsi_pipe.h"
#include "sid.h"
#include "util/u_memory.h"
#include <errno.h>

#define GROUP_FORCE_NEW_BLOCK	0

/* Get backends mask */
void si_get_backend_mask(struct r600_context *ctx)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	struct si_resource *buffer;
	uint32_t *results;
	unsigned num_backends = ctx->screen->info.r600_num_backends;
	unsigned i, mask = 0;

	/* if the backend_map query is supported by the kernel */
	if (ctx->screen->info.r600_backend_map_valid) {
		unsigned num_tile_pipes = ctx->screen->info.r600_num_tile_pipes;
		unsigned backend_map = ctx->screen->info.r600_backend_map;
		unsigned item_width = 4, item_mask = 0x7;

		while (num_tile_pipes--) {
			i = backend_map & item_mask;
			mask |= (1<<i);
			backend_map >>= item_width;
		}
		if (mask != 0) {
			ctx->backend_mask = mask;
			return;
		}
	}

	/* otherwise, use the fallback path for older kernels */

	/* create buffer for event data */
	buffer = si_resource_create_custom(&ctx->screen->screen,
					   PIPE_USAGE_STAGING,
					   ctx->max_db*16);
	if (!buffer)
		goto err;

	/* initialize buffer with zeroes */
	results = ctx->ws->buffer_map(buffer->cs_buf, ctx->cs, PIPE_TRANSFER_WRITE);
	if (results) {
		uint64_t va = 0;

		memset(results, 0, ctx->max_db * 4 * 4);
		ctx->ws->buffer_unmap(buffer->cs_buf);

		/* emit EVENT_WRITE for ZPASS_DONE */
		va = r600_resource_va(&ctx->screen->screen, (void *)buffer);
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1);
		cs->buf[cs->cdw++] = va;
		cs->buf[cs->cdw++] = va >> 32;

		cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
		cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, buffer, RADEON_USAGE_WRITE);

		/* analyze results */
		results = ctx->ws->buffer_map(buffer->cs_buf, ctx->cs, PIPE_TRANSFER_READ);
		if (results) {
			for (i = 0; i < ctx->max_db; i++) {
				/* at least the highest bit will be set if the backend is used */
				if (results[i*4 + 1])
					mask |= (1<<i);
			}
			ctx->ws->buffer_unmap(buffer->cs_buf);
		}
	}

	si_resource_reference(&buffer, NULL);

	if (mask != 0) {
		ctx->backend_mask = mask;
		return;
	}

err:
	/* fall back to the old method - set the num_backends lower bits to 1 */
	ctx->backend_mask = (~((uint32_t)0))>>(32-num_backends);
	return;
}

/* Make sure the CS has room for num_dw more dwords; flush it first if it does not. */
void si_need_cs_space(struct r600_context *ctx, unsigned num_dw,
		      boolean count_draw_in)
{
	/* The number of dwords we already used in the CS so far. */
	num_dw += ctx->cs->cdw;

	if (count_draw_in) {
		/* The number of dwords all the dirty states would take. */
		num_dw += ctx->pm4_dirty_cdwords;

		/* The upper bound of how much a draw command would take. */
		num_dw += SI_MAX_DRAW_CS_DWORDS;
	}

	/* Count in queries_suspend. */
	num_dw += ctx->num_cs_dw_queries_suspend;

	/* Count in streamout_end at the end of CS. */
	num_dw += ctx->num_cs_dw_streamout_end;

	/* Count in render_condition(NULL) at the end of CS. */
	if (ctx->predicate_drawing) {
		num_dw += 3;
	}

	/* Count in framebuffer cache flushes at the end of CS. */
	num_dw += 7; /* one SURFACE_SYNC and CACHE_FLUSH_AND_INV (r6xx-only) */

	/* Save 16 dwords for the fence mechanism. */
	num_dw += 16;

#if R600_TRACE_CS
	if (ctx->screen->trace_bo) {
		num_dw += R600_TRACE_CS_DWORDS;
	}
#endif

	/* Flush if there's not enough space. */
	if (num_dw > RADEON_MAX_CMDBUF_DWORDS) {
		radeonsi_flush(&ctx->context, NULL, RADEON_FLUSH_ASYNC);
	}
}

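/* Emit a SURFACE_SYNC flushing the CB and DB destination caches, but only if
 * the framebuffer caches have been marked dirty since the last flush. */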
static void r600_flush_framebuffer(struct r600_context *ctx)
{
	struct si_pm4_state *pm4;

	if (!(ctx->flags & R600_CONTEXT_DST_CACHES_DIRTY))
		return;

	pm4 = CALLOC_STRUCT(si_pm4_state);
	si_cmd_surface_sync(pm4, S_0085F0_CB0_DEST_BASE_ENA(1) |
			    S_0085F0_CB1_DEST_BASE_ENA(1) |
			    S_0085F0_CB2_DEST_BASE_ENA(1) |
			    S_0085F0_CB3_DEST_BASE_ENA(1) |
			    S_0085F0_CB4_DEST_BASE_ENA(1) |
			    S_0085F0_CB5_DEST_BASE_ENA(1) |
			    S_0085F0_CB6_DEST_BASE_ENA(1) |
			    S_0085F0_CB7_DEST_BASE_ENA(1) |
			    S_0085F0_DB_ACTION_ENA(1) |
			    S_0085F0_DB_DEST_BASE_ENA(1));
	si_pm4_emit(ctx, pm4);
	si_pm4_free_state(ctx, pm4, ~0);

	ctx->flags &= ~R600_CONTEXT_DST_CACHES_DIRTY;
}

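/* Flush the current command stream: suspend active queries, flush the
 * framebuffer caches, emit a partial flush, submit the CS, then mark all
 * state dirty again and resume the queries for the next CS. */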
void si_context_flush(struct r600_context *ctx, unsigned flags)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	bool queries_suspended = false;

#if 0
	bool streamout_suspended = false;
#endif

	if (!cs->cdw)
		return;

	/* suspend queries */
	if (ctx->num_cs_dw_queries_suspend) {
		r600_context_queries_suspend(ctx);
		queries_suspended = true;
	}

#if 0
	if (ctx->num_cs_dw_streamout_end) {
		r600_context_streamout_end(ctx);
		streamout_suspended = true;
	}
#endif

	r600_flush_framebuffer(ctx);

	/* a partial flush is needed to avoid lockups on some chips with user fences */
	cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
	cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4);

	/* force keeping the tiling flags */
	flags |= RADEON_FLUSH_KEEP_TILING_FLAGS;

#if R600_TRACE_CS
	if (ctx->screen->trace_bo) {
		struct r600_screen *rscreen = ctx->screen;
		unsigned i;

		for (i = 0; i < cs->cdw; i++) {
			fprintf(stderr, "[%4d] [%5d] 0x%08x\n", rscreen->cs_count, i, cs->buf[i]);
		}
		rscreen->cs_count++;
	}
#endif

	/* Flush the CS. */
	ctx->ws->cs_flush(ctx->cs, flags, 0);

#if R600_TRACE_CS
	if (ctx->screen->trace_bo) {
		struct r600_screen *rscreen = ctx->screen;
		unsigned i;

		for (i = 0; i < 10; i++) {
			usleep(5);
			if (!ctx->ws->buffer_is_busy(rscreen->trace_bo->buf, RADEON_USAGE_READWRITE)) {
				break;
			}
		}
		if (i == 10) {
			fprintf(stderr, "timeout, cs lockup likely happened at cs %d dw %d\n",
				rscreen->trace_ptr[1], rscreen->trace_ptr[0]);
		} else {
			fprintf(stderr, "cs %d executed in %dms\n", rscreen->trace_ptr[1], i * 5);
		}
	}
#endif

	ctx->pm4_dirty_cdwords = 0;
	ctx->flags = 0;

#if 0
	if (streamout_suspended) {
		ctx->streamout_start = TRUE;
		ctx->streamout_append_bitmask = ~0;
	}
#endif

	/* resume queries */
	if (queries_suspended) {
		r600_context_queries_resume(ctx);
	}

	/* set all valid groups as dirty so they get re-emitted on
	 * the next draw command
	 */
	si_pm4_reset_emitted(ctx);
}

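/* Emit an EVENT_WRITE_EOP that writes 'value' to 'offset' (in dwords) inside
 * 'fence_bo' once all prior rendering in the CS has completed. */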
void si_context_emit_fence(struct r600_context *ctx, struct si_resource *fence_bo, unsigned offset, unsigned value)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	uint64_t va;

	si_need_cs_space(ctx, 10, FALSE);

	va = r600_resource_va(&ctx->screen->screen, (void*)fence_bo);
	va = va + (offset << 2);

	cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
	cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4);
	cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
	cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
	cs->buf[cs->cdw++] = va & 0xFFFFFFFFUL;       /* ADDRESS_LO */
	/* DATA_SEL | INT_EN | ADDRESS_HI */
	cs->buf[cs->cdw++] = (1 << 29) | (0 << 24) | ((va >> 32UL) & 0xFF);
	cs->buf[cs->cdw++] = value;                   /* DATA_LO */
	cs->buf[cs->cdw++] = 0;                       /* DATA_HI */
	cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
	cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, fence_bo, RADEON_USAGE_WRITE);
}

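/* Compute the difference between the 64-bit end and start counters of one
 * result pair; if test_status_bit is set, return 0 unless both counters have
 * their valid bit (bit 63) set. */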
static unsigned r600_query_read_result(char *map, unsigned start_index, unsigned end_index,
				       bool test_status_bit)
{
	uint32_t *current_result = (uint32_t*)map;
	uint64_t start, end;

	start = (uint64_t)current_result[start_index] |
		(uint64_t)current_result[start_index+1] << 32;
	end = (uint64_t)current_result[end_index] |
	      (uint64_t)current_result[end_index+1] << 32;

	if (!test_status_bit ||
	    ((start & 0x8000000000000000UL) && (end & 0x8000000000000000UL))) {
		return end - start;
	}
	return 0;
}

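/* Read back and accumulate all result blocks written since the last call.
 * Returns FALSE if the buffer could not be mapped without blocking and
 * 'wait' is not set. */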
static boolean r600_query_result(struct r600_context *ctx, struct r600_query *query, boolean wait)
{
	unsigned results_base = query->results_start;
	char *map;

	map = ctx->ws->buffer_map(query->buffer->cs_buf, ctx->cs,
				  PIPE_TRANSFER_READ |
				  (wait ? 0 : PIPE_TRANSFER_DONTBLOCK));
	if (!map)
		return FALSE;

	/* count all results across all data blocks */
	switch (query->type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
		while (results_base != query->results_end) {
			query->result.u64 +=
				r600_query_read_result(map + results_base, 0, 2, true);
			results_base = (results_base + 16) % query->buffer->b.b.width0;
		}
		break;
	case PIPE_QUERY_OCCLUSION_PREDICATE:
		while (results_base != query->results_end) {
			query->result.b = query->result.b ||
				r600_query_read_result(map + results_base, 0, 2, true) != 0;
			results_base = (results_base + 16) % query->buffer->b.b.width0;
		}
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		while (results_base != query->results_end) {
			query->result.u64 +=
				r600_query_read_result(map + results_base, 0, 2, false);
			results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
		}
		break;
	case PIPE_QUERY_PRIMITIVES_EMITTED:
		/* SAMPLE_STREAMOUTSTATS stores this structure:
		 * {
		 *    u64 NumPrimitivesWritten;
		 *    u64 PrimitiveStorageNeeded;
		 * }
		 * We only need NumPrimitivesWritten here. */
		while (results_base != query->results_end) {
			query->result.u64 +=
				r600_query_read_result(map + results_base, 2, 6, true);
			results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
		}
		break;
	case PIPE_QUERY_PRIMITIVES_GENERATED:
		/* Here we read PrimitiveStorageNeeded. */
		while (results_base != query->results_end) {
			query->result.u64 +=
				r600_query_read_result(map + results_base, 0, 4, true);
			results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
		}
		break;
	case PIPE_QUERY_SO_STATISTICS:
		while (results_base != query->results_end) {
			query->result.so.num_primitives_written +=
				r600_query_read_result(map + results_base, 2, 6, true);
			query->result.so.primitives_storage_needed +=
				r600_query_read_result(map + results_base, 0, 4, true);
			results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
		}
		break;
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		while (results_base != query->results_end) {
			query->result.b = query->result.b ||
				r600_query_read_result(map + results_base, 2, 6, true) !=
				r600_query_read_result(map + results_base, 0, 4, true);
			results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
		}
		break;
	default:
		assert(0);
	}

	query->results_start = query->results_end;
	ctx->ws->buffer_unmap(query->buffer->cs_buf);
	return TRUE;
}

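/* Prepare the next result slot in the query buffer and emit the "begin"
 * event for 'query'.  CS space is reserved so the query can be suspended
 * and resumed around a flush while it is active. */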
void r600_query_begin(struct r600_context *ctx, struct r600_query *query)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	unsigned new_results_end, i;
	uint32_t *results;
	uint64_t va;

	si_need_cs_space(ctx, query->num_cs_dw * 2, TRUE);

	new_results_end = (query->results_end + query->result_size) % query->buffer->b.b.width0;

	/* collect current results if query buffer is full */
	if (new_results_end == query->results_start) {
		r600_query_result(ctx, query, TRUE);
	}

	switch (query->type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
	case PIPE_QUERY_OCCLUSION_PREDICATE:
		results = ctx->ws->buffer_map(query->buffer->cs_buf, ctx->cs, PIPE_TRANSFER_WRITE);
		if (results) {
			results = (uint32_t*)((char*)results + query->results_end);
			memset(results, 0, query->result_size);

			/* Set top bits for unused backends */
			for (i = 0; i < ctx->max_db; i++) {
				if (!(ctx->backend_mask & (1<<i))) {
					results[(i * 4)+1] = 0x80000000;
					results[(i * 4)+3] = 0x80000000;
				}
			}
			ctx->ws->buffer_unmap(query->buffer->cs_buf);
		}
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		break;
	case PIPE_QUERY_PRIMITIVES_EMITTED:
	case PIPE_QUERY_PRIMITIVES_GENERATED:
	case PIPE_QUERY_SO_STATISTICS:
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		results = ctx->ws->buffer_map(query->buffer->cs_buf, ctx->cs, PIPE_TRANSFER_WRITE);
		results = (uint32_t*)((char*)results + query->results_end);
		memset(results, 0, query->result_size);
		ctx->ws->buffer_unmap(query->buffer->cs_buf);
		break;
	default:
		assert(0);
	}

	/* emit begin query */
	va = r600_resource_va(&ctx->screen->screen, (void*)query->buffer);
	va += query->results_end;

	switch (query->type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
	case PIPE_QUERY_OCCLUSION_PREDICATE:
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1);
		cs->buf[cs->cdw++] = va;
		cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
		break;
	case PIPE_QUERY_PRIMITIVES_EMITTED:
	case PIPE_QUERY_PRIMITIVES_GENERATED:
	case PIPE_QUERY_SO_STATISTICS:
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_SAMPLE_STREAMOUTSTATS) | EVENT_INDEX(3);
		cs->buf[cs->cdw++] = query->results_end;
		cs->buf[cs->cdw++] = 0;
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
		cs->buf[cs->cdw++] = va;
		cs->buf[cs->cdw++] = (3 << 29) | ((va >> 32UL) & 0xFF);
		cs->buf[cs->cdw++] = 0;
		cs->buf[cs->cdw++] = 0;
		break;
	default:
		assert(0);
	}
	cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
	cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, query->buffer, RADEON_USAGE_WRITE);

	ctx->num_cs_dw_queries_suspend += query->num_cs_dw;
}

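/* Emit the "end query" event for 'query' and advance results_end to the
 * next result slot in the query buffer. */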
void r600_query_end(struct r600_context *ctx, struct r600_query *query)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	uint64_t va;

	va = r600_resource_va(&ctx->screen->screen, (void*)query->buffer);
	/* emit end query */
	switch (query->type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
	case PIPE_QUERY_OCCLUSION_PREDICATE:
		va += query->results_end + 8;
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1);
		cs->buf[cs->cdw++] = va;
		cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
		break;
	case PIPE_QUERY_PRIMITIVES_EMITTED:
	case PIPE_QUERY_PRIMITIVES_GENERATED:
	case PIPE_QUERY_SO_STATISTICS:
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_SAMPLE_STREAMOUTSTATS) | EVENT_INDEX(3);
		cs->buf[cs->cdw++] = query->results_end + query->result_size/2;
		cs->buf[cs->cdw++] = 0;
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		va += query->results_end + query->result_size/2;
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
		cs->buf[cs->cdw++] = va;
		cs->buf[cs->cdw++] = (3 << 29) | ((va >> 32UL) & 0xFF);
		cs->buf[cs->cdw++] = 0;
		cs->buf[cs->cdw++] = 0;
		break;
	default:
		assert(0);
	}
	cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
	cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, query->buffer, RADEON_USAGE_WRITE);

	query->results_end = (query->results_end + query->result_size) % query->buffer->b.b.width0;
	ctx->num_cs_dw_queries_suspend -= query->num_cs_dw;
}

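/* Program the SET_PREDICATION state used for conditional rendering, either
 * clearing it or emitting one predication packet per result block of
 * 'query'. */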
void r600_query_predication(struct r600_context *ctx, struct r600_query *query, int operation,
			    int flag_wait)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	uint64_t va;

	if (operation == PREDICATION_OP_CLEAR) {
		si_need_cs_space(ctx, 3, FALSE);

		cs->buf[cs->cdw++] = PKT3(PKT3_SET_PREDICATION, 1, 0);
		cs->buf[cs->cdw++] = 0;
		cs->buf[cs->cdw++] = PRED_OP(PREDICATION_OP_CLEAR);
	} else {
		unsigned results_base = query->results_start;
		unsigned count;
		uint32_t op;

		/* find count of the query data blocks */
		count = (query->buffer->b.b.width0 + query->results_end - query->results_start) % query->buffer->b.b.width0;
		count /= query->result_size;

		si_need_cs_space(ctx, 5 * count, TRUE);

		op = PRED_OP(operation) | PREDICATION_DRAW_VISIBLE |
			(flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW);
		va = r600_resource_va(&ctx->screen->screen, (void*)query->buffer);

		/* emit predicate packets for all data blocks */
		while (results_base != query->results_end) {
			cs->buf[cs->cdw++] = PKT3(PKT3_SET_PREDICATION, 1, 0);
			cs->buf[cs->cdw++] = (va + results_base) & 0xFFFFFFFFUL;
			cs->buf[cs->cdw++] = op | (((va + results_base) >> 32UL) & 0xFF);
			cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
			cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, query->buffer,
								   RADEON_USAGE_READ);
			results_base = (results_base + query->result_size) % query->buffer->b.b.width0;

			/* set CONTINUE bit for all packets except the first */
			op |= PREDICATION_CONTINUE;
		}
	}
}

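/* Allocate a query object of the given type together with a staging buffer
 * large enough to hold several result blocks. */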
struct r600_query *r600_context_query_create(struct r600_context *ctx, unsigned query_type)
{
	struct r600_query *query;
	unsigned buffer_size = 4096;

	query = CALLOC_STRUCT(r600_query);
	if (query == NULL)
		return NULL;

	query->type = query_type;

	switch (query_type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
	case PIPE_QUERY_OCCLUSION_PREDICATE:
		query->result_size = 16 * ctx->max_db;
		query->num_cs_dw = 6;
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		query->result_size = 16;
		query->num_cs_dw = 8;
		break;
	case PIPE_QUERY_PRIMITIVES_EMITTED:
	case PIPE_QUERY_PRIMITIVES_GENERATED:
	case PIPE_QUERY_SO_STATISTICS:
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		/* NumPrimitivesWritten, PrimitiveStorageNeeded. */
		query->result_size = 32;
		query->num_cs_dw = 6;
		break;
	default:
		assert(0);
		FREE(query);
		return NULL;
	}

	/* adjust buffer size to simplify offsets wrapping math */
	buffer_size -= buffer_size % query->result_size;

	/* Queries are normally read by the CPU after
	 * being written by the GPU, hence staging is probably a good
	 * usage pattern.
	 */
	query->buffer = si_resource_create_custom(&ctx->screen->screen,
						  PIPE_USAGE_STAGING,
						  buffer_size);
	if (!query->buffer) {
		FREE(query);
		return NULL;
	}
	return query;
}

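/* Release the result buffer and free the query object. */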
void r600_context_query_destroy(struct r600_context *ctx, struct r600_query *query)
{
	si_resource_reference(&query->buffer, NULL);
	free(query);
}

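/* Return the accumulated result of 'query' in the format expected for its
 * type (64-bit counter, boolean, or streamout statistics). */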
boolean r600_context_query_result(struct r600_context *ctx,
				  struct r600_query *query,
				  boolean wait, void *vresult)
{
	boolean *result_b = (boolean*)vresult;
	uint64_t *result_u64 = (uint64_t*)vresult;
	struct pipe_query_data_so_statistics *result_so =
		(struct pipe_query_data_so_statistics*)vresult;

	if (!r600_query_result(ctx, query, wait))
		return FALSE;

	switch (query->type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
	case PIPE_QUERY_PRIMITIVES_EMITTED:
	case PIPE_QUERY_PRIMITIVES_GENERATED:
		*result_u64 = query->result.u64;
		break;
	case PIPE_QUERY_OCCLUSION_PREDICATE:
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		*result_b = query->result.b;
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		*result_u64 = (1000000 * query->result.u64) / ctx->screen->info.r600_clock_crystal_freq;
		break;
	case PIPE_QUERY_SO_STATISTICS:
		*result_so = query->result.so;
		break;
	default:
		assert(0);
	}
	return TRUE;
}

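/* End all active queries so the CS can be flushed; they are restarted by
 * r600_context_queries_resume(). */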
void r600_context_queries_suspend(struct r600_context *ctx)
{
	struct r600_query *query;

	LIST_FOR_EACH_ENTRY(query, &ctx->active_query_list, list) {
		r600_query_end(ctx, query);
	}
	assert(ctx->num_cs_dw_queries_suspend == 0);
}

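/* Restart all queries that were suspended before a CS flush. */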
void r600_context_queries_resume(struct r600_context *ctx)
{
	struct r600_query *query;

	assert(ctx->num_cs_dw_queries_suspend == 0);

	LIST_FOR_EACH_ENTRY(query, &ctx->active_query_list, list) {
		r600_query_begin(ctx, query);
	}
}

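/* Set up the VGT_STRMOUT_DRAW_OPAQUE registers used for drawing from a
 * streamout target, taking the vertex stride from 't'. */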
void r600_context_draw_opaque_count(struct r600_context *ctx, struct r600_so_target *t)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	si_need_cs_space(ctx, 14 + 21, TRUE);

	cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
	cs->buf[cs->cdw++] = (R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET - SI_CONTEXT_REG_OFFSET) >> 2;
	cs->buf[cs->cdw++] = 0;

	cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
	cs->buf[cs->cdw++] = (R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE - SI_CONTEXT_REG_OFFSET) >> 2;
	cs->buf[cs->cdw++] = t->stride >> 2;

#if 0
	cs->buf[cs->cdw++] = PKT3(PKT3_COPY_DW, 4, 0);
	cs->buf[cs->cdw++] = COPY_DW_SRC_IS_MEM | COPY_DW_DST_IS_REG;
	cs->buf[cs->cdw++] = 0; /* src address lo */
	cs->buf[cs->cdw++] = 0; /* src address hi */
	cs->buf[cs->cdw++] = R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2; /* dst register */
	cs->buf[cs->cdw++] = 0; /* unused */
#endif

	cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
	cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, t->filled_size, RADEON_USAGE_READ);

}

#if R600_TRACE_CS
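/* Write the current CS offset and CS counter into the trace buffer so a GPU
 * lockup can be narrowed down to the last packet that executed. */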
void r600_trace_emit(struct r600_context *rctx)
{
	struct r600_screen *rscreen = rctx->screen;
	struct radeon_winsys_cs *cs = rctx->cs;
	uint64_t va;

	va = r600_resource_va(&rscreen->screen, (void*)rscreen->trace_bo);
	r600_context_bo_reloc(rctx, rscreen->trace_bo, RADEON_USAGE_READWRITE);
	cs->buf[cs->cdw++] = PKT3(PKT3_WRITE_DATA, 4, 0);
	cs->buf[cs->cdw++] = PKT3_WRITE_DATA_DST_SEL(PKT3_WRITE_DATA_DST_SEL_MEM_SYNC) |
			     PKT3_WRITE_DATA_WR_CONFIRM |
			     PKT3_WRITE_DATA_ENGINE_SEL(PKT3_WRITE_DATA_ENGINE_SEL_ME);
	cs->buf[cs->cdw++] = va & 0xFFFFFFFFUL;
	cs->buf[cs->cdw++] = (va >> 32UL) & 0xFFFFFFFFUL;
	cs->buf[cs->cdw++] = cs->cdw;
	cs->buf[cs->cdw++] = rscreen->cs_count;
}
#endif