radeon/llvm: Add live-in registers during DAG lowering
[mesa.git] / src / gallium / drivers / radeonsi / r600_hw_context.c
/*
 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *      Jerome Glisse
 */
#include "r600_hw_context_priv.h"
#include "radeonsi_pm4.h"
#include "radeonsi_pipe.h"
#include "sid.h"
#include "util/u_memory.h"
#include <errno.h>

#define GROUP_FORCE_NEW_BLOCK	0

/* Get backends mask */
void r600_get_backend_mask(struct r600_context *ctx)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	struct si_resource *buffer;
	uint32_t *results;
	unsigned num_backends = ctx->screen->info.r600_num_backends;
	unsigned i, mask = 0;

	/* if backend_map query is supported by the kernel */
	if (ctx->screen->info.r600_backend_map_valid) {
		unsigned num_tile_pipes = ctx->screen->info.r600_num_tile_pipes;
		unsigned backend_map = ctx->screen->info.r600_backend_map;
		unsigned item_width, item_mask;

		/* All chips supported by radeonsi are >= CAYMAN, so item_width
		 * and item_mask are always initialized here. */
		if (ctx->chip_class >= CAYMAN) {
			item_width = 4;
			item_mask = 0x7;
		}

		while (num_tile_pipes--) {
			i = backend_map & item_mask;
			mask |= (1 << i);
			backend_map >>= item_width;
		}
		if (mask != 0) {
			ctx->backend_mask = mask;
			return;
		}
	}

	/* Otherwise use the backup path for older kernels. */

	/* create buffer for event data */
	buffer = si_resource_create_custom(&ctx->screen->screen,
					   PIPE_USAGE_STAGING,
					   ctx->max_db*16);
	if (!buffer)
		goto err;

	/* initialize buffer with zeroes */
	results = ctx->ws->buffer_map(buffer->cs_buf, ctx->cs, PIPE_TRANSFER_WRITE);
	if (results) {
		uint64_t va = 0;

		memset(results, 0, ctx->max_db * 4 * 4);
		ctx->ws->buffer_unmap(buffer->cs_buf);

		/* emit EVENT_WRITE for ZPASS_DONE */
		va = r600_resource_va(&ctx->screen->screen, (void *)buffer);
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1);
		cs->buf[cs->cdw++] = va;
		cs->buf[cs->cdw++] = va >> 32;

		cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
		cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, buffer, RADEON_USAGE_WRITE);

		/* analyze results */
		results = ctx->ws->buffer_map(buffer->cs_buf, ctx->cs, PIPE_TRANSFER_READ);
		if (results) {
			for (i = 0; i < ctx->max_db; i++) {
				/* at least the highest bit will be set if the backend is used */
				if (results[i*4 + 1])
					mask |= (1 << i);
			}
			ctx->ws->buffer_unmap(buffer->cs_buf);
		}
	}

	si_resource_reference(&buffer, NULL);

	if (mask != 0) {
		ctx->backend_mask = mask;
		return;
	}

err:
	/* Fall back to the old method: set the num_backends lowest bits to 1. */
	ctx->backend_mask = (~((uint32_t)0)) >> (32 - num_backends);
	return;
}

/* Make sure the CS has room for num_dw additional dwords; flush if it does not. */
void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw,
			boolean count_draw_in)
{
	/* The number of dwords we already used in the CS so far. */
	num_dw += ctx->cs->cdw;

	if (count_draw_in) {
		/* The number of dwords all the dirty states would take. */
		num_dw += ctx->pm4_dirty_cdwords;

		/* The upper-bound of how much a draw command would take. */
		num_dw += SI_MAX_DRAW_CS_DWORDS;
	}

	/* Count in queries_suspend. */
	num_dw += ctx->num_cs_dw_queries_suspend;

	/* Count in streamout_end at the end of CS. */
	num_dw += ctx->num_cs_dw_streamout_end;

	/* Count in render_condition(NULL) at the end of CS. */
	if (ctx->predicate_drawing) {
		num_dw += 3;
	}

	/* Count in framebuffer cache flushes at the end of CS. */
	num_dw += 7; /* one SURFACE_SYNC and CACHE_FLUSH_AND_INV (r6xx-only) */

	/* Save 16 dwords for the fence mechanism. */
	num_dw += 16;

	/* Flush if there's not enough space. */
	if (num_dw > RADEON_MAX_CMDBUF_DWORDS) {
		radeonsi_flush(&ctx->context, NULL, RADEON_FLUSH_ASYNC);
	}
}

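/* Emit a SURFACE_SYNC that flushes the CB and DB destination caches,
 * but only if R600_CONTEXT_DST_CACHES_DIRTY is set. */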
static void r600_flush_framebuffer(struct r600_context *ctx)
{
	struct si_pm4_state *pm4;

	if (!(ctx->flags & R600_CONTEXT_DST_CACHES_DIRTY))
		return;

	/* Allocate after the early return so the state is not leaked. */
	pm4 = CALLOC_STRUCT(si_pm4_state);
	si_cmd_surface_sync(pm4, S_0085F0_CB0_DEST_BASE_ENA(1) |
			    S_0085F0_CB1_DEST_BASE_ENA(1) |
			    S_0085F0_CB2_DEST_BASE_ENA(1) |
			    S_0085F0_CB3_DEST_BASE_ENA(1) |
			    S_0085F0_CB4_DEST_BASE_ENA(1) |
			    S_0085F0_CB5_DEST_BASE_ENA(1) |
			    S_0085F0_CB6_DEST_BASE_ENA(1) |
			    S_0085F0_CB7_DEST_BASE_ENA(1) |
			    S_0085F0_DB_ACTION_ENA(1) |
			    S_0085F0_DB_DEST_BASE_ENA(1));
	si_pm4_emit(ctx, pm4);
	si_pm4_free_state(ctx, pm4, ~0);

	ctx->flags &= ~R600_CONTEXT_DST_CACHES_DIRTY;
}

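/* Flush all queued work: suspend queries, flush the framebuffer caches,
 * emit a partial flush, submit the CS, then resume queries and mark all
 * emitted state as dirty again. */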
void r600_context_flush(struct r600_context *ctx, unsigned flags)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	bool queries_suspended = false;

#if 0
	bool streamout_suspended = false;
#endif

	if (!cs->cdw)
		return;

	/* suspend queries */
	if (ctx->num_cs_dw_queries_suspend) {
		r600_context_queries_suspend(ctx);
		queries_suspended = true;
	}

#if 0
	if (ctx->num_cs_dw_streamout_end) {
		r600_context_streamout_end(ctx);
		streamout_suspended = true;
	}
#endif

	r600_flush_framebuffer(ctx);

	/* partial flush is needed to avoid lockups on some chips with user fences */
	cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
	cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4);

	/* force to keep tiling flags */
	flags |= RADEON_FLUSH_KEEP_TILING_FLAGS;

	/* Flush the CS. */
	ctx->ws->cs_flush(ctx->cs, flags);

	ctx->pm4_dirty_cdwords = 0;
	ctx->flags = 0;

#if 0
	if (streamout_suspended) {
		ctx->streamout_start = TRUE;
		ctx->streamout_append_bitmask = ~0;
	}
#endif

	/* resume queries */
	if (queries_suspended) {
		r600_context_queries_resume(ctx);
	}

	/* Mark all valid state groups as dirty so they get re-emitted on
	 * the next draw command.
	 */
	si_pm4_reset_emitted(ctx);
}

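/* Emit an end-of-pipe event that writes "value" into fence_bo at the given
 * dword offset once all previous work has completed. */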
void r600_context_emit_fence(struct r600_context *ctx, struct si_resource *fence_bo, unsigned offset, unsigned value)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	uint64_t va;

	r600_need_cs_space(ctx, 10, FALSE);

	va = r600_resource_va(&ctx->screen->screen, (void*)fence_bo);
	va = va + (offset << 2);

	cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
	cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4);
	cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
	cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
	cs->buf[cs->cdw++] = va & 0xFFFFFFFFUL; /* ADDRESS_LO */
	/* DATA_SEL | INT_EN | ADDRESS_HI */
	cs->buf[cs->cdw++] = (1 << 29) | (0 << 24) | ((va >> 32UL) & 0xFF);
	cs->buf[cs->cdw++] = value; /* DATA_LO */
	cs->buf[cs->cdw++] = 0; /* DATA_HI */
	cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
	cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, fence_bo, RADEON_USAGE_WRITE);
}

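/* Read one 64-bit (start, end) counter pair from a mapped result block and
 * return end - start. If test_status_bit is set, return 0 unless both values
 * have their "result valid" bit (bit 63) set. */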
static unsigned r600_query_read_result(char *map, unsigned start_index, unsigned end_index,
				       bool test_status_bit)
{
	uint32_t *current_result = (uint32_t*)map;
	uint64_t start, end;

	start = (uint64_t)current_result[start_index] |
		(uint64_t)current_result[start_index+1] << 32;
	end = (uint64_t)current_result[end_index] |
	      (uint64_t)current_result[end_index+1] << 32;

	if (!test_status_bit ||
	    ((start & 0x8000000000000000UL) && (end & 0x8000000000000000UL))) {
		return end - start;
	}
	return 0;
}

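/* Accumulate all result blocks written so far into query->result.
 * Returns FALSE if the buffer could not be mapped, which can happen
 * when wait is FALSE and mapping would have to block. */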
static boolean r600_query_result(struct r600_context *ctx, struct r600_query *query, boolean wait)
{
	unsigned results_base = query->results_start;
	char *map;

	map = ctx->ws->buffer_map(query->buffer->cs_buf, ctx->cs,
				  PIPE_TRANSFER_READ |
				  (wait ? 0 : PIPE_TRANSFER_DONTBLOCK));
	if (!map)
		return FALSE;

	/* count all results across all data blocks */
	switch (query->type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
		while (results_base != query->results_end) {
			query->result.u64 +=
				r600_query_read_result(map + results_base, 0, 2, true);
			results_base = (results_base + 16) % query->buffer->b.b.width0;
		}
		break;
	case PIPE_QUERY_OCCLUSION_PREDICATE:
		while (results_base != query->results_end) {
			query->result.b = query->result.b ||
				r600_query_read_result(map + results_base, 0, 2, true) != 0;
			results_base = (results_base + 16) % query->buffer->b.b.width0;
		}
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		while (results_base != query->results_end) {
			query->result.u64 +=
				r600_query_read_result(map + results_base, 0, 2, false);
			results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
		}
		break;
	case PIPE_QUERY_PRIMITIVES_EMITTED:
		/* SAMPLE_STREAMOUTSTATS stores this structure:
		 * {
		 *    u64 NumPrimitivesWritten;
		 *    u64 PrimitiveStorageNeeded;
		 * }
		 * We only need NumPrimitivesWritten here. */
		while (results_base != query->results_end) {
			query->result.u64 +=
				r600_query_read_result(map + results_base, 2, 6, true);
			results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
		}
		break;
	case PIPE_QUERY_PRIMITIVES_GENERATED:
		/* Here we read PrimitiveStorageNeeded. */
		while (results_base != query->results_end) {
			query->result.u64 +=
				r600_query_read_result(map + results_base, 0, 4, true);
			results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
		}
		break;
	case PIPE_QUERY_SO_STATISTICS:
		while (results_base != query->results_end) {
			query->result.so.num_primitives_written +=
				r600_query_read_result(map + results_base, 2, 6, true);
			query->result.so.primitives_storage_needed +=
				r600_query_read_result(map + results_base, 0, 4, true);
			results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
		}
		break;
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		while (results_base != query->results_end) {
			query->result.b = query->result.b ||
				r600_query_read_result(map + results_base, 2, 6, true) !=
				r600_query_read_result(map + results_base, 0, 4, true);
			results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
		}
		break;
	default:
		assert(0);
	}

	query->results_start = query->results_end;
	ctx->ws->buffer_unmap(query->buffer->cs_buf);
	return TRUE;
}

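/* Begin a query: prepare the result block at results_end and emit the event
 * that makes the GPU write the start values into it. */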
void r600_query_begin(struct r600_context *ctx, struct r600_query *query)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	unsigned new_results_end, i;
	uint32_t *results;
	uint64_t va;

	r600_need_cs_space(ctx, query->num_cs_dw * 2, TRUE);

	new_results_end = (query->results_end + query->result_size) % query->buffer->b.b.width0;

	/* collect current results if query buffer is full */
	if (new_results_end == query->results_start) {
		r600_query_result(ctx, query, TRUE);
	}

	switch (query->type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
	case PIPE_QUERY_OCCLUSION_PREDICATE:
		results = ctx->ws->buffer_map(query->buffer->cs_buf, ctx->cs, PIPE_TRANSFER_WRITE);
		if (results) {
			results = (uint32_t*)((char*)results + query->results_end);
			memset(results, 0, query->result_size);

			/* Set top bits for unused backends */
			for (i = 0; i < ctx->max_db; i++) {
				if (!(ctx->backend_mask & (1<<i))) {
					results[(i * 4)+1] = 0x80000000;
					results[(i * 4)+3] = 0x80000000;
				}
			}
			ctx->ws->buffer_unmap(query->buffer->cs_buf);
		}
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		break;
	case PIPE_QUERY_PRIMITIVES_EMITTED:
	case PIPE_QUERY_PRIMITIVES_GENERATED:
	case PIPE_QUERY_SO_STATISTICS:
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		results = ctx->ws->buffer_map(query->buffer->cs_buf, ctx->cs, PIPE_TRANSFER_WRITE);
		if (results) {
			results = (uint32_t*)((char*)results + query->results_end);
			memset(results, 0, query->result_size);
			ctx->ws->buffer_unmap(query->buffer->cs_buf);
		}
		break;
	default:
		assert(0);
	}

	/* emit begin query */
	va = r600_resource_va(&ctx->screen->screen, (void*)query->buffer);
	va += query->results_end;

	switch (query->type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
	case PIPE_QUERY_OCCLUSION_PREDICATE:
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1);
		cs->buf[cs->cdw++] = va;
		cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
		break;
	case PIPE_QUERY_PRIMITIVES_EMITTED:
	case PIPE_QUERY_PRIMITIVES_GENERATED:
	case PIPE_QUERY_SO_STATISTICS:
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_SAMPLE_STREAMOUTSTATS) | EVENT_INDEX(3);
		cs->buf[cs->cdw++] = query->results_end;
		cs->buf[cs->cdw++] = 0;
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
		cs->buf[cs->cdw++] = va;
		cs->buf[cs->cdw++] = (3 << 29) | ((va >> 32UL) & 0xFF);
		cs->buf[cs->cdw++] = 0;
		cs->buf[cs->cdw++] = 0;
		break;
	default:
		assert(0);
	}
	cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
	cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, query->buffer, RADEON_USAGE_WRITE);

	ctx->num_cs_dw_queries_suspend += query->num_cs_dw;
}

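/* End a query: emit the event that writes the end values and advance
 * results_end to the next result block. */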
void r600_query_end(struct r600_context *ctx, struct r600_query *query)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	uint64_t va;

	va = r600_resource_va(&ctx->screen->screen, (void*)query->buffer);
	/* emit end query */
	switch (query->type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
	case PIPE_QUERY_OCCLUSION_PREDICATE:
		va += query->results_end + 8;
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1);
		cs->buf[cs->cdw++] = va;
		cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
		break;
	case PIPE_QUERY_PRIMITIVES_EMITTED:
	case PIPE_QUERY_PRIMITIVES_GENERATED:
	case PIPE_QUERY_SO_STATISTICS:
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_SAMPLE_STREAMOUTSTATS) | EVENT_INDEX(3);
		cs->buf[cs->cdw++] = query->results_end + query->result_size/2;
		cs->buf[cs->cdw++] = 0;
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		va += query->results_end + query->result_size/2;
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
		cs->buf[cs->cdw++] = va;
		cs->buf[cs->cdw++] = (3 << 29) | ((va >> 32UL) & 0xFF);
		cs->buf[cs->cdw++] = 0;
		cs->buf[cs->cdw++] = 0;
		break;
	default:
		assert(0);
	}
	cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
	cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, query->buffer, RADEON_USAGE_WRITE);

	query->results_end = (query->results_end + query->result_size) % query->buffer->b.b.width0;
	ctx->num_cs_dw_queries_suspend -= query->num_cs_dw;
}

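/* Set up render-condition predication: either clear it, or emit one
 * SET_PREDICATION packet per pending result block of the query. */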
void r600_query_predication(struct r600_context *ctx, struct r600_query *query, int operation,
			    int flag_wait)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	uint64_t va;

	if (operation == PREDICATION_OP_CLEAR) {
		r600_need_cs_space(ctx, 3, FALSE);

		cs->buf[cs->cdw++] = PKT3(PKT3_SET_PREDICATION, 1, 0);
		cs->buf[cs->cdw++] = 0;
		cs->buf[cs->cdw++] = PRED_OP(PREDICATION_OP_CLEAR);
	} else {
		unsigned results_base = query->results_start;
		unsigned count;
		uint32_t op;

		/* find count of the query data blocks */
		count = (query->buffer->b.b.width0 + query->results_end - query->results_start) % query->buffer->b.b.width0;
		count /= query->result_size;

		r600_need_cs_space(ctx, 5 * count, TRUE);

		op = PRED_OP(operation) | PREDICATION_DRAW_VISIBLE |
		     (flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW);
		va = r600_resource_va(&ctx->screen->screen, (void*)query->buffer);

		/* emit predicate packets for all data blocks */
		while (results_base != query->results_end) {
			cs->buf[cs->cdw++] = PKT3(PKT3_SET_PREDICATION, 1, 0);
			cs->buf[cs->cdw++] = (va + results_base) & 0xFFFFFFFFUL;
			cs->buf[cs->cdw++] = op | (((va + results_base) >> 32UL) & 0xFF);
			cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
			cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, query->buffer,
								   RADEON_USAGE_READ);
			results_base = (results_base + query->result_size) % query->buffer->b.b.width0;

			/* set CONTINUE bit for all packets except the first */
			op |= PREDICATION_CONTINUE;
		}
	}
}

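/* Create a query object and its staging result buffer. result_size is the
 * size of one result block; num_cs_dw is used to reserve command-stream
 * space for the begin/end packets. */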
struct r600_query *r600_context_query_create(struct r600_context *ctx, unsigned query_type)
{
	struct r600_query *query;
	unsigned buffer_size = 4096;

	query = CALLOC_STRUCT(r600_query);
	if (query == NULL)
		return NULL;

	query->type = query_type;

	switch (query_type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
	case PIPE_QUERY_OCCLUSION_PREDICATE:
		query->result_size = 16 * ctx->max_db;
		query->num_cs_dw = 6;
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		query->result_size = 16;
		query->num_cs_dw = 8;
		break;
	case PIPE_QUERY_PRIMITIVES_EMITTED:
	case PIPE_QUERY_PRIMITIVES_GENERATED:
	case PIPE_QUERY_SO_STATISTICS:
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		/* NumPrimitivesWritten, PrimitiveStorageNeeded. */
		query->result_size = 32;
		query->num_cs_dw = 6;
		break;
	default:
		assert(0);
		FREE(query);
		return NULL;
	}

	/* adjust buffer size to simplify offsets wrapping math */
	buffer_size -= buffer_size % query->result_size;

	/* Queries are normally read by the CPU after
	 * being written by the GPU, hence staging is probably a good
	 * usage pattern.
	 */
	query->buffer = si_resource_create_custom(&ctx->screen->screen,
						  PIPE_USAGE_STAGING,
						  buffer_size);
	if (!query->buffer) {
		FREE(query);
		return NULL;
	}
	return query;
}

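/* Release the query's result buffer and free the query object. */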
void r600_context_query_destroy(struct r600_context *ctx, struct r600_query *query)
{
	si_resource_reference(&query->buffer, NULL);
	FREE(query);
}

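/* Copy the accumulated result of a query into *vresult in the form expected
 * for its query type. Returns FALSE if wait is FALSE and the result is not
 * ready yet. */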
boolean r600_context_query_result(struct r600_context *ctx,
				  struct r600_query *query,
				  boolean wait, void *vresult)
{
	boolean *result_b = (boolean*)vresult;
	uint64_t *result_u64 = (uint64_t*)vresult;
	struct pipe_query_data_so_statistics *result_so =
		(struct pipe_query_data_so_statistics*)vresult;

	if (!r600_query_result(ctx, query, wait))
		return FALSE;

	switch (query->type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
	case PIPE_QUERY_PRIMITIVES_EMITTED:
	case PIPE_QUERY_PRIMITIVES_GENERATED:
		*result_u64 = query->result.u64;
		break;
	case PIPE_QUERY_OCCLUSION_PREDICATE:
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		*result_b = query->result.b;
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		*result_u64 = (1000000 * query->result.u64) / ctx->screen->info.r600_clock_crystal_freq;
		break;
	case PIPE_QUERY_SO_STATISTICS:
		*result_so = query->result.so;
		break;
	default:
		assert(0);
	}
	return TRUE;
}

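/* End all active queries so their partial results land in the buffer;
 * called before flushing the CS. */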
void r600_context_queries_suspend(struct r600_context *ctx)
{
	struct r600_query *query;

	LIST_FOR_EACH_ENTRY(query, &ctx->active_query_list, list) {
		r600_query_end(ctx, query);
	}
	assert(ctx->num_cs_dw_queries_suspend == 0);
}

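/* Restart all active queries after a CS flush. */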
void r600_context_queries_resume(struct r600_context *ctx)
{
	struct r600_query *query;

	assert(ctx->num_cs_dw_queries_suspend == 0);

	LIST_FOR_EACH_ENTRY(query, &ctx->active_query_list, list) {
		r600_query_begin(ctx, query);
	}
}

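/* Program the VGT_STRMOUT_DRAW_OPAQUE offset and vertex stride registers
 * used for drawing a vertex count derived from the amount of data written
 * by streamout (t->filled_size). */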
void r600_context_draw_opaque_count(struct r600_context *ctx, struct r600_so_target *t)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	r600_need_cs_space(ctx, 14 + 21, TRUE);

	cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
	cs->buf[cs->cdw++] = (R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET - SI_CONTEXT_REG_OFFSET) >> 2;
	cs->buf[cs->cdw++] = 0;

	cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
	cs->buf[cs->cdw++] = (R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE - SI_CONTEXT_REG_OFFSET) >> 2;
	cs->buf[cs->cdw++] = t->stride >> 2;

#if 0
	cs->buf[cs->cdw++] = PKT3(PKT3_COPY_DW, 4, 0);
	cs->buf[cs->cdw++] = COPY_DW_SRC_IS_MEM | COPY_DW_DST_IS_REG;
	cs->buf[cs->cdw++] = 0; /* src address lo */
	cs->buf[cs->cdw++] = 0; /* src address hi */
	cs->buf[cs->cdw++] = R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2; /* dst register */
	cs->buf[cs->cdw++] = 0; /* unused */
#endif

	cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
	cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, t->filled_size, RADEON_USAGE_READ);

#if 0 /* I have not found this useful yet. */
	cs->buf[cs->cdw++] = PKT3(PKT3_COPY_DW, 4, 0);
	cs->buf[cs->cdw++] = COPY_DW_SRC_IS_REG | COPY_DW_DST_IS_REG;
	cs->buf[cs->cdw++] = R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2; /* src register */
	cs->buf[cs->cdw++] = 0; /* unused */
	cs->buf[cs->cdw++] = R_0085F4_CP_COHER_SIZE >> 2; /* dst register */
	cs->buf[cs->cdw++] = 0; /* unused */

	cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONFIG_REG, 1, 0);
	cs->buf[cs->cdw++] = (R_0085F0_CP_COHER_CNTL - SI_CONFIG_REG_OFFSET) >> 2;
	cs->buf[cs->cdw++] = S_0085F0_SO0_DEST_BASE_ENA(1) << t->so_index;

	cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONFIG_REG, 1, 0);
	cs->buf[cs->cdw++] = (R_0085F8_CP_COHER_BASE - SI_CONFIG_REG_OFFSET) >> 2;
	cs->buf[cs->cdw++] = t->b.buffer_offset >> 2;

	cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
	cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, (struct si_resource*)t->b.buffer,
						   RADEON_USAGE_WRITE);

	cs->buf[cs->cdw++] = PKT3(PKT3_WAIT_REG_MEM, 5, 0);
	cs->buf[cs->cdw++] = WAIT_REG_MEM_EQUAL; /* wait until the register is equal to the reference value */
	cs->buf[cs->cdw++] = R_0085FC_CP_COHER_STATUS >> 2; /* register */
	cs->buf[cs->cdw++] = 0;
	cs->buf[cs->cdw++] = 0; /* reference value */
	cs->buf[cs->cdw++] = 0xffffffff; /* mask */
	cs->buf[cs->cdw++] = 4; /* poll interval */
#endif
}