/*
 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
#include "r600_hw_context_priv.h"
#include "radeonsi_pm4.h"
#include "radeonsi_pipe.h"

#include "util/u_memory.h"

#define GROUP_FORCE_NEW_BLOCK 0
/* Get backends mask */
void si_get_backend_mask(struct r600_context *ctx)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	struct si_resource *buffer;
	uint32_t *results;
	unsigned num_backends = ctx->screen->info.r600_num_backends;
	unsigned i, mask = 0;
	uint64_t va;

	/* if backend_map query is supported by the kernel */
	if (ctx->screen->info.r600_backend_map_valid) {
		unsigned num_tile_pipes = ctx->screen->info.r600_num_tile_pipes;
		unsigned backend_map = ctx->screen->info.r600_backend_map;
		unsigned item_width = 4, item_mask = 0x7;

		while(num_tile_pipes--) {
			i = backend_map & item_mask;
			mask |= (1<<i);
			backend_map >>= item_width;
		}
		if (mask != 0) {
			ctx->backend_mask = mask;
			return;
		}
	}

	/* otherwise backup path for older kernels */

	/* create buffer for event data */
	buffer = si_resource_create_custom(&ctx->screen->screen,
					   PIPE_USAGE_STAGING,
					   ctx->max_db * 16);
	if (!buffer)
		goto err;

	/* initialize buffer with zeroes */
	results = ctx->ws->buffer_map(buffer->cs_buf, ctx->cs, PIPE_TRANSFER_WRITE);
	if (results) {
		memset(results, 0, ctx->max_db * 4 * 4);
		ctx->ws->buffer_unmap(buffer->cs_buf);

		/* emit EVENT_WRITE for ZPASS_DONE */
		va = r600_resource_va(&ctx->screen->screen, (void *)buffer);
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1);
		cs->buf[cs->cdw++] = va;
		cs->buf[cs->cdw++] = va >> 32;

		cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
		cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, buffer, RADEON_USAGE_WRITE);

		/* analyze results */
		results = ctx->ws->buffer_map(buffer->cs_buf, ctx->cs, PIPE_TRANSFER_READ);
		if (results) {
			for(i = 0; i < ctx->max_db; i++) {
				/* at least highest bit will be set if backend is used */
				if (results[i*4 + 1])
					mask |= (1<<i);
			}
			ctx->ws->buffer_unmap(buffer->cs_buf);
		}
	}

	si_resource_reference(&buffer, NULL);

	if (mask != 0) {
		ctx->backend_mask = mask;
		return;
	}

err:
	/* fallback to old method - set num_backends lower bits to 1 */
	ctx->backend_mask = (~((uint32_t)0))>>(32-num_backends);
	return;
}
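/*
 * Note (added commentary, not in the original source): worked example of the
 * fallback above. With num_backends = 4,
 * (~(uint32_t)0) >> (32 - 4) = 0xFFFFFFFF >> 28 = 0x0000000F,
 * i.e. the four lowest bits set, one per reported backend.
 */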
void si_need_cs_space(struct r600_context *ctx, unsigned num_dw,
		      boolean count_draw_in)
{
	int i;

	/* The number of dwords we already used in the CS so far. */
	num_dw += ctx->cs->cdw;

	if (count_draw_in) {
		for (i = 0; i < SI_NUM_ATOMS(ctx); i++) {
			if (ctx->atoms.array[i]->dirty) {
				num_dw += ctx->atoms.array[i]->num_dw;
			}
		}

		/* The number of dwords all the dirty states would take. */
		num_dw += ctx->pm4_dirty_cdwords;

		/* The upper-bound of how much a draw command would take. */
		num_dw += SI_MAX_DRAW_CS_DWORDS;
	}

	/* Count in queries_suspend. */
	num_dw += ctx->num_cs_dw_queries_suspend;

	/* Count in streamout_end at the end of CS. */
	num_dw += ctx->num_cs_dw_streamout_end;

	/* Count in render_condition(NULL) at the end of CS. */
	if (ctx->predicate_drawing) {
		num_dw += 3;
	}

	/* Count in framebuffer cache flushes at the end of CS. */
	num_dw += 7; /* one SURFACE_SYNC and CACHE_FLUSH_AND_INV (r6xx-only) */

	/* Save 16 dwords for the fence mechanism. */
	num_dw += 16;

	if (ctx->screen->trace_bo) {
		num_dw += R600_TRACE_CS_DWORDS;
	}

	/* Flush if there's not enough space. */
	if (num_dw > RADEON_MAX_CMDBUF_DWORDS) {
		radeonsi_flush(&ctx->context, NULL, RADEON_FLUSH_ASYNC);
	}
}
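/*
 * Usage sketch (added commentary, not in the original source): callers
 * reserve worst-case space first, then write packets, so cs->cdw can never
 * run past RADEON_MAX_CMDBUF_DWORDS between two reservations:
 *
 *     si_need_cs_space(ctx, 2, FALSE);
 *     cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
 *     cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4);
 */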
static void r600_flush_framebuffer(struct r600_context *ctx)
{
	struct si_pm4_state *pm4;

	if (!(ctx->flags & R600_CONTEXT_DST_CACHES_DIRTY))
		return;

	pm4 = si_pm4_alloc_state(ctx);
	if (pm4 == NULL)
		return;

	si_cmd_surface_sync(pm4, S_0085F0_CB0_DEST_BASE_ENA(1) |
			    S_0085F0_CB1_DEST_BASE_ENA(1) |
			    S_0085F0_CB2_DEST_BASE_ENA(1) |
			    S_0085F0_CB3_DEST_BASE_ENA(1) |
			    S_0085F0_CB4_DEST_BASE_ENA(1) |
			    S_0085F0_CB5_DEST_BASE_ENA(1) |
			    S_0085F0_CB6_DEST_BASE_ENA(1) |
			    S_0085F0_CB7_DEST_BASE_ENA(1) |
			    S_0085F0_DB_ACTION_ENA(1) |
			    S_0085F0_DB_DEST_BASE_ENA(1));
	si_cmd_flush_and_inv_cb_meta(pm4);

	si_pm4_emit(ctx, pm4);
	si_pm4_free_state(ctx, pm4, ~0);

	ctx->flags &= ~R600_CONTEXT_DST_CACHES_DIRTY;
	ctx->flush_and_inv_cb_meta = false;
}
void si_context_flush(struct r600_context *ctx, unsigned flags)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	bool queries_suspended = false;
	bool streamout_suspended = false;

	if (!cs->cdw)
		return;

	/* suspend queries */
	if (ctx->num_cs_dw_queries_suspend) {
		r600_context_queries_suspend(ctx);
		queries_suspended = true;
	}

	if (ctx->num_cs_dw_streamout_end) {
		r600_context_streamout_end(ctx);
		streamout_suspended = true;
	}

	r600_flush_framebuffer(ctx);

	/* partial flush is needed to avoid lockups on some chips with user fences */
	cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
	cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4);

	/* force to keep tiling flags */
	flags |= RADEON_FLUSH_KEEP_TILING_FLAGS;

	if (ctx->screen->trace_bo) {
		struct r600_screen *rscreen = ctx->screen;
		unsigned i;

		for (i = 0; i < cs->cdw; i++) {
			fprintf(stderr, "[%4d] [%5d] 0x%08x\n", rscreen->cs_count, i, cs->buf[i]);
		}
		rscreen->cs_count++;
	}

	/* Flush the CS. */
	ctx->ws->cs_flush(ctx->cs, flags, 0);

	if (ctx->screen->trace_bo) {
		struct r600_screen *rscreen = ctx->screen;
		unsigned i;

		for (i = 0; i < 10; i++) {
			usleep(5);
			if (!ctx->ws->buffer_is_busy(rscreen->trace_bo->buf, RADEON_USAGE_READWRITE)) {
				break;
			}
		}
		if (i == 10) {
			fprintf(stderr, "timeout on cs lockup likely happen at cs %d dw %d\n",
				rscreen->trace_ptr[1], rscreen->trace_ptr[0]);
		} else {
			fprintf(stderr, "cs %d executed in %dms\n", rscreen->trace_ptr[1], i * 5);
		}
	}

	ctx->pm4_dirty_cdwords = 0;

	/* set all valid group as dirty so they get reemited on
	 * next draw command
	 */
	si_pm4_reset_emitted(ctx);

	/* The CS initialization should be emitted before everything else. */
	si_pm4_emit(ctx, ctx->queued.named.init);
	ctx->emitted.named.init = ctx->queued.named.init;

	if (streamout_suspended) {
		ctx->streamout_start = TRUE;
		ctx->streamout_append_bitmask = ~0;
	}

	/* resume queries */
	if (queries_suspended) {
		r600_context_queries_resume(ctx);
	}

	si_all_descriptors_begin_new_cs(ctx);
}
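/*
 * Note (added commentary, not in the original source): queries and streamout
 * are bracketed around the flush because their begin/end packets reference
 * the old command stream; ending them before cs_flush and restarting them
 * afterwards keeps each begin/end pair within a single CS.
 */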
void si_context_emit_fence(struct r600_context *ctx, struct si_resource *fence_bo, unsigned offset, unsigned value)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	uint64_t va;

	si_need_cs_space(ctx, 10, FALSE);

	va = r600_resource_va(&ctx->screen->screen, (void*)fence_bo);
	va = va + (offset << 2);

	cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
	cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4);
	cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
	cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
	cs->buf[cs->cdw++] = va & 0xFFFFFFFFUL; /* ADDRESS_LO */
	/* DATA_SEL | INT_EN | ADDRESS_HI */
	cs->buf[cs->cdw++] = (1 << 29) | (0 << 24) | ((va >> 32UL) & 0xFF);
	cs->buf[cs->cdw++] = value; /* DATA_LO */
	cs->buf[cs->cdw++] = 0; /* DATA_HI */
	cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
	cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, fence_bo, RADEON_USAGE_WRITE);
}
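/*
 * Note (added commentary, not in the original source): in the EVENT_WRITE_EOP
 * packet above, the dword after ADDRESS_LO packs DATA_SEL in its high bits
 * (1 selects a 32-bit data write), INT_EN at bit 24 (0 = no interrupt) and
 * the upper address bits in the low byte, which is why
 * (1 << 29) | (0 << 24) emits a 32-bit fence write without an interrupt.
 */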
static unsigned r600_query_read_result(char *map, unsigned start_index, unsigned end_index,
				       bool test_status_bit)
{
	uint32_t *current_result = (uint32_t*)map;
	uint64_t start, end;

	start = (uint64_t)current_result[start_index] |
		(uint64_t)current_result[start_index+1] << 32;
	end = (uint64_t)current_result[end_index] |
	      (uint64_t)current_result[end_index+1] << 32;

	if (!test_status_bit ||
	    ((start & 0x8000000000000000UL) && (end & 0x8000000000000000UL))) {
		return end - start;
	}
	return 0;
}
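/*
 * Note (added commentary, not in the original source): each counter is stored
 * as a (begin, end) pair of 64-bit values and the GPU sets bit 63 of each
 * value once it has been written, which is the bit test_status_bit checks.
 * Example: begin = 0x8000000000000010 and end = 0x8000000000000025 are both
 * valid, so the function returns 0x15 = 21.
 */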
static boolean r600_query_result(struct r600_context *ctx, struct r600_query *query, boolean wait)
{
	unsigned results_base = query->results_start;
	char *map;

	map = ctx->ws->buffer_map(query->buffer->cs_buf, ctx->cs,
				  PIPE_TRANSFER_READ |
				  (wait ? 0 : PIPE_TRANSFER_DONTBLOCK));
	if (!map)
		return FALSE;

	/* count all results across all data blocks */
	switch (query->type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
		while (results_base != query->results_end) {
			query->result.u64 +=
				r600_query_read_result(map + results_base, 0, 2, true);
			results_base = (results_base + 16) % query->buffer->b.b.width0;
		}
		break;
	case PIPE_QUERY_OCCLUSION_PREDICATE:
		while (results_base != query->results_end) {
			query->result.b = query->result.b ||
				r600_query_read_result(map + results_base, 0, 2, true) != 0;
			results_base = (results_base + 16) % query->buffer->b.b.width0;
		}
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		while (results_base != query->results_end) {
			query->result.u64 +=
				r600_query_read_result(map + results_base, 0, 2, false);
			results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
		}
		break;
	case PIPE_QUERY_PRIMITIVES_EMITTED:
		/* SAMPLE_STREAMOUTSTATS stores this structure:
		 * {
		 *    u64 NumPrimitivesWritten;
		 *    u64 PrimitiveStorageNeeded;
		 * }
		 * We only need NumPrimitivesWritten here. */
		while (results_base != query->results_end) {
			query->result.u64 +=
				r600_query_read_result(map + results_base, 2, 6, true);
			results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
		}
		break;
	case PIPE_QUERY_PRIMITIVES_GENERATED:
		/* Here we read PrimitiveStorageNeeded. */
		while (results_base != query->results_end) {
			query->result.u64 +=
				r600_query_read_result(map + results_base, 0, 4, true);
			results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
		}
		break;
	case PIPE_QUERY_SO_STATISTICS:
		while (results_base != query->results_end) {
			query->result.so.num_primitives_written +=
				r600_query_read_result(map + results_base, 2, 6, true);
			query->result.so.primitives_storage_needed +=
				r600_query_read_result(map + results_base, 0, 4, true);
			results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
		}
		break;
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		while (results_base != query->results_end) {
			query->result.b = query->result.b ||
				r600_query_read_result(map + results_base, 2, 6, true) !=
				r600_query_read_result(map + results_base, 0, 4, true);
			results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
		}
		break;
	default:
		assert(0);
	}

	query->results_start = query->results_end;
	ctx->ws->buffer_unmap(query->buffer->cs_buf);
	return TRUE;
}
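/*
 * Note (added commentary, not in the original source): the results buffer is
 * a ring; results_start and results_end are byte offsets that advance modulo
 * width0. With width0 = 4096 and result_size = 32 the ring holds 128 result
 * blocks before r600_query_begin() has to drain it via r600_query_result().
 */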
void r600_query_begin(struct r600_context *ctx, struct r600_query *query)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	unsigned new_results_end, i;
	uint32_t *results;
	uint64_t va;

	si_need_cs_space(ctx, query->num_cs_dw * 2, TRUE);

	new_results_end = (query->results_end + query->result_size) % query->buffer->b.b.width0;

	/* collect current results if query buffer is full */
	if (new_results_end == query->results_start) {
		r600_query_result(ctx, query, TRUE);
	}

	switch (query->type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
	case PIPE_QUERY_OCCLUSION_PREDICATE:
		results = ctx->ws->buffer_map(query->buffer->cs_buf, ctx->cs, PIPE_TRANSFER_WRITE);

		results = (uint32_t*)((char*)results + query->results_end);
		memset(results, 0, query->result_size);

		/* Set top bits for unused backends */
		for (i = 0; i < ctx->max_db; i++) {
			if (!(ctx->backend_mask & (1<<i))) {
				results[(i * 4)+1] = 0x80000000;
				results[(i * 4)+3] = 0x80000000;
			}
		}
		ctx->ws->buffer_unmap(query->buffer->cs_buf);
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		break;
	case PIPE_QUERY_PRIMITIVES_EMITTED:
	case PIPE_QUERY_PRIMITIVES_GENERATED:
	case PIPE_QUERY_SO_STATISTICS:
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		results = ctx->ws->buffer_map(query->buffer->cs_buf, ctx->cs, PIPE_TRANSFER_WRITE);
		results = (uint32_t*)((char*)results + query->results_end);
		memset(results, 0, query->result_size);
		ctx->ws->buffer_unmap(query->buffer->cs_buf);
		break;
	default:
		assert(0);
	}

	/* emit begin query */
	va = r600_resource_va(&ctx->screen->screen, (void*)query->buffer);
	va += query->results_end;

	switch (query->type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
	case PIPE_QUERY_OCCLUSION_PREDICATE:
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1);
		cs->buf[cs->cdw++] = va;
		cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
		break;
	case PIPE_QUERY_PRIMITIVES_EMITTED:
	case PIPE_QUERY_PRIMITIVES_GENERATED:
	case PIPE_QUERY_SO_STATISTICS:
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_SAMPLE_STREAMOUTSTATS) | EVENT_INDEX(3);
		cs->buf[cs->cdw++] = query->results_end;
		cs->buf[cs->cdw++] = 0;
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
		cs->buf[cs->cdw++] = va;
		cs->buf[cs->cdw++] = (3 << 29) | ((va >> 32UL) & 0xFF);
		cs->buf[cs->cdw++] = 0;
		cs->buf[cs->cdw++] = 0;
		break;
	default:
		assert(0);
	}
	cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
	cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, query->buffer, RADEON_USAGE_WRITE);

	ctx->num_cs_dw_queries_suspend += query->num_cs_dw;
}
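/*
 * Note (added commentary, not in the original source): pre-setting 0x80000000
 * in the high dword of both the begin and end slots of unused backends makes
 * those slots look like valid zero-length results, so the status-bit test in
 * r600_query_read_result() passes even though no backend ever writes them.
 */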
void r600_query_end(struct r600_context *ctx, struct r600_query *query)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	uint64_t va;

	va = r600_resource_va(&ctx->screen->screen, (void*)query->buffer);

	switch (query->type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
	case PIPE_QUERY_OCCLUSION_PREDICATE:
		va += query->results_end + 8;
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1);
		cs->buf[cs->cdw++] = va;
		cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
		break;
	case PIPE_QUERY_PRIMITIVES_EMITTED:
	case PIPE_QUERY_PRIMITIVES_GENERATED:
	case PIPE_QUERY_SO_STATISTICS:
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_SAMPLE_STREAMOUTSTATS) | EVENT_INDEX(3);
		cs->buf[cs->cdw++] = query->results_end + query->result_size/2;
		cs->buf[cs->cdw++] = 0;
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		va += query->results_end + query->result_size/2;
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
		cs->buf[cs->cdw++] = va;
		cs->buf[cs->cdw++] = (3 << 29) | ((va >> 32UL) & 0xFF);
		cs->buf[cs->cdw++] = 0;
		cs->buf[cs->cdw++] = 0;
		break;
	default:
		assert(0);
	}
	cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
	cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, query->buffer, RADEON_USAGE_WRITE);

	query->results_end = (query->results_end + query->result_size) % query->buffer->b.b.width0;
	ctx->num_cs_dw_queries_suspend -= query->num_cs_dw;
}
void r600_query_predication(struct r600_context *ctx, struct r600_query *query, int operation,
			    int flag_wait)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	uint64_t va;

	if (operation == PREDICATION_OP_CLEAR) {
		si_need_cs_space(ctx, 3, FALSE);

		cs->buf[cs->cdw++] = PKT3(PKT3_SET_PREDICATION, 1, 0);
		cs->buf[cs->cdw++] = 0;
		cs->buf[cs->cdw++] = PRED_OP(PREDICATION_OP_CLEAR);
	} else {
		unsigned results_base = query->results_start;
		unsigned count;
		uint32_t op;

		/* find count of the query data blocks */
		count = (query->buffer->b.b.width0 + query->results_end - query->results_start) % query->buffer->b.b.width0;
		count /= query->result_size;

		si_need_cs_space(ctx, 5 * count, TRUE);

		op = PRED_OP(operation) | PREDICATION_DRAW_VISIBLE |
		     (flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW);
		va = r600_resource_va(&ctx->screen->screen, (void*)query->buffer);

		/* emit predicate packets for all data blocks */
		while (results_base != query->results_end) {
			cs->buf[cs->cdw++] = PKT3(PKT3_SET_PREDICATION, 1, 0);
			cs->buf[cs->cdw++] = (va + results_base) & 0xFFFFFFFFUL;
			cs->buf[cs->cdw++] = op | (((va + results_base) >> 32UL) & 0xFF);
			cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
			cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, query->buffer,
								   RADEON_USAGE_READ);
			results_base = (results_base + query->result_size) % query->buffer->b.b.width0;

			/* set CONTINUE bit for all packets except the first */
			op |= PREDICATION_CONTINUE;
		}
	}
}
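/*
 * Note (added commentary, not in the original source): each loop iteration
 * above emits 5 dwords (a 3-dword SET_PREDICATION plus a 2-dword NOP
 * relocation), which is why the reservation asks for 5 * count. The CONTINUE
 * bit chains the per-block predicates so the hardware combines their
 * visibility results instead of treating each packet as a fresh predicate.
 */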
struct r600_query *r600_context_query_create(struct r600_context *ctx, unsigned query_type)
{
	struct r600_query *query;
	unsigned buffer_size = 4096;

	query = CALLOC_STRUCT(r600_query);
	if (query == NULL)
		return NULL;

	query->type = query_type;

	switch (query_type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
	case PIPE_QUERY_OCCLUSION_PREDICATE:
		query->result_size = 16 * ctx->max_db;
		query->num_cs_dw = 6;
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		query->result_size = 16;
		query->num_cs_dw = 8;
		break;
	case PIPE_QUERY_PRIMITIVES_EMITTED:
	case PIPE_QUERY_PRIMITIVES_GENERATED:
	case PIPE_QUERY_SO_STATISTICS:
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		/* NumPrimitivesWritten, PrimitiveStorageNeeded. */
		query->result_size = 32;
		query->num_cs_dw = 6;
		break;
	default:
		assert(0);
		FREE(query);
		return NULL;
	}

	/* adjust buffer size to simplify offsets wrapping math */
	buffer_size -= buffer_size % query->result_size;

	/* Queries are normally read by the CPU after
	 * being written by the gpu, hence staging is probably a good
	 * usage pattern.
	 */
	query->buffer = si_resource_create_custom(&ctx->screen->screen,
						  PIPE_USAGE_STAGING,
						  buffer_size);
	if (!query->buffer) {
		FREE(query);
		return NULL;
	}
	return query;
}
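/*
 * Note (added commentary, not in the original source): the rounding above
 * keeps the buffer size an exact multiple of result_size so the modulo
 * arithmetic on results_start/results_end never splits a block. Example:
 * with max_db = 12 an occlusion query has result_size = 192, and
 * 4096 - (4096 % 192) = 4032 bytes.
 */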
void r600_context_query_destroy(struct r600_context *ctx, struct r600_query *query)
{
	si_resource_reference(&query->buffer, NULL);
	FREE(query);
}
boolean r600_context_query_result(struct r600_context *ctx,
				  struct r600_query *query,
				  boolean wait, void *vresult)
{
	boolean *result_b = (boolean*)vresult;
	uint64_t *result_u64 = (uint64_t*)vresult;
	struct pipe_query_data_so_statistics *result_so =
		(struct pipe_query_data_so_statistics*)vresult;

	if (!r600_query_result(ctx, query, wait))
		return FALSE;

	switch (query->type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
	case PIPE_QUERY_PRIMITIVES_EMITTED:
	case PIPE_QUERY_PRIMITIVES_GENERATED:
		*result_u64 = query->result.u64;
		break;
	case PIPE_QUERY_OCCLUSION_PREDICATE:
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		*result_b = query->result.b;
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		*result_u64 = (1000000 * query->result.u64) / ctx->screen->info.r600_clock_crystal_freq;
		break;
	case PIPE_QUERY_SO_STATISTICS:
		*result_so = query->result.so;
		break;
	default:
		assert(0);
	}
	return TRUE;
}
void r600_context_queries_suspend(struct r600_context *ctx)
{
	struct r600_query *query;

	LIST_FOR_EACH_ENTRY(query, &ctx->active_query_list, list) {
		r600_query_end(ctx, query);
	}
	assert(ctx->num_cs_dw_queries_suspend == 0);
}
void r600_context_queries_resume(struct r600_context *ctx)
{
	struct r600_query *query;

	assert(ctx->num_cs_dw_queries_suspend == 0);

	LIST_FOR_EACH_ENTRY(query, &ctx->active_query_list, list) {
		r600_query_begin(ctx, query);
	}
}
void r600_context_draw_opaque_count(struct r600_context *ctx, struct r600_so_target *t)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	si_need_cs_space(ctx, 14 + 21, TRUE);

	cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
	cs->buf[cs->cdw++] = (R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET - SI_CONTEXT_REG_OFFSET) >> 2;
	cs->buf[cs->cdw++] = 0;

	cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
	cs->buf[cs->cdw++] = (R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE - SI_CONTEXT_REG_OFFSET) >> 2;
	cs->buf[cs->cdw++] = t->stride >> 2;

	cs->buf[cs->cdw++] = PKT3(PKT3_COPY_DW, 4, 0);
	cs->buf[cs->cdw++] = COPY_DW_SRC_IS_MEM | COPY_DW_DST_IS_REG;
	cs->buf[cs->cdw++] = 0; /* src address lo */
	cs->buf[cs->cdw++] = 0; /* src address hi */
	cs->buf[cs->cdw++] = R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2; /* dst register */
	cs->buf[cs->cdw++] = 0; /* unused */

	cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
	cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, t->filled_size, RADEON_USAGE_READ);
}
void r600_trace_emit(struct r600_context *rctx)
{
	struct r600_screen *rscreen = rctx->screen;
	struct radeon_winsys_cs *cs = rctx->cs;
	uint64_t va;

	va = r600_resource_va(&rscreen->screen, (void*)rscreen->trace_bo);
	r600_context_bo_reloc(rctx, rscreen->trace_bo, RADEON_USAGE_READWRITE);
	cs->buf[cs->cdw++] = PKT3(PKT3_WRITE_DATA, 4, 0);
	cs->buf[cs->cdw++] = PKT3_WRITE_DATA_DST_SEL(PKT3_WRITE_DATA_DST_SEL_MEM_SYNC) |
			     PKT3_WRITE_DATA_WR_CONFIRM |
			     PKT3_WRITE_DATA_ENGINE_SEL(PKT3_WRITE_DATA_ENGINE_SEL_ME);
	cs->buf[cs->cdw++] = va & 0xFFFFFFFFUL;
	cs->buf[cs->cdw++] = (va >> 32UL) & 0xFFFFFFFFUL;
	cs->buf[cs->cdw++] = cs->cdw;
	cs->buf[cs->cdw++] = rscreen->cs_count;
}
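/*
 * Note (added commentary, not in the original source): the two values written
 * here are what the lockup check in si_context_flush() reads back as
 * trace_ptr[0] (the last dword count reached) and trace_ptr[1] (the CS
 * counter), assuming trace_ptr points at the CPU mapping of trace_bo.
 */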