/*
 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
#include "r600_hw_context_priv.h"
#include "radeonsi_pipe.h"
#include "sid.h"
#include "util/u_memory.h"
#include <errno.h>

#define GROUP_FORCE_NEW_BLOCK	0
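/*
 * Note on command emission (the pattern used throughout this file):
 * commands are written straight into the winsys command stream as 32-bit
 * dwords.  A PKT3(opcode, count, predicate) header dword is followed by
 * its payload, and every packet that references a buffer is followed by a
 * PKT3_NOP whose payload is the index returned by r600_context_bo_reloc(),
 * which lets the kernel patch in the buffer's final GPU address.
 */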
/* Get backends mask */
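/*
 * Two strategies: recent kernels report the backend map directly
 * (r600_backend_map_valid); otherwise we emit a ZPASS_DONE event into a
 * scratch buffer and infer which DBs exist from which per-DB result slots
 * the GPU actually wrote.  If both fail, the num_backends low bits are
 * set as a safe over-approximation.
 */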
void r600_get_backend_mask(struct r600_context *ctx)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	struct r600_resource *buffer;
	uint32_t *results;
	uint64_t va;
	unsigned num_backends = ctx->screen->info.r600_num_backends;
	unsigned i, mask = 0;

	/* if backend_map query is supported by the kernel */
	if (ctx->screen->info.r600_backend_map_valid) {
		unsigned num_tile_pipes = ctx->screen->info.r600_num_tile_pipes;
		unsigned backend_map = ctx->screen->info.r600_backend_map;
		unsigned item_width, item_mask;

		if (ctx->chip_class >= CAYMAN) {
			item_width = 4;
			item_mask = 0x7;
		}

		while(num_tile_pipes--) {
			i = backend_map & item_mask;
			mask |= (1<<i);
			backend_map >>= item_width;
		}
		if (mask != 0) {
			ctx->backend_mask = mask;
			return;
		}
	}

	/* otherwise backup path for older kernels */

	/* create buffer for event data */
	buffer = (struct r600_resource*)
		pipe_buffer_create(&ctx->screen->screen, PIPE_BIND_CUSTOM,
				   PIPE_USAGE_STAGING, ctx->max_db*16);
	if (!buffer)
		goto err;

	/* initialize buffer with zeroes */
	results = ctx->ws->buffer_map(buffer->cs_buf, ctx->cs, PIPE_TRANSFER_WRITE);
	if (!results)
		goto err;

	memset(results, 0, ctx->max_db * 4 * 4);
	ctx->ws->buffer_unmap(buffer->cs_buf);

	/* emit EVENT_WRITE for ZPASS_DONE */
	va = r600_resource_va(&ctx->screen->screen, (void *)buffer);
	cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
	cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1);
	cs->buf[cs->cdw++] = va;
	cs->buf[cs->cdw++] = va >> 32;

	cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
	cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, buffer, RADEON_USAGE_WRITE);

	/* analyze results */
	results = ctx->ws->buffer_map(buffer->cs_buf, ctx->cs, PIPE_TRANSFER_READ);
	if (results) {
		for(i = 0; i < ctx->max_db; i++) {
			/* at least highest bit will be set if backend is used */
			if (results[i*4 + 1])
				mask |= (1<<i);
		}
		ctx->ws->buffer_unmap(buffer->cs_buf);
	}

	pipe_resource_reference((struct pipe_resource**)&buffer, NULL);

	if (mask != 0) {
		ctx->backend_mask = mask;
		return;
	}

err:
	/* fallback to old method - set num_backends lower bits to 1 */
	ctx->backend_mask = (~((uint32_t)0))>>(32-num_backends);
	return;
}
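/*
 * PS_PARTIAL_FLUSH stalls the command processor until outstanding
 * pixel-shader work has drained.  It is only emitted while a draw is
 * actually pending, since it would otherwise be a pointless bubble.
 */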
static inline void r600_context_ps_partial_flush(struct r600_context *ctx)
{
	struct radeon_winsys_cs *cs = ctx->cs;

	if (!(ctx->flags & R600_CONTEXT_DRAW_PENDING))
		return;

	cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
	cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4);

	ctx->flags &= ~R600_CONTEXT_DRAW_PENDING;
}
void r600_init_cs(struct r600_context *ctx)
{
	struct radeon_winsys_cs *cs = ctx->cs;

	/* All asics require this one */
	cs->buf[cs->cdw++] = PKT3(PKT3_CONTEXT_CONTROL, 1, 0);
	cs->buf[cs->cdw++] = 0x80000000;
	cs->buf[cs->cdw++] = 0x80000000;

	ctx->init_dwords = cs->cdw;
}
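/*
 * Register blocks: each block caches a run of consecutive register values
 * together with a ready-to-copy PM4 template (SET_* packet header, one
 * dword per register, plus a PKT3_NOP relocation slot for every register
 * that references a buffer).  Dirty blocks sit on ctx->dirty and are
 * copied into the CS by r600_context_block_emit_dirty().
 */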
static void r600_init_block(struct r600_context *ctx,
			    struct r600_block *block,
			    const struct r600_reg *reg, int index, int nreg,
			    unsigned opcode, unsigned offset_base)
{
	int i = index;
	int j, n = nreg;

	/* initialize block */
	block->flags = 0;
	block->status |= R600_BLOCK_STATUS_DIRTY; /* dirty all blocks at start */
	block->start_offset = reg[i].offset;
	block->pm4[block->pm4_ndwords++] = PKT3(opcode, n, 0);
	block->pm4[block->pm4_ndwords++] = (block->start_offset - offset_base) >> 2;
	block->reg = &block->pm4[block->pm4_ndwords];
	block->pm4_ndwords += n;
	block->nreg = n + 1;
	block->nreg_dirty = n;
	LIST_INITHEAD(&block->list);
	LIST_INITHEAD(&block->enable_list);

	for (j = 0; j < n; j++) {
		if (reg[i+j].flags & REG_FLAG_DIRTY_ALWAYS) {
			block->flags |= REG_FLAG_DIRTY_ALWAYS;
		}
		if (reg[i+j].flags & REG_FLAG_ENABLE_ALWAYS) {
			if (!(block->status & R600_BLOCK_STATUS_ENABLED)) {
				block->status |= R600_BLOCK_STATUS_ENABLED;
				LIST_ADDTAIL(&block->enable_list, &ctx->enable_list);
				LIST_ADDTAIL(&block->list, &ctx->dirty);
			}
		}
		if (reg[i+j].flags & REG_FLAG_FLUSH_CHANGE) {
			block->flags |= REG_FLAG_FLUSH_CHANGE;
		}

		if (reg[i+j].flags & REG_FLAG_NEED_BO) {
			block->nbo++;
			assert(block->nbo < R600_BLOCK_MAX_BO);
			block->pm4_bo_index[j] = block->nbo;
			block->pm4[block->pm4_ndwords++] = PKT3(PKT3_NOP, 0, 0);
			block->pm4[block->pm4_ndwords++] = 0x00000000;
			block->reloc[block->nbo].bo_pm4_index = block->pm4_ndwords - 1;
		}
	}
	/* check that we stay in limit */
	assert(block->pm4_ndwords < R600_BLOCK_MAX_REG);
}
int r600_context_add_block(struct r600_context *ctx, const struct r600_reg *reg, unsigned nreg,
			   unsigned opcode, unsigned offset_base)
{
	struct r600_block *block;
	struct r600_range *range;
	int offset;

	for (unsigned i = 0, n = 0; i < nreg; i += n) {
		/* ignore new block marker */
		if (reg[i].offset == GROUP_FORCE_NEW_BLOCK) {
			n = 1;
			continue;
		}

		/* registers that need relocation are in their own group */
		/* find number of consecutive registers */
		n = 0;
		offset = reg[i].offset;
		while (reg[i + n].offset == offset) {
			n++;
			offset += 4;
		}
		/* the block's PM4 template holds at most R600_BLOCK_MAX_REG dwords */
		if (n >= (R600_BLOCK_MAX_REG - 2))
			return -ERANGE;
		/* allocate new block */
		block = calloc(1, sizeof(struct r600_block));
		if (block == NULL) {
			return -ENOMEM;
		}
		ctx->nblocks++;
		for (int j = 0; j < n; j++) {
			range = &ctx->range[CTX_RANGE_ID(reg[i + j].offset)];
			/* create block table if it doesn't exist */
			if (!range->blocks)
				range->blocks = calloc(1 << HASH_SHIFT, sizeof(void *));
			if (!range->blocks)
				return -ENOMEM;

			range->blocks[CTX_BLOCK_ID(reg[i + j].offset)] = block;
		}

		r600_init_block(ctx, block, reg, i, n, opcode, offset_base);
	}
	return 0;
}
void r600_context_fini(struct r600_context *ctx)
{
	struct r600_block *block;
	struct r600_range *range;

	for (int i = 0; i < NUM_RANGES; i++) {
		if (!ctx->range[i].blocks)
			continue;
		for (int j = 0; j < (1 << HASH_SHIFT); j++) {
			block = ctx->range[i].blocks[j];
			if (block) {
				for (int k = 0, offset = block->start_offset; k < block->nreg; k++, offset += 4) {
					range = &ctx->range[CTX_RANGE_ID(offset)];
					range->blocks[CTX_BLOCK_ID(offset)] = NULL;
				}
				for (int k = 1; k <= block->nbo; k++) {
					pipe_resource_reference((struct pipe_resource**)&block->reloc[k].bo, NULL);
				}
				free(block);
			}
		}
		free(ctx->range[i].blocks);
	}
	free(ctx->blocks);

	ctx->ws->cs_destroy(ctx->cs);
}
int r600_setup_block_table(struct r600_context *ctx)
{
	/* setup block table */
	int c = 0;
	ctx->blocks = calloc(ctx->nblocks, sizeof(void*));
	if (!ctx->blocks)
		return -ENOMEM;
	for (int i = 0; i < NUM_RANGES; i++) {
		if (!ctx->range[i].blocks)
			continue;
		for (int j = 0, add; j < (1 << HASH_SHIFT); j++) {
			if (!ctx->range[i].blocks[j])
				continue;

			add = 1;
			for (int k = 0; k < c; k++) {
				if (ctx->blocks[k] == ctx->range[i].blocks[j]) {
					add = 0;
					break;
				}
			}
			if (add) {
				assert(c < ctx->nblocks);
				ctx->blocks[c++] = ctx->range[i].blocks[j];
				j += (ctx->range[i].blocks[j]->nreg) - 1;
			}
		}
	}
	return 0;
}
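/*
 * Budget check: num_dw is grown by everything that might still have to
 * fit in this CS (dirty states, an upper-bound draw, query suspends, the
 * streamout epilogue, cache flushes and the fence), and the CS is flushed
 * early if the total would exceed RADEON_MAX_CMDBUF_DWORDS, because a
 * packet can never straddle two submissions.
 */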
void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw,
			boolean count_draw_in)
{
	struct r600_atom *state;

	/* The number of dwords we already used in the CS so far. */
	num_dw += ctx->cs->cdw;

	if (count_draw_in) {
		/* The number of dwords all the dirty states would take. */
		LIST_FOR_EACH_ENTRY(state, &ctx->dirty_states, head) {
			num_dw += state->num_dw;
		}

		num_dw += ctx->pm4_dirty_cdwords;

		/* The upper-bound of how much a draw command would take. */
		num_dw += R600_MAX_DRAW_CS_DWORDS;
	}

	/* Count in queries_suspend. */
	num_dw += ctx->num_cs_dw_queries_suspend;

	/* Count in streamout_end at the end of CS. */
	num_dw += ctx->num_cs_dw_streamout_end;

	/* Count in render_condition(NULL) at the end of CS. */
	if (ctx->predicate_drawing) {
		num_dw += 3;
	}

	/* Count in framebuffer cache flushes at the end of CS. */
	num_dw += 7; /* one SURFACE_SYNC and CACHE_FLUSH_AND_INV (r6xx-only) */

	/* Save 16 dwords for the fence mechanism. */
	num_dw += 16;

	/* Flush if there's not enough space. */
	if (num_dw > RADEON_MAX_CMDBUF_DWORDS) {
		radeonsi_flush(&ctx->context, NULL, RADEON_FLUSH_ASYNC);
	}
}
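/*
 * Marking a block dirty adds its dwords to pm4_dirty_cdwords so the space
 * check above can account for it; registers flagged FLUSH_CHANGE force a
 * PS partial flush before the new value is programmed.
 */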
void r600_context_dirty_block(struct r600_context *ctx,
			      struct r600_block *block,
			      int dirty, int index)
{
	if ((index + 1) > block->nreg_dirty)
		block->nreg_dirty = index + 1;

	if ((dirty != (block->status & R600_BLOCK_STATUS_DIRTY)) || !(block->status & R600_BLOCK_STATUS_ENABLED)) {
		block->status |= R600_BLOCK_STATUS_DIRTY;
		ctx->pm4_dirty_cdwords += block->pm4_ndwords;
		if (!(block->status & R600_BLOCK_STATUS_ENABLED)) {
			block->status |= R600_BLOCK_STATUS_ENABLED;
			LIST_ADDTAIL(&block->enable_list, &ctx->enable_list);
		}
		LIST_ADDTAIL(&block->list, &ctx->dirty);

		if (block->flags & REG_FLAG_FLUSH_CHANGE) {
			r600_context_ps_partial_flush(ctx);
		}
	}
}
void r600_context_pipe_state_set(struct r600_context *ctx, struct r600_pipe_state *state)
{
	struct r600_block *block;
	int dirty;
	for (int i = 0; i < state->nregs; i++) {
		unsigned id, reloc_id;
		struct r600_pipe_reg *reg = &state->regs[i];

		block = reg->block;
		id = reg->id;

		dirty = block->status & R600_BLOCK_STATUS_DIRTY;

		if (reg->value != block->reg[id]) {
			block->reg[id] = reg->value;
			dirty |= R600_BLOCK_STATUS_DIRTY;
		}
		if (block->flags & REG_FLAG_DIRTY_ALWAYS)
			dirty |= R600_BLOCK_STATUS_DIRTY;
		if (block->pm4_bo_index[id]) {
			/* find relocation */
			reloc_id = block->pm4_bo_index[id];
			pipe_resource_reference((struct pipe_resource**)&block->reloc[reloc_id].bo, &reg->bo->b.b);
			block->reloc[reloc_id].bo_usage = reg->bo_usage;
			/* always force dirty for relocs for now */
			dirty |= R600_BLOCK_STATUS_DIRTY;
		}

		r600_context_dirty_block(ctx, block, dirty, id);
	}
}
struct r600_resource *r600_context_reg_bo(struct r600_context *ctx, unsigned offset)
{
	struct r600_range *range;
	struct r600_block *block;
	unsigned id;

	range = &ctx->range[CTX_RANGE_ID(offset)];
	block = range->blocks[CTX_BLOCK_ID(offset)];
	offset -= block->start_offset;
	id = block->pm4_bo_index[offset >> 2];
	if (block->reloc[id].bo) {
		return block->reloc[id].bo;
	}
	return NULL;
}
void r600_context_block_emit_dirty(struct r600_context *ctx, struct r600_block *block)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	int optional = block->nbo == 0 && !(block->flags & REG_FLAG_DIRTY_ALWAYS);
	int cp_dwords = block->pm4_ndwords, start_dword = 0;
	int new_dwords = 0;
	int nbo = block->nbo;

	if (block->nreg_dirty == 0 && optional) {
		goto out;
	}

	if (nbo) {
		ctx->flags |= R600_CONTEXT_CHECK_EVENT_FLUSH;

		for (int j = 0; j < block->nreg; j++) {
			if (block->pm4_bo_index[j]) {
				/* find relocation */
				struct r600_block_reloc *reloc = &block->reloc[block->pm4_bo_index[j]];
				block->pm4[reloc->bo_pm4_index] =
					r600_context_bo_reloc(ctx, reloc->bo, reloc->bo_usage);
				nbo--;
				if (nbo == 0)
					break;
			}
		}
		ctx->flags &= ~R600_CONTEXT_CHECK_EVENT_FLUSH;
	}

	optional &= (block->nreg_dirty != block->nreg);
	if (optional) {
		new_dwords = block->nreg_dirty;
		start_dword = cs->cdw;
		cp_dwords = new_dwords + 2;
	}
	memcpy(&cs->buf[cs->cdw], block->pm4, cp_dwords * 4);
	cs->cdw += cp_dwords;

	if (optional) {
		uint32_t newword;

		newword = cs->buf[start_dword];
		newword &= PKT_COUNT_C;
		newword |= PKT_COUNT_S(new_dwords);
		cs->buf[start_dword] = newword;
	}
out:
	block->status ^= R600_BLOCK_STATUS_DIRTY;
	block->nreg_dirty = 0;
	LIST_DELINIT(&block->list);
}
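/*
 * The invalidation helpers below emit nothing by themselves: they only
 * accumulate S_0085F0_* bits in the SURFACE_SYNC atom's flush_flags and
 * mark the atom dirty, so several invalidations collapse into a single
 * packet at emit time.
 */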
void r600_inval_shader_cache(struct r600_context *ctx)
{
	ctx->atom_surface_sync.flush_flags |= S_0085F0_SH_ICACHE_ACTION_ENA(1);
	ctx->atom_surface_sync.flush_flags |= S_0085F0_SH_KCACHE_ACTION_ENA(1);
	r600_atom_dirty(ctx, &ctx->atom_surface_sync.atom);
}

void r600_inval_texture_cache(struct r600_context *ctx)
{
	ctx->atom_surface_sync.flush_flags |= S_0085F0_TC_ACTION_ENA(1);
	r600_atom_dirty(ctx, &ctx->atom_surface_sync.atom);
}

void r600_inval_vertex_cache(struct r600_context *ctx)
{
	/* Some GPUs don't have the vertex cache and must use the texture cache instead. */
	ctx->atom_surface_sync.flush_flags |= S_0085F0_TC_ACTION_ENA(1);
	r600_atom_dirty(ctx, &ctx->atom_surface_sync.atom);
}
void r600_flush_framebuffer(struct r600_context *ctx, bool flush_now)
{
	if (!(ctx->flags & R600_CONTEXT_DST_CACHES_DIRTY))
		return;

	ctx->atom_surface_sync.flush_flags |=
		r600_get_cb_flush_flags(ctx) |
		(ctx->framebuffer.zsbuf ? S_0085F0_DB_ACTION_ENA(1) | S_0085F0_DB_DEST_BASE_ENA(1) : 0);

	if (flush_now) {
		r600_emit_atom(ctx, &ctx->atom_surface_sync.atom);
	} else {
		r600_atom_dirty(ctx, &ctx->atom_surface_sync.atom);
	}

	ctx->flags &= ~R600_CONTEXT_DST_CACHES_DIRTY;
}
void r600_context_flush(struct r600_context *ctx, unsigned flags)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	struct r600_block *enable_block = NULL;
	bool queries_suspended = false;
	bool streamout_suspended = false;

	if (cs->cdw == ctx->init_dwords)
		return;

	/* suspend queries */
	if (ctx->num_cs_dw_queries_suspend) {
		r600_context_queries_suspend(ctx);
		queries_suspended = true;
	}

	if (ctx->num_cs_dw_streamout_end) {
		r600_context_streamout_end(ctx);
		streamout_suspended = true;
	}

	r600_flush_framebuffer(ctx, true);

	/* partial flush is needed to avoid lockups on some chips with user fences */
	cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
	cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4);

	/* Flush the CS. */
	ctx->ws->cs_flush(ctx->cs, flags);

	ctx->pm4_dirty_cdwords = 0;
	ctx->flags = 0;

	r600_init_cs(ctx);

	if (streamout_suspended) {
		ctx->streamout_start = TRUE;
		ctx->streamout_append_bitmask = ~0;
	}

	/* resume queries */
	if (queries_suspended) {
		r600_context_queries_resume(ctx);
	}

	/* set all valid groups as dirty so they get re-emitted on
	 * the next draw command
	 */
	LIST_FOR_EACH_ENTRY(enable_block, &ctx->enable_list, enable_list) {
		if(!(enable_block->status & R600_BLOCK_STATUS_DIRTY)) {
			LIST_ADDTAIL(&enable_block->list, &ctx->dirty);
			enable_block->status |= R600_BLOCK_STATUS_DIRTY;
		}
		ctx->pm4_dirty_cdwords += enable_block->pm4_ndwords;
		enable_block->nreg_dirty = enable_block->nreg;
	}
}
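/*
 * Fence: after a PS partial flush, EVENT_WRITE_EOP with
 * CACHE_FLUSH_AND_INV_TS makes the CP write 'value' to fence_bo at
 * 'offset' (in dwords) once all prior work has retired; DATA_SEL=1 in
 * the high-address dword selects a 32-bit data write, with the interrupt
 * disabled.
 */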
void r600_context_emit_fence(struct r600_context *ctx, struct r600_resource *fence_bo, unsigned offset, unsigned value)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	uint64_t va;

	r600_need_cs_space(ctx, 10, FALSE);

	va = r600_resource_va(&ctx->screen->screen, (void*)fence_bo);
	va = va + (offset << 2);

	cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
	cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4);
	cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
	cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
	cs->buf[cs->cdw++] = va & 0xFFFFFFFFUL; /* ADDRESS_LO */
	/* DATA_SEL | INT_EN | ADDRESS_HI */
	cs->buf[cs->cdw++] = (1 << 29) | (0 << 24) | ((va >> 32UL) & 0xFF);
	cs->buf[cs->cdw++] = value; /* DATA_LO */
	cs->buf[cs->cdw++] = 0; /* DATA_HI */
	cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
	cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, fence_bo, RADEON_USAGE_WRITE);
}
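/*
 * Results are stored as (start, end) pairs of 64-bit counters.  With
 * test_status_bit, bit 63 of each half doubles as a "result written"
 * flag, so a pair is counted only once both snapshots have landed;
 * e.g. start = 0x8000000000000005 and end = 0x8000000000000009
 * contribute 4.
 */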
static unsigned r600_query_read_result(char *map, unsigned start_index, unsigned end_index,
				       bool test_status_bit)
{
	uint32_t *current_result = (uint32_t*)map;
	uint64_t start, end;

	start = (uint64_t)current_result[start_index] |
		(uint64_t)current_result[start_index+1] << 32;
	end = (uint64_t)current_result[end_index] |
	      (uint64_t)current_result[end_index+1] << 32;

	if (!test_status_bit ||
	    ((start & 0x8000000000000000UL) && (end & 0x8000000000000000UL))) {
		return end - start;
	}
	return 0;
}
static boolean r600_query_result(struct r600_context *ctx, struct r600_query *query, boolean wait)
{
	unsigned results_base = query->results_start;
	char *map;

	map = ctx->ws->buffer_map(query->buffer->cs_buf, ctx->cs,
				  PIPE_TRANSFER_READ |
				  (wait ? 0 : PIPE_TRANSFER_DONTBLOCK));
	if (!map)
		return FALSE;

	/* count all results across all data blocks */
	switch (query->type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
		while (results_base != query->results_end) {
			query->result.u64 +=
				r600_query_read_result(map + results_base, 0, 2, true);
			results_base = (results_base + 16) % query->buffer->b.b.width0;
		}
		break;
	case PIPE_QUERY_OCCLUSION_PREDICATE:
		while (results_base != query->results_end) {
			query->result.b = query->result.b ||
				r600_query_read_result(map + results_base, 0, 2, true) != 0;
			results_base = (results_base + 16) % query->buffer->b.b.width0;
		}
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		while (results_base != query->results_end) {
			query->result.u64 +=
				r600_query_read_result(map + results_base, 0, 2, false);
			results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
		}
		break;
	case PIPE_QUERY_PRIMITIVES_EMITTED:
		/* SAMPLE_STREAMOUTSTATS stores this structure:
		 * {
		 *    u64 NumPrimitivesWritten;
		 *    u64 PrimitiveStorageNeeded;
		 * }
		 * We only need NumPrimitivesWritten here. */
		while (results_base != query->results_end) {
			query->result.u64 +=
				r600_query_read_result(map + results_base, 2, 6, true);
			results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
		}
		break;
	case PIPE_QUERY_PRIMITIVES_GENERATED:
		/* Here we read PrimitiveStorageNeeded. */
		while (results_base != query->results_end) {
			query->result.u64 +=
				r600_query_read_result(map + results_base, 0, 4, true);
			results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
		}
		break;
	case PIPE_QUERY_SO_STATISTICS:
		while (results_base != query->results_end) {
			query->result.so.num_primitives_written +=
				r600_query_read_result(map + results_base, 2, 6, true);
			query->result.so.primitives_storage_needed +=
				r600_query_read_result(map + results_base, 0, 4, true);
			results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
		}
		break;
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		while (results_base != query->results_end) {
			query->result.b = query->result.b ||
				r600_query_read_result(map + results_base, 2, 6, true) !=
				r600_query_read_result(map + results_base, 0, 4, true);
			results_base = (results_base + query->result_size) % query->buffer->b.b.width0;
		}
		break;
	default:
		assert(0);
	}

	query->results_start = query->results_end;
	ctx->ws->buffer_unmap(query->buffer->cs_buf);
	return TRUE;
}
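/*
 * query->buffer is used as a ring of result slots: results_start and
 * results_end chase each other modulo the buffer size, and a begin drains
 * accumulated results through r600_query_result() whenever the next slot
 * would overwrite data that has not been read back yet.
 */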
void r600_query_begin(struct r600_context *ctx, struct r600_query *query)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	unsigned new_results_end, i;
	uint32_t *results;
	uint64_t va;

	r600_need_cs_space(ctx, query->num_cs_dw * 2, TRUE);

	new_results_end = (query->results_end + query->result_size) % query->buffer->b.b.width0;

	/* collect current results if query buffer is full */
	if (new_results_end == query->results_start) {
		r600_query_result(ctx, query, TRUE);
	}

	switch (query->type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
	case PIPE_QUERY_OCCLUSION_PREDICATE:
		results = ctx->ws->buffer_map(query->buffer->cs_buf, ctx->cs, PIPE_TRANSFER_WRITE);
		if (results) {
			results = (uint32_t*)((char*)results + query->results_end);
			memset(results, 0, query->result_size);

			/* Set top bits for unused backends */
			for (i = 0; i < ctx->max_db; i++) {
				if (!(ctx->backend_mask & (1<<i))) {
					results[(i * 4)+1] = 0x80000000;
					results[(i * 4)+3] = 0x80000000;
				}
			}
			ctx->ws->buffer_unmap(query->buffer->cs_buf);
		}
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		break;
	case PIPE_QUERY_PRIMITIVES_EMITTED:
	case PIPE_QUERY_PRIMITIVES_GENERATED:
	case PIPE_QUERY_SO_STATISTICS:
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		results = ctx->ws->buffer_map(query->buffer->cs_buf, ctx->cs, PIPE_TRANSFER_WRITE);
		results = (uint32_t*)((char*)results + query->results_end);
		memset(results, 0, query->result_size);
		ctx->ws->buffer_unmap(query->buffer->cs_buf);
		break;
	default:
		assert(0);
	}

	/* emit begin query */
	va = r600_resource_va(&ctx->screen->screen, (void*)query->buffer);
	va += query->results_end;

	switch (query->type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
	case PIPE_QUERY_OCCLUSION_PREDICATE:
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1);
		cs->buf[cs->cdw++] = va;
		cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
		break;
	case PIPE_QUERY_PRIMITIVES_EMITTED:
	case PIPE_QUERY_PRIMITIVES_GENERATED:
	case PIPE_QUERY_SO_STATISTICS:
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_SAMPLE_STREAMOUTSTATS) | EVENT_INDEX(3);
		cs->buf[cs->cdw++] = query->results_end;
		cs->buf[cs->cdw++] = 0;
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
		cs->buf[cs->cdw++] = va;
		cs->buf[cs->cdw++] = (3 << 29) | ((va >> 32UL) & 0xFF);
		cs->buf[cs->cdw++] = 0;
		cs->buf[cs->cdw++] = 0;
		break;
	default:
		assert(0);
	}
	cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
	cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, query->buffer, RADEON_USAGE_WRITE);

	ctx->num_cs_dw_queries_suspend += query->num_cs_dw;
}
void r600_query_end(struct r600_context *ctx, struct r600_query *query)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	uint64_t va;

	va = r600_resource_va(&ctx->screen->screen, (void*)query->buffer);
	/* emit end query */
	switch (query->type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
	case PIPE_QUERY_OCCLUSION_PREDICATE:
		va += query->results_end + 8;
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1);
		cs->buf[cs->cdw++] = va;
		cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
		break;
	case PIPE_QUERY_PRIMITIVES_EMITTED:
	case PIPE_QUERY_PRIMITIVES_GENERATED:
	case PIPE_QUERY_SO_STATISTICS:
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_SAMPLE_STREAMOUTSTATS) | EVENT_INDEX(3);
		cs->buf[cs->cdw++] = query->results_end + query->result_size/2;
		cs->buf[cs->cdw++] = 0;
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		va += query->results_end + query->result_size/2;
		cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
		cs->buf[cs->cdw++] = va;
		cs->buf[cs->cdw++] = (3 << 29) | ((va >> 32UL) & 0xFF);
		cs->buf[cs->cdw++] = 0;
		cs->buf[cs->cdw++] = 0;
		break;
	default:
		assert(0);
	}
	cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
	cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, query->buffer, RADEON_USAGE_WRITE);

	query->results_end = (query->results_end + query->result_size) % query->buffer->b.b.width0;
	ctx->num_cs_dw_queries_suspend -= query->num_cs_dw;
}
void r600_query_predication(struct r600_context *ctx, struct r600_query *query, int operation,
			    int flag_wait)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	uint64_t va;

	if (operation == PREDICATION_OP_CLEAR) {
		r600_need_cs_space(ctx, 3, FALSE);

		cs->buf[cs->cdw++] = PKT3(PKT3_SET_PREDICATION, 1, 0);
		cs->buf[cs->cdw++] = 0;
		cs->buf[cs->cdw++] = PRED_OP(PREDICATION_OP_CLEAR);
	} else {
		unsigned results_base = query->results_start;
		unsigned count;
		uint32_t op;

		/* find count of the query data blocks */
		count = (query->buffer->b.b.width0 + query->results_end - query->results_start) % query->buffer->b.b.width0;
		count /= query->result_size;

		r600_need_cs_space(ctx, 5 * count, TRUE);

		op = PRED_OP(operation) | PREDICATION_DRAW_VISIBLE |
				(flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW);
		va = r600_resource_va(&ctx->screen->screen, (void*)query->buffer);

		/* emit predicate packets for all data blocks */
		while (results_base != query->results_end) {
			cs->buf[cs->cdw++] = PKT3(PKT3_SET_PREDICATION, 1, 0);
			cs->buf[cs->cdw++] = (va + results_base) & 0xFFFFFFFFUL;
			cs->buf[cs->cdw++] = op | (((va + results_base) >> 32UL) & 0xFF);
			cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
			cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, query->buffer,
								   RADEON_USAGE_READ);
			results_base = (results_base + query->result_size) % query->buffer->b.b.width0;

			/* set CONTINUE bit for all packets except the first */
			op |= PREDICATION_CONTINUE;
		}
	}
}
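/*
 * Result slot sizes: occlusion queries store a (begin, end) pair of ZPASS
 * counters per DB (16 bytes each), timestamps store two 8-byte values,
 * and the streamout statistics events store begin/end copies of the two
 * u64 counters (NumPrimitivesWritten, PrimitiveStorageNeeded), i.e. 32
 * bytes.  num_cs_dw is the size of one begin or end emission.
 */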
struct r600_query *r600_context_query_create(struct r600_context *ctx, unsigned query_type)
{
	struct r600_query *query;
	unsigned buffer_size = 4096;

	query = CALLOC_STRUCT(r600_query);
	if (query == NULL)
		return NULL;

	query->type = query_type;

	switch (query_type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
	case PIPE_QUERY_OCCLUSION_PREDICATE:
		query->result_size = 16 * ctx->max_db;
		query->num_cs_dw = 6;
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		query->result_size = 16;
		query->num_cs_dw = 8;
		break;
	case PIPE_QUERY_PRIMITIVES_EMITTED:
	case PIPE_QUERY_PRIMITIVES_GENERATED:
	case PIPE_QUERY_SO_STATISTICS:
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		/* NumPrimitivesWritten, PrimitiveStorageNeeded. */
		query->result_size = 32;
		query->num_cs_dw = 6;
		break;
	default:
		assert(0);
		FREE(query);
		return NULL;
	}

	/* adjust buffer size to simplify offsets wrapping math */
	buffer_size -= buffer_size % query->result_size;

	/* Queries are normally read by the CPU after
	 * being written by the GPU, hence staging is probably a good
	 * usage pattern.
	 */
	query->buffer = (struct r600_resource*)
		pipe_buffer_create(&ctx->screen->screen, PIPE_BIND_CUSTOM, PIPE_USAGE_STAGING, buffer_size);
	if (!query->buffer) {
		FREE(query);
		return NULL;
	}

	return query;
}
void r600_context_query_destroy(struct r600_context *ctx, struct r600_query *query)
{
	pipe_resource_reference((struct pipe_resource**)&query->buffer, NULL);
	FREE(query);
}
boolean r600_context_query_result(struct r600_context *ctx,
				  struct r600_query *query,
				  boolean wait, void *vresult)
{
	boolean *result_b = (boolean*)vresult;
	uint64_t *result_u64 = (uint64_t*)vresult;
	struct pipe_query_data_so_statistics *result_so =
		(struct pipe_query_data_so_statistics*)vresult;

	if (!r600_query_result(ctx, query, wait))
		return FALSE;

	switch (query->type) {
	case PIPE_QUERY_OCCLUSION_COUNTER:
	case PIPE_QUERY_PRIMITIVES_EMITTED:
	case PIPE_QUERY_PRIMITIVES_GENERATED:
		*result_u64 = query->result.u64;
		break;
	case PIPE_QUERY_OCCLUSION_PREDICATE:
	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
		*result_b = query->result.b;
		break;
	case PIPE_QUERY_TIME_ELAPSED:
		*result_u64 = (1000000 * query->result.u64) / ctx->screen->info.r600_clock_crystal_freq;
		break;
	case PIPE_QUERY_SO_STATISTICS:
		*result_so = query->result.so;
		break;
	default:
		assert(0);
	}
	return TRUE;
}
*ctx
)
954 struct r600_query
*query
;
956 LIST_FOR_EACH_ENTRY(query
, &ctx
->active_query_list
, list
) {
957 r600_query_end(ctx
, query
);
959 assert(ctx
->num_cs_dw_queries_suspend
== 0);
void r600_context_queries_resume(struct r600_context *ctx)
{
	struct r600_query *query;

	assert(ctx->num_cs_dw_queries_suspend == 0);

	LIST_FOR_EACH_ENTRY(query, &ctx->active_query_list, list) {
		r600_query_begin(ctx, query);
	}
}
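/*
 * For every enabled target, the buffer size, stride and base are
 * programmed, then a STRMOUT_BUFFER_UPDATE initializes the write offset:
 * either reloaded from the target's filled_size buffer (append) or reset
 * from the packet itself (start from the beginning).
 */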
void r600_context_streamout_begin(struct r600_context *ctx)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	struct r600_so_target **t = ctx->so_targets;
	unsigned *strides = ctx->vs_shader_so_strides;
	unsigned buffer_en, i;

	buffer_en = (ctx->num_so_targets >= 1 && t[0] ? 1 : 0) |
		    (ctx->num_so_targets >= 2 && t[1] ? 2 : 0) |
		    (ctx->num_so_targets >= 3 && t[2] ? 4 : 0) |
		    (ctx->num_so_targets >= 4 && t[3] ? 8 : 0);

	ctx->num_cs_dw_streamout_end =
		12 + /* flush_vgt_streamout */
		util_bitcount(buffer_en) * 8 + /* STRMOUT_BUFFER_UPDATE */
		3; /* set_streamout_enable(0) */

	r600_need_cs_space(ctx,
			   12 + /* flush_vgt_streamout */
			   6 + /* set_streamout_enable */
			   util_bitcount(buffer_en & ctx->streamout_append_bitmask) * 8 +
			   util_bitcount(buffer_en & ~ctx->streamout_append_bitmask) * 6 +
			   ctx->num_cs_dw_streamout_end, TRUE);

	if (ctx->chip_class >= CAYMAN) {
		evergreen_flush_vgt_streamout(ctx);
		evergreen_set_streamout_enable(ctx, buffer_en);
	}

	for (i = 0; i < ctx->num_so_targets; i++) {
		if (t[i]) {
			t[i]->stride = strides[i];
			t[i]->so_index = i;

			cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONTEXT_REG, 3, 0);
			cs->buf[cs->cdw++] = (R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 +
					      16*i - SI_CONTEXT_REG_OFFSET) >> 2;
			cs->buf[cs->cdw++] = (t[i]->b.buffer_offset +
					      t[i]->b.buffer_size) >> 2; /* BUFFER_SIZE (in DW) */
			cs->buf[cs->cdw++] = strides[i] >> 2; /* VTX_STRIDE (in DW) */
			cs->buf[cs->cdw++] = 0; /* BUFFER_BASE */

			cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
			cs->buf[cs->cdw++] =
				r600_context_bo_reloc(ctx, r600_resource(t[i]->b.buffer),
						      RADEON_USAGE_WRITE);

			if (ctx->streamout_append_bitmask & (1 << i)) {
				/* Append. */
				cs->buf[cs->cdw++] = PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0);
				cs->buf[cs->cdw++] = STRMOUT_SELECT_BUFFER(i) |
						     STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM); /* control */
				cs->buf[cs->cdw++] = 0; /* unused */
				cs->buf[cs->cdw++] = 0; /* unused */
				cs->buf[cs->cdw++] = 0; /* src address lo */
				cs->buf[cs->cdw++] = 0; /* src address hi */

				cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
				cs->buf[cs->cdw++] =
					r600_context_bo_reloc(ctx, t[i]->filled_size,
							      RADEON_USAGE_READ);
			} else {
				/* Start from the beginning. */
				cs->buf[cs->cdw++] = PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0);
				cs->buf[cs->cdw++] = STRMOUT_SELECT_BUFFER(i) |
						     STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET); /* control */
				cs->buf[cs->cdw++] = 0; /* unused */
				cs->buf[cs->cdw++] = 0; /* unused */
				cs->buf[cs->cdw++] = t[i]->b.buffer_offset >> 2; /* buffer offset in DW */
				cs->buf[cs->cdw++] = 0; /* unused */
			}
		}
	}
}
void r600_context_streamout_end(struct r600_context *ctx)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	struct r600_so_target **t = ctx->so_targets;
	unsigned i, flush_flags = 0;

	evergreen_flush_vgt_streamout(ctx);

	for (i = 0; i < ctx->num_so_targets; i++) {
		if (t[i]) {
			cs->buf[cs->cdw++] = PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0);
			cs->buf[cs->cdw++] = STRMOUT_SELECT_BUFFER(i) |
					     STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) |
					     STRMOUT_STORE_BUFFER_FILLED_SIZE; /* control */
			cs->buf[cs->cdw++] = 0; /* dst address lo */
			cs->buf[cs->cdw++] = 0; /* dst address hi */
			cs->buf[cs->cdw++] = 0; /* unused */
			cs->buf[cs->cdw++] = 0; /* unused */

			cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
			cs->buf[cs->cdw++] =
				r600_context_bo_reloc(ctx, t[i]->filled_size,
						      RADEON_USAGE_WRITE);

			flush_flags |= S_0085F0_SO0_DEST_BASE_ENA(1) << i;
		}
	}

	if (ctx->chip_class >= CAYMAN) {
		evergreen_set_streamout_enable(ctx, 0);
	}
	ctx->atom_surface_sync.flush_flags |= flush_flags;
	r600_atom_dirty(ctx, &ctx->atom_surface_sync.atom);

	ctx->num_cs_dw_streamout_end = 0;

	/* XXX print some debug info */
	for (i = 0; i < ctx->num_so_targets; i++) {
		if (!t[i])
			continue;

		uint32_t *ptr = ctx->ws->buffer_map(t[i]->filled_size->cs_buf, ctx->cs, RADEON_USAGE_READ);
		printf("FILLED_SIZE%i: %u\n", i, *ptr);
		ctx->ws->buffer_unmap(t[i]->filled_size->cs_buf);
	}
}
void r600_context_draw_opaque_count(struct r600_context *ctx, struct r600_so_target *t)
{
	struct radeon_winsys_cs *cs = ctx->cs;
	r600_need_cs_space(ctx, 14 + 21, TRUE);

	cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
	cs->buf[cs->cdw++] = (R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET - SI_CONTEXT_REG_OFFSET) >> 2;
	cs->buf[cs->cdw++] = 0;

	cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
	cs->buf[cs->cdw++] = (R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE - SI_CONTEXT_REG_OFFSET) >> 2;
	cs->buf[cs->cdw++] = t->stride >> 2;

	cs->buf[cs->cdw++] = PKT3(PKT3_COPY_DW, 4, 0);
	cs->buf[cs->cdw++] = COPY_DW_SRC_IS_MEM | COPY_DW_DST_IS_REG;
	cs->buf[cs->cdw++] = 0; /* src address lo */
	cs->buf[cs->cdw++] = 0; /* src address hi */
	cs->buf[cs->cdw++] = R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2; /* dst register */
	cs->buf[cs->cdw++] = 0; /* unused */

	cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
	cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, t->filled_size, RADEON_USAGE_READ);

#if 0 /* I have not found this useful yet. */
	cs->buf[cs->cdw++] = PKT3(PKT3_COPY_DW, 4, 0);
	cs->buf[cs->cdw++] = COPY_DW_SRC_IS_REG | COPY_DW_DST_IS_REG;
	cs->buf[cs->cdw++] = R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2; /* src register */
	cs->buf[cs->cdw++] = 0; /* unused */
	cs->buf[cs->cdw++] = R_0085F4_CP_COHER_SIZE >> 2; /* dst register */
	cs->buf[cs->cdw++] = 0; /* unused */

	cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONFIG_REG, 1, 0);
	cs->buf[cs->cdw++] = (R_0085F0_CP_COHER_CNTL - SI_CONFIG_REG_OFFSET) >> 2;
	cs->buf[cs->cdw++] = S_0085F0_SO0_DEST_BASE_ENA(1) << t->so_index;

	cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONFIG_REG, 1, 0);
	cs->buf[cs->cdw++] = (R_0085F8_CP_COHER_BASE - SI_CONFIG_REG_OFFSET) >> 2;
	cs->buf[cs->cdw++] = t->b.buffer_offset >> 2;

	cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
	cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, (struct r600_resource*)t->b.buffer,
						   RADEON_USAGE_WRITE);

	cs->buf[cs->cdw++] = PKT3(PKT3_WAIT_REG_MEM, 5, 0);
	cs->buf[cs->cdw++] = WAIT_REG_MEM_EQUAL; /* wait until the register is equal to the reference value */
	cs->buf[cs->cdw++] = R_0085FC_CP_COHER_STATUS >> 2; /* register */
	cs->buf[cs->cdw++] = 0;
	cs->buf[cs->cdw++] = 0; /* reference value */
	cs->buf[cs->cdw++] = 0xffffffff; /* mask */
	cs->buf[cs->cdw++] = 4; /* poll interval */