/*
 * Copyright © 2008 Jérôme Glisse
 * Copyright © 2010 Marek Olšák <maraeo@gmail.com>
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
 * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 */
/*
    This file replaces libdrm's radeon_cs_gem with our own implementation.
    It's optimized specifically for Radeon DRM.
    Adding buffers and space checking are faster and simpler than their
    counterparts in libdrm (the time complexity of all the functions
    is O(1) in nearly all scenarios, thanks to hashing).

    It works like this:

    cs_add_buffer(cs, buf, read_domain, write_domain) adds a new relocation
    and also adds the size of 'buf' to the used_gart and used_vram winsys
    variables based on the domains, which are simply OR'd for accounting
    purposes. The addition is skipped if the reloc is already present in
    the list, but it still accounts for any newly referenced domains.

    cs_validate is then called, which just checks:
        used_vram/gart < vram/gart_size * 0.8
    The 0.8 factor allows for some memory fragmentation. If the validation
    fails, the pipe driver flushes the CS and tries the validation again,
    i.e. it validates only that one operation. If that fails too, it drops
    the operation on the floor and prints some nasty message to stderr.
    (This is done in the pipe driver, not here.)

    cs_write_reloc(cs, buf) just writes a reloc that has been added using
    cs_add_buffer. The read_domain and write_domain parameters have been
    removed, because we already specify them in cs_add_buffer.
*/
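
/*
    A minimal sketch of that flow as seen from a pipe driver. The names
    'ws', 'cs', 'buf' and 'prio' are placeholders for objects the driver
    already owns; this illustrates the protocol above rather than any
    actual driver code:

       ws->cs_add_buffer(cs, buf, RADEON_USAGE_READWRITE,
                         RADEON_DOMAIN_VRAM, prio);
       if (!ws->cs_validate(cs)) {
          ws->cs_flush(cs, PIPE_FLUSH_ASYNC, NULL);
          if (!ws->cs_validate(cs))
             fprintf(stderr, "radeon: validation failed, dropping operation\n");
       }
*/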
#include "radeon_drm_cs.h"

#include "util/u_memory.h"
#include "util/os_time.h"

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <xf86drm.h>

#define RELOC_DWORDS (sizeof(struct drm_radeon_cs_reloc) / sizeof(uint32_t))
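/* struct drm_radeon_cs_reloc consists of four uint32_t words
 * (handle, read_domains, write_domain, flags), so RELOC_DWORDS is 4. */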
static struct pipe_fence_handle *
radeon_cs_create_fence(struct radeon_winsys_cs *rcs);
static void radeon_fence_reference(struct pipe_fence_handle **dst,
                                   struct pipe_fence_handle *src);
static struct radeon_winsys_ctx *radeon_drm_ctx_create(struct radeon_winsys *ws)
{
    /* No context support here. Just return the winsys pointer
     * as the "context". */
    return (struct radeon_winsys_ctx*)ws;
}
static void radeon_drm_ctx_destroy(struct radeon_winsys_ctx *ctx)
{
    /* No context support here. */
}
static bool radeon_init_cs_context(struct radeon_cs_context *csc,
                                   struct radeon_drm_winsys *ws)
{
    int i;

    csc->fd = ws->fd;

    csc->chunks[0].chunk_id = RADEON_CHUNK_ID_IB;
    csc->chunks[0].length_dw = 0;
    csc->chunks[0].chunk_data = (uint64_t)(uintptr_t)csc->buf;
    csc->chunks[1].chunk_id = RADEON_CHUNK_ID_RELOCS;
    csc->chunks[1].length_dw = 0;
    csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
    csc->chunks[2].chunk_id = RADEON_CHUNK_ID_FLAGS;
    csc->chunks[2].length_dw = 2;
    csc->chunks[2].chunk_data = (uint64_t)(uintptr_t)&csc->flags;

    csc->chunk_array[0] = (uint64_t)(uintptr_t)&csc->chunks[0];
    csc->chunk_array[1] = (uint64_t)(uintptr_t)&csc->chunks[1];
    csc->chunk_array[2] = (uint64_t)(uintptr_t)&csc->chunks[2];

    csc->cs.chunks = (uint64_t)(uintptr_t)csc->chunk_array;

    for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
        csc->reloc_indices_hashlist[i] = -1;
    }
    return true;
}
static void radeon_cs_context_cleanup(struct radeon_cs_context *csc)
{
    unsigned i;

    for (i = 0; i < csc->num_relocs; i++) {
        p_atomic_dec(&csc->relocs_bo[i].bo->num_cs_references);
        radeon_bo_reference(&csc->relocs_bo[i].bo, NULL);
    }
    for (i = 0; i < csc->num_slab_buffers; ++i) {
        p_atomic_dec(&csc->slab_buffers[i].bo->num_cs_references);
        radeon_bo_reference(&csc->slab_buffers[i].bo, NULL);
    }

    csc->num_relocs = 0;
    csc->num_validated_relocs = 0;
    csc->num_slab_buffers = 0;
    csc->chunks[0].length_dw = 0;
    csc->chunks[1].length_dw = 0;

    for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
        csc->reloc_indices_hashlist[i] = -1;
    }
}
static void radeon_destroy_cs_context(struct radeon_cs_context *csc)
{
    radeon_cs_context_cleanup(csc);
    FREE(csc->slab_buffers);
    FREE(csc->relocs_bo);
    FREE(csc->relocs);
}
static struct radeon_winsys_cs *
radeon_drm_cs_create(struct radeon_winsys_ctx *ctx,
                     enum ring_type ring_type,
                     void (*flush)(void *ctx, unsigned flags,
                                   struct pipe_fence_handle **fence),
                     void *flush_ctx)
{
    struct radeon_drm_winsys *ws = (struct radeon_drm_winsys*)ctx;
    struct radeon_drm_cs *cs;

    cs = CALLOC_STRUCT(radeon_drm_cs);
    if (!cs) {
        return NULL;
    }
    util_queue_fence_init(&cs->flush_completed);

    cs->ws = ws;
    cs->flush_cs = flush;
    cs->flush_data = flush_ctx;

    if (!radeon_init_cs_context(&cs->csc1, cs->ws)) {
        FREE(cs);
        return NULL;
    }
    if (!radeon_init_cs_context(&cs->csc2, cs->ws)) {
        radeon_destroy_cs_context(&cs->csc1);
        FREE(cs);
        return NULL;
    }

    /* Set the first command buffer as current. */
    cs->csc = &cs->csc1;
    cs->cst = &cs->csc2;
    cs->base.current.buf = cs->csc->buf;
    cs->base.current.max_dw = ARRAY_SIZE(cs->csc->buf);
    cs->ring_type = ring_type;

    p_atomic_inc(&ws->num_cs);
    return &cs->base;
}
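
/* Note the double buffering above: "csc" is the context currently being
 * recorded and "cst" is the one being submitted. radeon_drm_cs_flush swaps
 * them, so new commands can be recorded while the util_queue thread
 * performs the CS ioctl on the previous batch. */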
int radeon_lookup_buffer(struct radeon_cs_context *csc, struct radeon_bo *bo)
{
    unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
    struct radeon_bo_item *buffers;
    unsigned num_buffers;
    int i = csc->reloc_indices_hashlist[hash];

    if (bo->handle) {
        buffers = csc->relocs_bo;
        num_buffers = csc->num_relocs;
    } else {
        buffers = csc->slab_buffers;
        num_buffers = csc->num_slab_buffers;
    }

    /* not found or found */
    if (i == -1 || (i < num_buffers && buffers[i].bo == bo))
        return i;

    /* Hash collision, look for the BO in the list of relocs linearly. */
    for (i = num_buffers - 1; i >= 0; i--) {
        if (buffers[i].bo == bo) {
            /* Put this reloc in the hash list.
             * This will prevent additional hash collisions if there are
             * several consecutive lookup_buffer calls for the same buffer.
             *
             * Example: Assuming buffers A,B,C collide in the hash list,
             * the following sequence of relocs:
             *         AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
             * will collide here: ^ and here:   ^,
             * meaning that we should get very few collisions in the end. */
            csc->reloc_indices_hashlist[hash] = i;
            return i;
        }
    }
    return -1;
}
static unsigned radeon_lookup_or_add_real_buffer(struct radeon_drm_cs *cs,
                                                 struct radeon_bo *bo)
{
    struct radeon_cs_context *csc = cs->csc;
    struct drm_radeon_cs_reloc *reloc;
    unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
    int i = -1;

    i = radeon_lookup_buffer(csc, bo);

    if (i >= 0) {
        /* For async DMA, every add_buffer call must add a buffer to the list
         * no matter how many duplicates there are. This is due to the fact
         * the DMA CS checker doesn't use NOP packets for offset patching,
         * but always uses the i-th buffer from the list to patch the i-th
         * offset. If there are N offsets in a DMA CS, there must also be N
         * buffers in the relocation list.
         *
         * This doesn't have to be done if virtual memory is enabled,
         * because there is no offset patching with virtual memory.
         */
        if (cs->ring_type != RING_DMA || cs->ws->info.has_virtual_memory)
            return i;
    }

    /* New relocation, check if the backing array is large enough. */
    if (csc->num_relocs >= csc->max_relocs) {
        uint32_t size;
        csc->max_relocs = MAX2(csc->max_relocs + 16, (unsigned)(csc->max_relocs * 1.3));

        size = csc->max_relocs * sizeof(csc->relocs_bo[0]);
        csc->relocs_bo = realloc(csc->relocs_bo, size);

        size = csc->max_relocs * sizeof(struct drm_radeon_cs_reloc);
        csc->relocs = realloc(csc->relocs, size);

        csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
    }

    /* Initialize the new relocation. */
    csc->relocs_bo[csc->num_relocs].bo = NULL;
    csc->relocs_bo[csc->num_relocs].u.real.priority_usage = 0;
    radeon_bo_reference(&csc->relocs_bo[csc->num_relocs].bo, bo);
    p_atomic_inc(&bo->num_cs_references);
    reloc = &csc->relocs[csc->num_relocs];
    reloc->handle = bo->handle;
    reloc->read_domains = 0;
    reloc->write_domain = 0;
    reloc->flags = 0;

    csc->reloc_indices_hashlist[hash] = csc->num_relocs;

    csc->chunks[1].length_dw += RELOC_DWORDS;

    return csc->num_relocs++;
}
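
/* Slab-suballocated buffers need extra bookkeeping: they have no kernel
 * handle of their own (bo->handle is 0), so the kernel is instead given
 * the backing "real" BO. Each slab entry therefore stores u.slab.real_idx,
 * the index of that backing buffer in the relocation list. */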
static int radeon_lookup_or_add_slab_buffer(struct radeon_drm_cs *cs,
                                            struct radeon_bo *bo)
{
    struct radeon_cs_context *csc = cs->csc;
    unsigned hash;
    struct radeon_bo_item *item;
    int idx;
    int real_idx;

    idx = radeon_lookup_buffer(csc, bo);
    if (idx >= 0)
        return idx;

    real_idx = radeon_lookup_or_add_real_buffer(cs, bo->u.slab.real);

    /* Check if the backing array is large enough. */
    if (csc->num_slab_buffers >= csc->max_slab_buffers) {
        unsigned new_max = MAX2(csc->max_slab_buffers + 16,
                                (unsigned)(csc->max_slab_buffers * 1.3));
        struct radeon_bo_item *new_buffers =
            REALLOC(csc->slab_buffers,
                    csc->max_slab_buffers * sizeof(*new_buffers),
                    new_max * sizeof(*new_buffers));
        if (!new_buffers) {
            fprintf(stderr, "radeon_lookup_or_add_slab_buffer: allocation failure\n");
            return -1;
        }

        csc->max_slab_buffers = new_max;
        csc->slab_buffers = new_buffers;
    }

    /* Initialize the new relocation. */
    idx = csc->num_slab_buffers++;
    item = &csc->slab_buffers[idx];

    item->bo = NULL;
    item->u.slab.real_idx = real_idx;
    radeon_bo_reference(&item->bo, bo);
    p_atomic_inc(&bo->num_cs_references);

    hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
    csc->reloc_indices_hashlist[hash] = idx;

    return idx;
}
static unsigned radeon_drm_cs_add_buffer(struct radeon_winsys_cs *rcs,
                                         struct pb_buffer *buf,
                                         enum radeon_bo_usage usage,
                                         enum radeon_bo_domain domains,
                                         enum radeon_bo_priority priority)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_bo *bo = (struct radeon_bo*)buf;
    enum radeon_bo_domain added_domains;

    /* If VRAM is just stolen system memory, allow both VRAM and
     * GTT, whichever has free space. If a buffer is evicted from
     * VRAM to GTT, it will stay there.
     */
    if (!cs->ws->info.has_dedicated_vram)
        domains |= RADEON_DOMAIN_GTT;

    enum radeon_bo_domain rd = usage & RADEON_USAGE_READ ? domains : 0;
    enum radeon_bo_domain wd = usage & RADEON_USAGE_WRITE ? domains : 0;
    struct drm_radeon_cs_reloc *reloc;
    int index;

    if (!bo->handle) {
        index = radeon_lookup_or_add_slab_buffer(cs, bo);
        if (index < 0)
            return 0;

        index = cs->csc->slab_buffers[index].u.slab.real_idx;
    } else {
        index = radeon_lookup_or_add_real_buffer(cs, bo);
    }

    reloc = &cs->csc->relocs[index];
    added_domains = (rd | wd) & ~(reloc->read_domains | reloc->write_domain);
    reloc->read_domains |= rd;
    reloc->write_domain |= wd;
    reloc->flags = MAX2(reloc->flags, priority);
    cs->csc->relocs_bo[index].u.real.priority_usage |= 1ull << priority;

    if (added_domains & RADEON_DOMAIN_VRAM)
        cs->base.used_vram += bo->base.size;
    else if (added_domains & RADEON_DOMAIN_GTT)
        cs->base.used_gart += bo->base.size;

    return index;
}
static int radeon_drm_cs_lookup_buffer(struct radeon_winsys_cs *rcs,
                                       struct pb_buffer *buf)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

    return radeon_lookup_buffer(cs->csc, (struct radeon_bo*)buf);
}
static bool radeon_drm_cs_validate(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    bool status =
        cs->base.used_gart < cs->ws->info.gart_size * 0.8 &&
        cs->base.used_vram < cs->ws->info.vram_size * 0.8;

    if (status) {
        cs->csc->num_validated_relocs = cs->csc->num_relocs;
    } else {
        /* Remove lately-added buffers. The validation failed with them
         * and the CS is about to be flushed because of that. Keep only
         * the already-validated buffers. */
        unsigned i;

        for (i = cs->csc->num_validated_relocs; i < cs->csc->num_relocs; i++) {
            p_atomic_dec(&cs->csc->relocs_bo[i].bo->num_cs_references);
            radeon_bo_reference(&cs->csc->relocs_bo[i].bo, NULL);
        }
        cs->csc->num_relocs = cs->csc->num_validated_relocs;

        /* Flush if there are any relocs. Clean up otherwise. */
        if (cs->csc->num_relocs) {
            cs->flush_cs(cs->flush_data, PIPE_FLUSH_ASYNC, NULL);
        } else {
            radeon_cs_context_cleanup(cs->csc);
            cs->base.used_vram = 0;
            cs->base.used_gart = 0;

            assert(cs->base.current.cdw == 0);
            if (cs->base.current.cdw != 0) {
                fprintf(stderr, "radeon: Unexpected error in %s.\n", __func__);
            }
        }
    }
    return status;
}
static bool radeon_drm_cs_check_space(struct radeon_winsys_cs *rcs, unsigned dw)
{
    assert(rcs->current.cdw <= rcs->current.max_dw);
    return rcs->current.max_dw - rcs->current.cdw >= dw;
}
static unsigned radeon_drm_cs_get_buffer_list(struct radeon_winsys_cs *rcs,
                                              struct radeon_bo_list_item *list)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    int i;

    if (list) {
        for (i = 0; i < cs->csc->num_relocs; i++) {
            list[i].bo_size = cs->csc->relocs_bo[i].bo->base.size;
            list[i].vm_address = cs->csc->relocs_bo[i].bo->va;
            list[i].priority_usage = cs->csc->relocs_bo[i].u.real.priority_usage;
        }
    }
    return cs->csc->num_relocs;
}
void radeon_drm_cs_emit_ioctl_oneshot(void *job, int thread_index)
{
    struct radeon_cs_context *csc = ((struct radeon_drm_cs*)job)->cst;
    unsigned i;
    int r;

    r = drmCommandWriteRead(csc->fd, DRM_RADEON_CS,
                            &csc->cs, sizeof(struct drm_radeon_cs));
    if (r) {
        if (r == -ENOMEM)
            fprintf(stderr, "radeon: Not enough memory for command submission.\n");
        else if (debug_get_bool_option("RADEON_DUMP_CS", false)) {
            fprintf(stderr, "radeon: The kernel rejected CS, dumping...\n");
            for (i = 0; i < csc->chunks[0].length_dw; i++) {
                fprintf(stderr, "0x%08X\n", csc->buf[i]);
            }
        } else {
            fprintf(stderr, "radeon: The kernel rejected CS, "
                    "see dmesg for more information (%i).\n", r);
        }
    }

    for (i = 0; i < csc->num_relocs; i++)
        p_atomic_dec(&csc->relocs_bo[i].bo->num_active_ioctls);
    for (i = 0; i < csc->num_slab_buffers; i++)
        p_atomic_dec(&csc->slab_buffers[i].bo->num_active_ioctls);

    radeon_cs_context_cleanup(csc);
}
/*
 * Make sure previous submissions of this CS are completed.
 */
void radeon_drm_cs_sync_flush(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

    /* Wait for any pending ioctl of this CS to complete. */
    if (util_queue_is_initialized(&cs->ws->cs_queue))
        util_queue_fence_wait(&cs->flush_completed);
}
/* Add the given fence to a slab buffer fence list.
 *
 * There is a potential race condition when bo participates in submissions on
 * two or more threads simultaneously. Since we do not know which of the
 * submissions will be sent to the GPU first, we have to keep the fences
 * of all submissions.
 *
 * However, fences that belong to submissions that have already returned from
 * their respective ioctl do not have to be kept, because we know that they
 * will signal earlier.
 */
static void radeon_bo_slab_fence(struct radeon_bo *bo, struct radeon_bo *fence)
{
    unsigned dst;

    assert(fence->num_cs_references);

    /* Cleanup older fences */
    dst = 0;
    for (unsigned src = 0; src < bo->u.slab.num_fences; ++src) {
        if (bo->u.slab.fences[src]->num_cs_references) {
            bo->u.slab.fences[dst] = bo->u.slab.fences[src];
            dst++;
        } else {
            radeon_bo_reference(&bo->u.slab.fences[src], NULL);
        }
    }
    bo->u.slab.num_fences = dst;

    /* Check available space for the new fence */
    if (bo->u.slab.num_fences >= bo->u.slab.max_fences) {
        unsigned new_max_fences = bo->u.slab.max_fences + 1;
        struct radeon_bo **new_fences =
            REALLOC(bo->u.slab.fences,
                    bo->u.slab.max_fences * sizeof(*new_fences),
                    new_max_fences * sizeof(*new_fences));
        if (!new_fences) {
            fprintf(stderr, "radeon_bo_slab_fence: allocation failure, dropping fence\n");
            return;
        }

        bo->u.slab.fences = new_fences;
        bo->u.slab.max_fences = new_max_fences;
    }

    /* Add the new fence */
    bo->u.slab.fences[bo->u.slab.num_fences] = NULL;
    radeon_bo_reference(&bo->u.slab.fences[bo->u.slab.num_fences], fence);
    bo->u.slab.num_fences++;
}
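
/* If RADEON_NOOP=true, radeon_drm_cs_flush below skips the kernel
 * submission entirely and only cleans up the CS context, i.e. IBs are
 * dropped instead of being executed. */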
DEBUG_GET_ONCE_BOOL_OPTION(noop, "RADEON_NOOP", false)
static int radeon_drm_cs_flush(struct radeon_winsys_cs *rcs,
                               unsigned flags,
                               struct pipe_fence_handle **pfence)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_cs_context *tmp;

    switch (cs->ring_type) {
    case RING_DMA:
        /* pad DMA ring to 8 DWs */
        if (cs->ws->info.chip_class <= SI) {
            while (rcs->current.cdw & 7)
                radeon_emit(&cs->base, 0xf0000000); /* NOP packet */
        } else {
            while (rcs->current.cdw & 7)
                radeon_emit(&cs->base, 0x00000000); /* NOP packet */
        }
        break;
    case RING_GFX:
        /* pad GFX ring to 8 DWs to meet CP fetch alignment requirements;
         * r6xx requires at least 4 DW alignment to avoid a hw bug.
         */
        if (cs->ws->info.gfx_ib_pad_with_type2) {
            while (rcs->current.cdw & 7)
                radeon_emit(&cs->base, 0x80000000); /* type2 nop packet */
        } else {
            while (rcs->current.cdw & 7)
                radeon_emit(&cs->base, 0xffff1000); /* type3 nop packet */
        }
        break;
    case RING_UVD:
        while (rcs->current.cdw & 15)
            radeon_emit(&cs->base, 0x80000000); /* type2 nop packet */
        break;
    default:
        break;
    }

    if (rcs->current.cdw > rcs->current.max_dw) {
        fprintf(stderr, "radeon: command stream overflowed\n");
    }

    if (pfence || cs->csc->num_slab_buffers) {
        struct pipe_fence_handle *fence;

        if (cs->next_fence) {
            fence = cs->next_fence;
            cs->next_fence = NULL;
        } else {
            fence = radeon_cs_create_fence(rcs);
        }

        if (fence) {
            if (pfence)
                radeon_fence_reference(pfence, fence);

            mtx_lock(&cs->ws->bo_fence_lock);
            for (unsigned i = 0; i < cs->csc->num_slab_buffers; ++i) {
                struct radeon_bo *bo = cs->csc->slab_buffers[i].bo;
                p_atomic_inc(&bo->num_active_ioctls);
                radeon_bo_slab_fence(bo, (struct radeon_bo *)fence);
            }
            mtx_unlock(&cs->ws->bo_fence_lock);

            radeon_fence_reference(&fence, NULL);
        }
    } else {
        radeon_fence_reference(&cs->next_fence, NULL);
    }

    radeon_drm_cs_sync_flush(rcs);

    /* Swap command streams. */
    tmp = cs->csc;
    cs->csc = cs->cst;
    cs->cst = tmp;

    /* If the CS is not empty or overflowed, emit it in a separate thread. */
    if (cs->base.current.cdw && cs->base.current.cdw <= cs->base.current.max_dw &&
        !debug_get_option_noop()) {
        unsigned i, num_relocs;

        num_relocs = cs->cst->num_relocs;

        cs->cst->chunks[0].length_dw = cs->base.current.cdw;

        for (i = 0; i < num_relocs; i++) {
            /* Update the number of active asynchronous CS ioctls for the buffer. */
            p_atomic_inc(&cs->cst->relocs_bo[i].bo->num_active_ioctls);
        }

        switch (cs->ring_type) {
        case RING_DMA:
            cs->cst->flags[0] = 0;
            cs->cst->flags[1] = RADEON_CS_RING_DMA;
            cs->cst->cs.num_chunks = 3;
            if (cs->ws->info.has_virtual_memory) {
                cs->cst->flags[0] |= RADEON_CS_USE_VM;
            }
            break;

        case RING_UVD:
            cs->cst->flags[0] = 0;
            cs->cst->flags[1] = RADEON_CS_RING_UVD;
            cs->cst->cs.num_chunks = 3;
            break;

        case RING_VCE:
            cs->cst->flags[0] = 0;
            cs->cst->flags[1] = RADEON_CS_RING_VCE;
            cs->cst->cs.num_chunks = 3;
            break;

        default:
        case RING_GFX:
        case RING_COMPUTE:
            cs->cst->flags[0] = RADEON_CS_KEEP_TILING_FLAGS;
            cs->cst->flags[1] = RADEON_CS_RING_GFX;
            cs->cst->cs.num_chunks = 3;

            if (cs->ws->info.has_virtual_memory) {
                cs->cst->flags[0] |= RADEON_CS_USE_VM;
                cs->cst->cs.num_chunks = 3;
            }
            if (flags & PIPE_FLUSH_END_OF_FRAME) {
                cs->cst->flags[0] |= RADEON_CS_END_OF_FRAME;
                cs->cst->cs.num_chunks = 3;
            }
            if (cs->ring_type == RING_COMPUTE) {
                cs->cst->flags[1] = RADEON_CS_RING_COMPUTE;
                cs->cst->cs.num_chunks = 3;
            }
            break;
        }

        if (util_queue_is_initialized(&cs->ws->cs_queue)) {
            util_queue_add_job(&cs->ws->cs_queue, cs, &cs->flush_completed,
                               radeon_drm_cs_emit_ioctl_oneshot, NULL);
            if (!(flags & PIPE_FLUSH_ASYNC))
                radeon_drm_cs_sync_flush(rcs);
        } else {
            radeon_drm_cs_emit_ioctl_oneshot(cs, 0);
        }
    } else {
        radeon_cs_context_cleanup(cs->cst);
    }

    /* Prepare a new CS. */
    cs->base.current.buf = cs->csc->buf;
    cs->base.current.cdw = 0;
    cs->base.used_vram = 0;
    cs->base.used_gart = 0;

    if (cs->ring_type == RING_GFX)
        cs->ws->num_gfx_IBs++;
    else if (cs->ring_type == RING_DMA)
        cs->ws->num_sdma_IBs++;

    return 0;
}
static void radeon_drm_cs_destroy(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

    radeon_drm_cs_sync_flush(rcs);
    util_queue_fence_destroy(&cs->flush_completed);
    radeon_cs_context_cleanup(&cs->csc1);
    radeon_cs_context_cleanup(&cs->csc2);
    p_atomic_dec(&cs->ws->num_cs);
    radeon_destroy_cs_context(&cs->csc1);
    radeon_destroy_cs_context(&cs->csc2);
    radeon_fence_reference(&cs->next_fence, NULL);
    FREE(cs);
}
static bool radeon_bo_is_referenced(struct radeon_winsys_cs *rcs,
                                    struct pb_buffer *_buf,
                                    enum radeon_bo_usage usage)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_bo *bo = (struct radeon_bo*)_buf;
    int index;

    if (!bo->num_cs_references)
        return false;

    index = radeon_lookup_buffer(cs->csc, bo);
    if (index == -1)
        return false;

    if (!bo->handle)
        index = cs->csc->slab_buffers[index].u.slab.real_idx;

    if ((usage & RADEON_USAGE_WRITE) && cs->csc->relocs[index].write_domain)
        return true;
    if ((usage & RADEON_USAGE_READ) && cs->csc->relocs[index].read_domains)
        return true;

    return false;
}
static struct pipe_fence_handle *
radeon_cs_create_fence(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct pb_buffer *fence;

    /* Create a fence, which is a dummy BO. */
    fence = cs->ws->base.buffer_create(&cs->ws->base, 1, 1,
                                       RADEON_DOMAIN_GTT, RADEON_FLAG_NO_SUBALLOC);
    if (!fence)
        return NULL;

    /* Add the fence as a dummy relocation. */
    cs->ws->base.cs_add_buffer(rcs, fence,
                               RADEON_USAGE_READWRITE, RADEON_DOMAIN_GTT,
                               RADEON_PRIO_FENCE);
    return (struct pipe_fence_handle*)fence;
}
static bool radeon_fence_wait(struct radeon_winsys *ws,
                              struct pipe_fence_handle *fence,
                              uint64_t timeout)
{
    return ws->buffer_wait((struct pb_buffer*)fence, timeout,
                           RADEON_USAGE_READWRITE);
}
static void radeon_fence_reference(struct pipe_fence_handle **dst,
                                   struct pipe_fence_handle *src)
{
    pb_reference((struct pb_buffer**)dst, (struct pb_buffer*)src);
}
static struct pipe_fence_handle *
radeon_drm_cs_get_next_fence(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct pipe_fence_handle *fence = NULL;

    if (cs->next_fence) {
        radeon_fence_reference(&fence, cs->next_fence);
        return fence;
    }

    fence = radeon_cs_create_fence(rcs);
    if (!fence)
        return NULL;

    radeon_fence_reference(&cs->next_fence, fence);
    return fence;
}
static void
radeon_drm_cs_add_fence_dependency(struct radeon_winsys_cs *cs,
                                   struct pipe_fence_handle *fence)
{
    /* TODO: Handle the following unlikely multi-threaded scenario:
     *
     *  Thread 1 / Context 1                   Thread 2 / Context 2
     *  --------------------                   --------------------
     *  f = cs_get_next_fence()
     *                                         cs_add_fence_dependency(f)
     *                                         cs_flush()
     *  cs_flush()
     *
     * We currently assume that this does not happen because we don't support
     * asynchronous flushes on Radeon.
     */
}
void radeon_drm_cs_init_functions(struct radeon_drm_winsys *ws)
{
    ws->base.ctx_create = radeon_drm_ctx_create;
    ws->base.ctx_destroy = radeon_drm_ctx_destroy;
    ws->base.cs_create = radeon_drm_cs_create;
    ws->base.cs_destroy = radeon_drm_cs_destroy;
    ws->base.cs_add_buffer = radeon_drm_cs_add_buffer;
    ws->base.cs_lookup_buffer = radeon_drm_cs_lookup_buffer;
    ws->base.cs_validate = radeon_drm_cs_validate;
    ws->base.cs_check_space = radeon_drm_cs_check_space;
    ws->base.cs_get_buffer_list = radeon_drm_cs_get_buffer_list;
    ws->base.cs_flush = radeon_drm_cs_flush;
    ws->base.cs_get_next_fence = radeon_drm_cs_get_next_fence;
    ws->base.cs_is_buffer_referenced = radeon_bo_is_referenced;
    ws->base.cs_sync_flush = radeon_drm_cs_sync_flush;
    ws->base.cs_add_fence_dependency = radeon_drm_cs_add_fence_dependency;
    ws->base.fence_wait = radeon_fence_wait;
    ws->base.fence_reference = radeon_fence_reference;
}