/*
 * Copyright © 2008 Jérôme Glisse
 * Copyright © 2010 Marek Olšák <maraeo@gmail.com>
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
 * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 */

/*
    This file replaces libdrm's radeon_cs_gem with our own implementation.
    It's optimized specifically for Radeon DRM.
    Adding buffers and space checking are faster and simpler than their
    counterparts in libdrm (the time complexity of all the functions
    is O(1) in nearly all scenarios, thanks to hashing).

    It works like this:

    cs_add_buffer(cs, buf, read_domain, write_domain) adds a new relocation and
    also adds the size of 'buf' to the used_gart and used_vram winsys variables
    based on the domains, which are simply or'd for accounting purposes.
    The adding is skipped if the reloc is already present in the list, but it
    still accounts for any newly referenced domains.

    cs_validate is then called, which just checks:
        used_vram/gart < vram/gart_size * 0.8
    The 0.8 factor allows for some memory fragmentation. If the validation
    fails, the pipe driver flushes the CS and tries to do the validation again,
    i.e. it validates only that one operation. If it fails again, it drops
    the operation on the floor and prints some nasty message to stderr.
    (done in the pipe driver)

    cs_write_reloc(cs, buf) just writes a reloc that has been added using
    cs_add_buffer. The read_domain and write_domain parameters have been removed,
    because we already specify them in cs_add_buffer.
*/
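
/*
    A minimal sketch of that flow from the pipe driver's side, assuming a
    radeon_winsys pointer 'ws'. This block is illustrative only and not part
    of the winsys; 'flush_callback', 'flush_ctx', 'resource', 'priority' and
    'num_dw' are hypothetical placeholders, while the ctx_*/cs_* entry points
    and radeon_emit() correspond to the functions defined in this file:

        struct radeon_winsys_ctx *ctx = ws->ctx_create(ws);
        struct radeon_winsys_cs *cs =
            ws->cs_create(ctx, RING_GFX, flush_callback, flush_ctx);

        unsigned reloc_idx =
            ws->cs_add_buffer(cs, resource->buf, RADEON_USAGE_READWRITE,
                              RADEON_DOMAIN_VRAM, priority);

        if (!ws->cs_validate(cs) || !ws->cs_check_space(cs, num_dw))
            ...the pipe driver flushes and retries the operation...

        ...radeon_emit(cs, ...) packets that reference reloc_idx...

        struct pipe_fence_handle *fence = NULL;
        ws->cs_flush(cs, 0, &fence);
*/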

#include "radeon_drm_cs.h"

#include "util/u_memory.h"
#include "os/os_time.h"

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <xf86drm.h>

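
/* One relocation is four dwords: struct drm_radeon_cs_reloc consists of the
 * uint32_t fields handle, read_domains, write_domain and flags. */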
#define RELOC_DWORDS (sizeof(struct drm_radeon_cs_reloc) / sizeof(uint32_t))

static struct pipe_fence_handle *
radeon_cs_create_fence(struct radeon_winsys_cs *rcs);
static void radeon_fence_reference(struct pipe_fence_handle **dst,
                                   struct pipe_fence_handle *src);

static struct radeon_winsys_ctx *radeon_drm_ctx_create(struct radeon_winsys *ws)
{
    /* No context support here. Just return the winsys pointer
     * as the "context". */
    return (struct radeon_winsys_ctx*)ws;
}

static void radeon_drm_ctx_destroy(struct radeon_winsys_ctx *ctx)
{
    /* No context support here. */
}

static bool radeon_init_cs_context(struct radeon_cs_context *csc,
                                   struct radeon_drm_winsys *ws)
{
    int i;

    csc->fd = ws->fd;

    csc->chunks[0].chunk_id = RADEON_CHUNK_ID_IB;
    csc->chunks[0].length_dw = 0;
    csc->chunks[0].chunk_data = (uint64_t)(uintptr_t)csc->buf;
    csc->chunks[1].chunk_id = RADEON_CHUNK_ID_RELOCS;
    csc->chunks[1].length_dw = 0;
    csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
    csc->chunks[2].chunk_id = RADEON_CHUNK_ID_FLAGS;
    csc->chunks[2].length_dw = 2;
    csc->chunks[2].chunk_data = (uint64_t)(uintptr_t)&csc->flags;

    csc->chunk_array[0] = (uint64_t)(uintptr_t)&csc->chunks[0];
    csc->chunk_array[1] = (uint64_t)(uintptr_t)&csc->chunks[1];
    csc->chunk_array[2] = (uint64_t)(uintptr_t)&csc->chunks[2];

    csc->cs.chunks = (uint64_t)(uintptr_t)csc->chunk_array;

    for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
        csc->reloc_indices_hashlist[i] = -1;
    }
    return true;
}

static void radeon_cs_context_cleanup(struct radeon_cs_context *csc)
{
    unsigned i;

    for (i = 0; i < csc->num_relocs; i++) {
        p_atomic_dec(&csc->relocs_bo[i].bo->num_cs_references);
        radeon_bo_reference(&csc->relocs_bo[i].bo, NULL);
    }
    for (i = 0; i < csc->num_slab_buffers; ++i) {
        p_atomic_dec(&csc->slab_buffers[i].bo->num_cs_references);
        radeon_bo_reference(&csc->slab_buffers[i].bo, NULL);
    }

    csc->num_relocs = 0;
    csc->num_validated_relocs = 0;
    csc->num_slab_buffers = 0;
    csc->chunks[0].length_dw = 0;
    csc->chunks[1].length_dw = 0;

    for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
        csc->reloc_indices_hashlist[i] = -1;
    }
}

static void radeon_destroy_cs_context(struct radeon_cs_context *csc)
{
    radeon_cs_context_cleanup(csc);
    FREE(csc->slab_buffers);
    FREE(csc->relocs_bo);
    FREE(csc->relocs);
}


static struct radeon_winsys_cs *
radeon_drm_cs_create(struct radeon_winsys_ctx *ctx,
                     enum ring_type ring_type,
                     void (*flush)(void *ctx, unsigned flags,
                                   struct pipe_fence_handle **fence),
                     void *flush_ctx)
{
    struct radeon_drm_winsys *ws = (struct radeon_drm_winsys*)ctx;
    struct radeon_drm_cs *cs;

    cs = CALLOC_STRUCT(radeon_drm_cs);
    if (!cs) {
        return NULL;
    }
    util_queue_fence_init(&cs->flush_completed);

    cs->ws = ws;
    cs->flush_cs = flush;
    cs->flush_data = flush_ctx;

    if (!radeon_init_cs_context(&cs->csc1, cs->ws)) {
        FREE(cs);
        return NULL;
    }
    if (!radeon_init_cs_context(&cs->csc2, cs->ws)) {
        radeon_destroy_cs_context(&cs->csc1);
        FREE(cs);
        return NULL;
    }

    /* Set the first command buffer as current. */
    cs->csc = &cs->csc1;
    cs->cst = &cs->csc2;
    cs->base.current.buf = cs->csc->buf;
    cs->base.current.max_dw = ARRAY_SIZE(cs->csc->buf);
    cs->ring_type = ring_type;

    p_atomic_inc(&ws->num_cs);
    return &cs->base;
}

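/* Return the index of 'bo' in the buffer list of 'csc' (relocs_bo for real
 * buffers, slab_buffers for suballocated ones), or -1 if it is not there. */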
int radeon_lookup_buffer(struct radeon_cs_context *csc, struct radeon_bo *bo)
{
    unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
    struct radeon_bo_item *buffers;
    unsigned num_buffers;
    int i = csc->reloc_indices_hashlist[hash];

    if (bo->handle) {
        buffers = csc->relocs_bo;
        num_buffers = csc->num_relocs;
    } else {
        buffers = csc->slab_buffers;
        num_buffers = csc->num_slab_buffers;
    }

    /* Fast path: the hash entry is either empty (not found) or already
     * points at this BO (found). */
    if (i == -1 || (i < num_buffers && buffers[i].bo == bo))
        return i;

    /* Hash collision, look for the BO in the list of relocs linearly. */
    for (i = num_buffers - 1; i >= 0; i--) {
        if (buffers[i].bo == bo) {
            /* Put this reloc in the hash list.
             * This will prevent additional hash collisions if there are
             * several consecutive lookup_buffer calls for the same buffer.
             *
             * Example: Assuming buffers A,B,C collide in the hash list,
             * the following sequence of relocs:
             *         AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
             * will collide here: ^ and here:   ^,
             * meaning that we should get very few collisions in the end. */
            csc->reloc_indices_hashlist[hash] = i;
            return i;
        }
    }
    return -1;
}

static unsigned radeon_lookup_or_add_real_buffer(struct radeon_drm_cs *cs,
                                                 struct radeon_bo *bo)
{
    struct radeon_cs_context *csc = cs->csc;
    struct drm_radeon_cs_reloc *reloc;
    unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
    int i = -1;

    i = radeon_lookup_buffer(csc, bo);

    if (i >= 0) {
        /* For async DMA, every add_buffer call must add a buffer to the list
         * no matter how many duplicates there are. This is due to the fact
         * that the DMA CS checker doesn't use NOP packets for offset patching,
         * but always uses the i-th buffer from the list to patch the i-th
         * offset. If there are N offsets in a DMA CS, there must also be N
         * buffers in the relocation list.
         *
         * This doesn't have to be done if virtual memory is enabled,
         * because there is no offset patching with virtual memory.
         */
        if (cs->ring_type != RING_DMA || cs->ws->info.has_virtual_memory) {
            return i;
        }
    }

    /* New relocation, check if the backing array is large enough. */
    if (csc->num_relocs >= csc->max_relocs) {
        uint32_t size;
        csc->max_relocs = MAX2(csc->max_relocs + 16, (unsigned)(csc->max_relocs * 1.3));

        size = csc->max_relocs * sizeof(csc->relocs_bo[0]);
        csc->relocs_bo = realloc(csc->relocs_bo, size);

        size = csc->max_relocs * sizeof(struct drm_radeon_cs_reloc);
        csc->relocs = realloc(csc->relocs, size);

        csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
    }

    /* Initialize the new relocation. */
    csc->relocs_bo[csc->num_relocs].bo = NULL;
    csc->relocs_bo[csc->num_relocs].u.real.priority_usage = 0;
    radeon_bo_reference(&csc->relocs_bo[csc->num_relocs].bo, bo);
    p_atomic_inc(&bo->num_cs_references);
    reloc = &csc->relocs[csc->num_relocs];
    reloc->handle = bo->handle;
    reloc->read_domains = 0;
    reloc->write_domain = 0;
    reloc->flags = 0;

    csc->reloc_indices_hashlist[hash] = csc->num_relocs;

    csc->chunks[1].length_dw += RELOC_DWORDS;

    return csc->num_relocs++;
}

static int radeon_lookup_or_add_slab_buffer(struct radeon_drm_cs *cs,
                                            struct radeon_bo *bo)
{
    struct radeon_cs_context *csc = cs->csc;
    unsigned hash;
    struct radeon_bo_item *item;
    int idx;
    int real_idx;

    idx = radeon_lookup_buffer(csc, bo);
    if (idx >= 0)
        return idx;

    real_idx = radeon_lookup_or_add_real_buffer(cs, bo->u.slab.real);

    /* Check if the backing array is large enough. */
    if (csc->num_slab_buffers >= csc->max_slab_buffers) {
        unsigned new_max = MAX2(csc->max_slab_buffers + 16,
                                (unsigned)(csc->max_slab_buffers * 1.3));
        struct radeon_bo_item *new_buffers =
            REALLOC(csc->slab_buffers,
                    csc->max_slab_buffers * sizeof(*new_buffers),
                    new_max * sizeof(*new_buffers));
        if (!new_buffers) {
            fprintf(stderr, "radeon_lookup_or_add_slab_buffer: allocation failure\n");
            return -1;
        }

        csc->max_slab_buffers = new_max;
        csc->slab_buffers = new_buffers;
    }

    /* Initialize the new relocation. */
    idx = csc->num_slab_buffers++;
    item = &csc->slab_buffers[idx];

    item->bo = NULL;
    item->u.slab.real_idx = real_idx;
    radeon_bo_reference(&item->bo, bo);
    p_atomic_inc(&bo->num_cs_references);

    hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
    csc->reloc_indices_hashlist[hash] = idx;

    return idx;
}

static unsigned radeon_drm_cs_add_buffer(struct radeon_winsys_cs *rcs,
                                         struct pb_buffer *buf,
                                         enum radeon_bo_usage usage,
                                         enum radeon_bo_domain domains,
                                         enum radeon_bo_priority priority)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_bo *bo = (struct radeon_bo*)buf;
    enum radeon_bo_domain added_domains;
    enum radeon_bo_domain rd = usage & RADEON_USAGE_READ ? domains : 0;
    enum radeon_bo_domain wd = usage & RADEON_USAGE_WRITE ? domains : 0;
    struct drm_radeon_cs_reloc *reloc;
    int index;

    if (!bo->handle) {
        index = radeon_lookup_or_add_slab_buffer(cs, bo);
        if (index < 0)
            return 0;

        index = cs->csc->slab_buffers[index].u.slab.real_idx;
    } else {
        index = radeon_lookup_or_add_real_buffer(cs, bo);
    }

    reloc = &cs->csc->relocs[index];
    added_domains = (rd | wd) & ~(reloc->read_domains | reloc->write_domain);
    reloc->read_domains |= rd;
    reloc->write_domain |= wd;
    reloc->flags = MAX2(reloc->flags, priority);
    cs->csc->relocs_bo[index].u.real.priority_usage |= 1ull << priority;

    if (added_domains & RADEON_DOMAIN_VRAM)
        cs->base.used_vram += bo->base.size;
    else if (added_domains & RADEON_DOMAIN_GTT)
        cs->base.used_gart += bo->base.size;

    return index;
}

static int radeon_drm_cs_lookup_buffer(struct radeon_winsys_cs *rcs,
                                       struct pb_buffer *buf)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

    return radeon_lookup_buffer(cs->csc, (struct radeon_bo*)buf);
}

static bool radeon_drm_cs_validate(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    bool status =
        cs->base.used_gart < cs->ws->info.gart_size * 0.8 &&
        cs->base.used_vram < cs->ws->info.vram_size * 0.8;

    if (status) {
        cs->csc->num_validated_relocs = cs->csc->num_relocs;
    } else {
        /* Remove the recently added buffers. The validation failed with them
         * and the CS is about to be flushed because of that. Keep only
         * the already-validated buffers. */
        unsigned i;

        for (i = cs->csc->num_validated_relocs; i < cs->csc->num_relocs; i++) {
            p_atomic_dec(&cs->csc->relocs_bo[i].bo->num_cs_references);
            radeon_bo_reference(&cs->csc->relocs_bo[i].bo, NULL);
        }
        cs->csc->num_relocs = cs->csc->num_validated_relocs;

        /* Flush if there are any relocs. Clean up otherwise. */
        if (cs->csc->num_relocs) {
            cs->flush_cs(cs->flush_data, RADEON_FLUSH_ASYNC, NULL);
        } else {
            radeon_cs_context_cleanup(cs->csc);
            cs->base.used_vram = 0;
            cs->base.used_gart = 0;

            assert(cs->base.current.cdw == 0);
            if (cs->base.current.cdw != 0) {
                fprintf(stderr, "radeon: Unexpected error in %s.\n", __func__);
            }
        }
    }
    return status;
}
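
/* As a worked example of the 0.8 heuristic above: with 2048 MB of VRAM,
 * validation keeps succeeding until the CS references more than roughly
 * 1638 MB (2048 * 0.8) of VRAM, at which point the pipe driver is expected
 * to flush and retry. */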

static bool radeon_drm_cs_check_space(struct radeon_winsys_cs *rcs, unsigned dw)
{
    assert(rcs->current.cdw <= rcs->current.max_dw);
    return rcs->current.max_dw - rcs->current.cdw >= dw;
}

static unsigned radeon_drm_cs_get_buffer_list(struct radeon_winsys_cs *rcs,
                                              struct radeon_bo_list_item *list)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    int i;

    if (list) {
        for (i = 0; i < cs->csc->num_relocs; i++) {
            list[i].bo_size = cs->csc->relocs_bo[i].bo->base.size;
            list[i].vm_address = cs->csc->relocs_bo[i].bo->va;
            list[i].priority_usage = cs->csc->relocs_bo[i].u.real.priority_usage;
        }
    }
    return cs->csc->num_relocs;
}

void radeon_drm_cs_emit_ioctl_oneshot(void *job, int thread_index)
{
    struct radeon_cs_context *csc = ((struct radeon_drm_cs*)job)->cst;
    unsigned i;
    int r;

    r = drmCommandWriteRead(csc->fd, DRM_RADEON_CS,
                            &csc->cs, sizeof(struct drm_radeon_cs));
    if (r) {
        if (r == -ENOMEM)
            fprintf(stderr, "radeon: Not enough memory for command submission.\n");
        else if (debug_get_bool_option("RADEON_DUMP_CS", false)) {
            unsigned i;

            fprintf(stderr, "radeon: The kernel rejected CS, dumping...\n");
            for (i = 0; i < csc->chunks[0].length_dw; i++) {
                fprintf(stderr, "0x%08X\n", csc->buf[i]);
            }
        } else {
            fprintf(stderr, "radeon: The kernel rejected CS, "
                    "see dmesg for more information (%i).\n", r);
        }
    }

    for (i = 0; i < csc->num_relocs; i++)
        p_atomic_dec(&csc->relocs_bo[i].bo->num_active_ioctls);
    for (i = 0; i < csc->num_slab_buffers; i++)
        p_atomic_dec(&csc->slab_buffers[i].bo->num_active_ioctls);

    radeon_cs_context_cleanup(csc);
}

/*
 * Make sure any previous submission of this CS has completed.
 */
void radeon_drm_cs_sync_flush(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

    /* Wait for any pending ioctl of this CS to complete. */
    if (util_queue_is_initialized(&cs->ws->cs_queue))
        util_queue_fence_wait(&cs->flush_completed);
}

/* Add the given fence to a slab buffer fence list.
 *
 * There is a potential race condition when a bo participates in submissions
 * on two or more threads simultaneously. Since we do not know which of the
 * submissions will be sent to the GPU first, we have to keep the fences
 * of all submissions.
 *
 * However, fences that belong to submissions that have already returned from
 * their respective ioctl do not have to be kept, because we know that they
 * will signal earlier.
 */
static void radeon_bo_slab_fence(struct radeon_bo *bo, struct radeon_bo *fence)
{
    unsigned dst;

    assert(fence->num_cs_references);

    /* Cleanup older fences */
    dst = 0;
    for (unsigned src = 0; src < bo->u.slab.num_fences; ++src) {
        if (bo->u.slab.fences[src]->num_cs_references) {
            bo->u.slab.fences[dst] = bo->u.slab.fences[src];
            dst++;
        } else {
            radeon_bo_reference(&bo->u.slab.fences[src], NULL);
        }
    }
    bo->u.slab.num_fences = dst;

    /* Check available space for the new fence */
    if (bo->u.slab.num_fences >= bo->u.slab.max_fences) {
        unsigned new_max_fences = bo->u.slab.max_fences + 1;
        struct radeon_bo **new_fences = REALLOC(bo->u.slab.fences,
                                                bo->u.slab.max_fences * sizeof(*new_fences),
                                                new_max_fences * sizeof(*new_fences));
        if (!new_fences) {
            fprintf(stderr, "radeon_bo_slab_fence: allocation failure, dropping fence\n");
            return;
        }

        bo->u.slab.fences = new_fences;
        bo->u.slab.max_fences = new_max_fences;
    }

    /* Add the new fence */
    bo->u.slab.fences[bo->u.slab.num_fences] = NULL;
    radeon_bo_reference(&bo->u.slab.fences[bo->u.slab.num_fences], fence);
    bo->u.slab.num_fences++;
}
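
/* When the RADEON_NOOP environment variable is set to true,
 * radeon_drm_cs_flush below skips the kernel CS submission entirely and
 * only cleans up the command stream context. */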
DEBUG_GET_ONCE_BOOL_OPTION(noop, "RADEON_NOOP", false)

static int radeon_drm_cs_flush(struct radeon_winsys_cs *rcs,
                               unsigned flags,
                               struct pipe_fence_handle **pfence)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_cs_context *tmp;

    switch (cs->ring_type) {
    case RING_DMA:
        /* pad DMA ring to 8 DWs */
        if (cs->ws->info.chip_class <= SI) {
            while (rcs->current.cdw & 7)
                radeon_emit(&cs->base, 0xf0000000); /* NOP packet */
        } else {
            while (rcs->current.cdw & 7)
                radeon_emit(&cs->base, 0x00000000); /* NOP packet */
        }
        break;
    case RING_GFX:
        /* pad GFX ring to 8 DWs to meet CP fetch alignment requirements;
         * r6xx requires at least 4 DW alignment to avoid a hw bug.
         */
        if (cs->ws->info.gfx_ib_pad_with_type2) {
            while (rcs->current.cdw & 7)
                radeon_emit(&cs->base, 0x80000000); /* type2 nop packet */
        } else {
            while (rcs->current.cdw & 7)
                radeon_emit(&cs->base, 0xffff1000); /* type3 nop packet */
        }
        break;
    case RING_UVD:
        while (rcs->current.cdw & 15)
            radeon_emit(&cs->base, 0x80000000); /* type2 nop packet */
        break;
    default:
        break;
    }

    if (rcs->current.cdw > rcs->current.max_dw) {
        fprintf(stderr, "radeon: command stream overflowed\n");
    }

    if (pfence || cs->csc->num_slab_buffers) {
        struct pipe_fence_handle *fence;

        if (cs->next_fence) {
            fence = cs->next_fence;
            cs->next_fence = NULL;
        } else {
            fence = radeon_cs_create_fence(rcs);
        }

        if (fence) {
            if (pfence)
                radeon_fence_reference(pfence, fence);

            mtx_lock(&cs->ws->bo_fence_lock);
            for (unsigned i = 0; i < cs->csc->num_slab_buffers; ++i) {
                struct radeon_bo *bo = cs->csc->slab_buffers[i].bo;
                p_atomic_inc(&bo->num_active_ioctls);
                radeon_bo_slab_fence(bo, (struct radeon_bo *)fence);
            }
            mtx_unlock(&cs->ws->bo_fence_lock);

            radeon_fence_reference(&fence, NULL);
        }
    } else {
        radeon_fence_reference(&cs->next_fence, NULL);
    }

    radeon_drm_cs_sync_flush(rcs);

    /* Swap command streams. */
    tmp = cs->csc;
    cs->csc = cs->cst;
    cs->cst = tmp;

    /* If the CS is non-empty and has not overflowed, emit it in a separate thread. */
    if (cs->base.current.cdw && cs->base.current.cdw <= cs->base.current.max_dw && !debug_get_option_noop()) {
        unsigned i, num_relocs;

        num_relocs = cs->cst->num_relocs;

        cs->cst->chunks[0].length_dw = cs->base.current.cdw;

        for (i = 0; i < num_relocs; i++) {
            /* Update the number of active asynchronous CS ioctls for the buffer. */
            p_atomic_inc(&cs->cst->relocs_bo[i].bo->num_active_ioctls);
        }

        switch (cs->ring_type) {
        case RING_DMA:
            cs->cst->flags[0] = 0;
            cs->cst->flags[1] = RADEON_CS_RING_DMA;
            cs->cst->cs.num_chunks = 3;
            if (cs->ws->info.has_virtual_memory) {
                cs->cst->flags[0] |= RADEON_CS_USE_VM;
            }
            break;

        case RING_UVD:
            cs->cst->flags[0] = 0;
            cs->cst->flags[1] = RADEON_CS_RING_UVD;
            cs->cst->cs.num_chunks = 3;
            break;

        case RING_VCE:
            cs->cst->flags[0] = 0;
            cs->cst->flags[1] = RADEON_CS_RING_VCE;
            cs->cst->cs.num_chunks = 3;
            break;

        default:
        case RING_GFX:
        case RING_COMPUTE:
            cs->cst->flags[0] = RADEON_CS_KEEP_TILING_FLAGS;
            cs->cst->flags[1] = RADEON_CS_RING_GFX;
            cs->cst->cs.num_chunks = 3;

            if (cs->ws->info.has_virtual_memory) {
                cs->cst->flags[0] |= RADEON_CS_USE_VM;
                cs->cst->cs.num_chunks = 3;
            }
            if (flags & RADEON_FLUSH_END_OF_FRAME) {
                cs->cst->flags[0] |= RADEON_CS_END_OF_FRAME;
                cs->cst->cs.num_chunks = 3;
            }
            if (cs->ring_type == RING_COMPUTE) {
                cs->cst->flags[1] = RADEON_CS_RING_COMPUTE;
                cs->cst->cs.num_chunks = 3;
            }
            break;
        }

        if (util_queue_is_initialized(&cs->ws->cs_queue)) {
            util_queue_add_job(&cs->ws->cs_queue, cs, &cs->flush_completed,
                               radeon_drm_cs_emit_ioctl_oneshot, NULL);
            if (!(flags & RADEON_FLUSH_ASYNC))
                radeon_drm_cs_sync_flush(rcs);
        } else {
            radeon_drm_cs_emit_ioctl_oneshot(cs, 0);
        }
    } else {
        radeon_cs_context_cleanup(cs->cst);
    }

    /* Prepare a new CS. */
    cs->base.current.buf = cs->csc->buf;
    cs->base.current.cdw = 0;
    cs->base.used_vram = 0;
    cs->base.used_gart = 0;

    if (cs->ring_type == RING_GFX)
        cs->ws->num_gfx_IBs++;
    else if (cs->ring_type == RING_DMA)
        cs->ws->num_sdma_IBs++;
    return 0;
}

static void radeon_drm_cs_destroy(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

    radeon_drm_cs_sync_flush(rcs);
    util_queue_fence_destroy(&cs->flush_completed);
    radeon_cs_context_cleanup(&cs->csc1);
    radeon_cs_context_cleanup(&cs->csc2);
    p_atomic_dec(&cs->ws->num_cs);
    radeon_destroy_cs_context(&cs->csc1);
    radeon_destroy_cs_context(&cs->csc2);
    radeon_fence_reference(&cs->next_fence, NULL);
    FREE(cs);
}

static bool radeon_bo_is_referenced(struct radeon_winsys_cs *rcs,
                                    struct pb_buffer *_buf,
                                    enum radeon_bo_usage usage)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_bo *bo = (struct radeon_bo*)_buf;
    int index;

    if (!bo->num_cs_references)
        return false;

    index = radeon_lookup_buffer(cs->csc, bo);
    if (index == -1)
        return false;

    if (!bo->handle)
        index = cs->csc->slab_buffers[index].u.slab.real_idx;

    if ((usage & RADEON_USAGE_WRITE) && cs->csc->relocs[index].write_domain)
        return true;
    if ((usage & RADEON_USAGE_READ) && cs->csc->relocs[index].read_domains)
        return true;

    return false;
}

/* FENCES */
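/* Fences are implemented as dummy one-byte GTT buffers added to the CS as
 * relocations, so waiting on a fence (radeon_fence_wait) is simply waiting
 * for that buffer to become idle. */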

static struct pipe_fence_handle *
radeon_cs_create_fence(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct pb_buffer *fence;

    /* Create a fence, which is a dummy BO. */
    fence = cs->ws->base.buffer_create(&cs->ws->base, 1, 1,
                                       RADEON_DOMAIN_GTT, RADEON_FLAG_NO_SUBALLOC);
    if (!fence)
        return NULL;

    /* Add the fence as a dummy relocation. */
    cs->ws->base.cs_add_buffer(rcs, fence,
                               RADEON_USAGE_READWRITE, RADEON_DOMAIN_GTT,
                               RADEON_PRIO_FENCE);
    return (struct pipe_fence_handle*)fence;
}

static bool radeon_fence_wait(struct radeon_winsys *ws,
                              struct pipe_fence_handle *fence,
                              uint64_t timeout)
{
    return ws->buffer_wait((struct pb_buffer*)fence, timeout,
                           RADEON_USAGE_READWRITE);
}

static void radeon_fence_reference(struct pipe_fence_handle **dst,
                                   struct pipe_fence_handle *src)
{
    pb_reference((struct pb_buffer**)dst, (struct pb_buffer*)src);
}

static struct pipe_fence_handle *
radeon_drm_cs_get_next_fence(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct pipe_fence_handle *fence = NULL;

    if (cs->next_fence) {
        radeon_fence_reference(&fence, cs->next_fence);
        return fence;
    }

    fence = radeon_cs_create_fence(rcs);
    if (!fence)
        return NULL;

    radeon_fence_reference(&cs->next_fence, fence);
    return fence;
}

void radeon_drm_cs_init_functions(struct radeon_drm_winsys *ws)
{
    ws->base.ctx_create = radeon_drm_ctx_create;
    ws->base.ctx_destroy = radeon_drm_ctx_destroy;
    ws->base.cs_create = radeon_drm_cs_create;
    ws->base.cs_destroy = radeon_drm_cs_destroy;
    ws->base.cs_add_buffer = radeon_drm_cs_add_buffer;
    ws->base.cs_lookup_buffer = radeon_drm_cs_lookup_buffer;
    ws->base.cs_validate = radeon_drm_cs_validate;
    ws->base.cs_check_space = radeon_drm_cs_check_space;
    ws->base.cs_get_buffer_list = radeon_drm_cs_get_buffer_list;
    ws->base.cs_flush = radeon_drm_cs_flush;
    ws->base.cs_get_next_fence = radeon_drm_cs_get_next_fence;
    ws->base.cs_is_buffer_referenced = radeon_bo_is_referenced;
    ws->base.cs_sync_flush = radeon_drm_cs_sync_flush;
    ws->base.fence_wait = radeon_fence_wait;
    ws->base.fence_reference = radeon_fence_reference;
}