gallium/radeon: rename RADEON_FLAG_HANDLE -> RADEON_FLAG_NO_SUBALLOC
[mesa.git] / src / gallium / winsys / radeon / drm / radeon_drm_cs.c
/*
 * Copyright © 2008 Jérôme Glisse
 * Copyright © 2010 Marek Olšák <maraeo@gmail.com>
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
 * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 */
/*
 * Authors:
 *      Marek Olšák <maraeo@gmail.com>
 *
 * Based on work from libdrm_radeon by:
 *      Aapo Tahkola <aet@rasterburn.org>
 *      Nicolai Haehnle <prefect_@gmx.net>
 *      Jérôme Glisse <glisse@freedesktop.org>
 */

/*
    This file replaces libdrm's radeon_cs_gem with our own implementation.
    It's optimized specifically for Radeon DRM.
    Adding buffers and space checking are faster and simpler than their
    counterparts in libdrm (the time complexity of all the functions
    is O(1) in nearly all scenarios, thanks to hashing).

    It works like this:

    cs_add_buffer(cs, buf, read_domain, write_domain) adds a new relocation and
    also adds the size of 'buf' to the used_gart and used_vram winsys variables
    based on the domains, which are simply OR'd for accounting purposes.
    The addition is skipped if the reloc is already present in the list, but it
    still accounts for any newly-referenced domains.

    cs_validate is then called, which just checks:
        used_vram/gart < vram/gart_size * 0.8
    The 0.8 factor allows for some memory fragmentation. If the validation
    fails, the pipe driver flushes the CS and tries to do the validation again,
    i.e. it validates only that one operation. If it fails again, it drops
    the operation on the floor and prints some nasty message to stderr.
    (done in the pipe driver)

    cs_write_reloc(cs, buf) just writes a reloc that has been added using
    cs_add_buffer. The read_domain and write_domain parameters have been removed,
    because we already specify them in cs_add_buffer.
*/
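/* A minimal sketch of the flow described above, as seen from the pipe driver
 * (illustrative only; 'ws', 'ctx' and 'buf' are assumed to come from the
 * caller, and 'my_flush_cb', 'my_flush_ctx' and the chosen priority are
 * placeholders, not names defined in this file):
 *
 *    struct radeon_winsys_cs *cs =
 *        ws->cs_create(ctx, RING_GFX, my_flush_cb, my_flush_ctx);
 *
 *    ws->cs_add_buffer(cs, buf, RADEON_USAGE_READWRITE,
 *                      RADEON_DOMAIN_VRAM, priority);
 *    if (!ws->cs_validate(cs)) {
 *        // flush the CS and validate again; see the comment above
 *    }
 *    ws->cs_flush(cs, 0, NULL);
 */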

#include "radeon_drm_cs.h"

#include "util/u_memory.h"
#include "os/os_time.h"

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <xf86drm.h>


#define RELOC_DWORDS (sizeof(struct drm_radeon_cs_reloc) / sizeof(uint32_t))

static struct pipe_fence_handle *
radeon_cs_create_fence(struct radeon_winsys_cs *rcs);
static void radeon_fence_reference(struct pipe_fence_handle **dst,
                                   struct pipe_fence_handle *src);

static struct radeon_winsys_ctx *radeon_drm_ctx_create(struct radeon_winsys *ws)
{
    /* No context support here. Just return the winsys pointer
     * as the "context". */
    return (struct radeon_winsys_ctx*)ws;
}

static void radeon_drm_ctx_destroy(struct radeon_winsys_ctx *ctx)
{
    /* No context support here. */
}

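/* Initialize the kernel submission structures of one CS context (IB,
 * relocation and flags chunks) and clear its buffer-lookup hash list. */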
static bool radeon_init_cs_context(struct radeon_cs_context *csc,
                                   struct radeon_drm_winsys *ws)
{
    int i;

    csc->fd = ws->fd;

    csc->chunks[0].chunk_id = RADEON_CHUNK_ID_IB;
    csc->chunks[0].length_dw = 0;
    csc->chunks[0].chunk_data = (uint64_t)(uintptr_t)csc->buf;
    csc->chunks[1].chunk_id = RADEON_CHUNK_ID_RELOCS;
    csc->chunks[1].length_dw = 0;
    csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
    csc->chunks[2].chunk_id = RADEON_CHUNK_ID_FLAGS;
    csc->chunks[2].length_dw = 2;
    csc->chunks[2].chunk_data = (uint64_t)(uintptr_t)&csc->flags;

    csc->chunk_array[0] = (uint64_t)(uintptr_t)&csc->chunks[0];
    csc->chunk_array[1] = (uint64_t)(uintptr_t)&csc->chunks[1];
    csc->chunk_array[2] = (uint64_t)(uintptr_t)&csc->chunks[2];

    csc->cs.chunks = (uint64_t)(uintptr_t)csc->chunk_array;

    for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
        csc->reloc_indices_hashlist[i] = -1;
    }
    return true;
}

static void radeon_cs_context_cleanup(struct radeon_cs_context *csc)
{
    unsigned i;

    for (i = 0; i < csc->num_relocs; i++) {
        p_atomic_dec(&csc->relocs_bo[i].bo->num_cs_references);
        radeon_bo_reference(&csc->relocs_bo[i].bo, NULL);
    }
    for (i = 0; i < csc->num_slab_buffers; ++i) {
        p_atomic_dec(&csc->slab_buffers[i].bo->num_cs_references);
        radeon_bo_reference(&csc->slab_buffers[i].bo, NULL);
    }

    csc->num_relocs = 0;
    csc->num_validated_relocs = 0;
    csc->num_slab_buffers = 0;
    csc->chunks[0].length_dw = 0;
    csc->chunks[1].length_dw = 0;

    for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
        csc->reloc_indices_hashlist[i] = -1;
    }
}

static void radeon_destroy_cs_context(struct radeon_cs_context *csc)
{
    radeon_cs_context_cleanup(csc);
    FREE(csc->slab_buffers);
    FREE(csc->relocs_bo);
    FREE(csc->relocs);
}


static struct radeon_winsys_cs *
radeon_drm_cs_create(struct radeon_winsys_ctx *ctx,
                     enum ring_type ring_type,
                     void (*flush)(void *ctx, unsigned flags,
                                   struct pipe_fence_handle **fence),
                     void *flush_ctx)
{
    struct radeon_drm_winsys *ws = (struct radeon_drm_winsys*)ctx;
    struct radeon_drm_cs *cs;

    cs = CALLOC_STRUCT(radeon_drm_cs);
    if (!cs) {
        return NULL;
    }
    util_queue_fence_init(&cs->flush_completed);

    cs->ws = ws;
    cs->flush_cs = flush;
    cs->flush_data = flush_ctx;

    if (!radeon_init_cs_context(&cs->csc1, cs->ws)) {
        FREE(cs);
        return NULL;
    }
    if (!radeon_init_cs_context(&cs->csc2, cs->ws)) {
        radeon_destroy_cs_context(&cs->csc1);
        FREE(cs);
        return NULL;
    }

    /* Set the first command buffer as current. */
    cs->csc = &cs->csc1;
    cs->cst = &cs->csc2;
    cs->base.current.buf = cs->csc->buf;
    cs->base.current.max_dw = ARRAY_SIZE(cs->csc->buf);
    cs->ring_type = ring_type;

    p_atomic_inc(&ws->num_cs);
    return &cs->base;
}

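/* Return the index of 'bo' in the relocation list (real BOs) or in the slab
 * buffer list (suballocated BOs), or -1 if it hasn't been added yet.
 * A small hash list of recently used indices makes this O(1) in the
 * common case. */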
int radeon_lookup_buffer(struct radeon_cs_context *csc, struct radeon_bo *bo)
{
    unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
    struct radeon_bo_item *buffers;
    unsigned num_buffers;
    int i = csc->reloc_indices_hashlist[hash];

    if (bo->handle) {
        buffers = csc->relocs_bo;
        num_buffers = csc->num_relocs;
    } else {
        buffers = csc->slab_buffers;
        num_buffers = csc->num_slab_buffers;
    }

    /* Fast path: either not found (-1), or the cached index still points at this BO. */
    if (i == -1 || (i < num_buffers && buffers[i].bo == bo))
        return i;

    /* Hash collision, look for the BO in the list of relocs linearly. */
    for (i = num_buffers - 1; i >= 0; i--) {
        if (buffers[i].bo == bo) {
            /* Put this reloc in the hash list.
             * This will prevent additional hash collisions if there are
             * several consecutive lookup_buffer calls for the same buffer.
             *
             * Example: Assuming buffers A,B,C collide in the hash list,
             * the following sequence of relocs:
             *         AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
             * will collide here: ^ and here:   ^,
             * meaning that we should get very few collisions in the end. */
            csc->reloc_indices_hashlist[hash] = i;
            return i;
        }
    }
    return -1;
}

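/* Add a real (kernel-visible) BO to the relocation list, growing the backing
 * arrays if needed, and return its index in the list. */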
static unsigned radeon_lookup_or_add_real_buffer(struct radeon_drm_cs *cs,
                                                 struct radeon_bo *bo)
{
    struct radeon_cs_context *csc = cs->csc;
    struct drm_radeon_cs_reloc *reloc;
    unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
    int i = -1;

    i = radeon_lookup_buffer(csc, bo);

    if (i >= 0) {
        /* For async DMA, every add_buffer call must add a buffer to the list
         * no matter how many duplicates there are. This is due to the fact
         * the DMA CS checker doesn't use NOP packets for offset patching,
         * but always uses the i-th buffer from the list to patch the i-th
         * offset. If there are N offsets in a DMA CS, there must also be N
         * buffers in the relocation list.
         *
         * This doesn't have to be done if virtual memory is enabled,
         * because there is no offset patching with virtual memory.
         */
        if (cs->ring_type != RING_DMA || cs->ws->info.has_virtual_memory) {
            return i;
        }
    }

    /* New relocation, check if the backing array is large enough. */
    if (csc->num_relocs >= csc->max_relocs) {
        uint32_t size;
        csc->max_relocs = MAX2(csc->max_relocs + 16, (unsigned)(csc->max_relocs * 1.3));

        size = csc->max_relocs * sizeof(csc->relocs_bo[0]);
        csc->relocs_bo = realloc(csc->relocs_bo, size);

        size = csc->max_relocs * sizeof(struct drm_radeon_cs_reloc);
        csc->relocs = realloc(csc->relocs, size);

        csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
    }

    /* Initialize the new relocation. */
    csc->relocs_bo[csc->num_relocs].bo = NULL;
    csc->relocs_bo[csc->num_relocs].u.real.priority_usage = 0;
    radeon_bo_reference(&csc->relocs_bo[csc->num_relocs].bo, bo);
    p_atomic_inc(&bo->num_cs_references);
    reloc = &csc->relocs[csc->num_relocs];
    reloc->handle = bo->handle;
    reloc->read_domains = 0;
    reloc->write_domain = 0;
    reloc->flags = 0;

    csc->reloc_indices_hashlist[hash] = csc->num_relocs;

    csc->chunks[1].length_dw += RELOC_DWORDS;

    return csc->num_relocs++;
}

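/* Add a suballocated (slab) BO to the CS. The backing real BO is added as a
 * relocation and its index recorded, since the kernel only knows about real
 * BOs. Returns the slab buffer index, or -1 on allocation failure. */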
static int radeon_lookup_or_add_slab_buffer(struct radeon_drm_cs *cs,
                                            struct radeon_bo *bo)
{
    struct radeon_cs_context *csc = cs->csc;
    unsigned hash;
    struct radeon_bo_item *item;
    int idx;
    int real_idx;

    idx = radeon_lookup_buffer(csc, bo);
    if (idx >= 0)
        return idx;

    real_idx = radeon_lookup_or_add_real_buffer(cs, bo->u.slab.real);

    /* Check if the backing array is large enough. */
    if (csc->num_slab_buffers >= csc->max_slab_buffers) {
        unsigned new_max = MAX2(csc->max_slab_buffers + 16,
                                (unsigned)(csc->max_slab_buffers * 1.3));
        struct radeon_bo_item *new_buffers =
            REALLOC(csc->slab_buffers,
                    csc->max_slab_buffers * sizeof(*new_buffers),
                    new_max * sizeof(*new_buffers));
        if (!new_buffers) {
            fprintf(stderr, "radeon_lookup_or_add_slab_buffer: allocation failure\n");
            return -1;
        }

        csc->max_slab_buffers = new_max;
        csc->slab_buffers = new_buffers;
    }

    /* Initialize the new relocation. */
    idx = csc->num_slab_buffers++;
    item = &csc->slab_buffers[idx];

    item->bo = NULL;
    item->u.slab.real_idx = real_idx;
    radeon_bo_reference(&item->bo, bo);
    p_atomic_inc(&bo->num_cs_references);

    hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
    csc->reloc_indices_hashlist[hash] = idx;

    return idx;
}

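/* Add a buffer to the CS with the given usage, domains and priority, and
 * account its size against the used GTT/VRAM totals for any newly-referenced
 * domains. Returns the relocation index of the (real) buffer. */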
static unsigned radeon_drm_cs_add_buffer(struct radeon_winsys_cs *rcs,
                                         struct pb_buffer *buf,
                                         enum radeon_bo_usage usage,
                                         enum radeon_bo_domain domains,
                                         enum radeon_bo_priority priority)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_bo *bo = (struct radeon_bo*)buf;
    enum radeon_bo_domain added_domains;
    enum radeon_bo_domain rd = usage & RADEON_USAGE_READ ? domains : 0;
    enum radeon_bo_domain wd = usage & RADEON_USAGE_WRITE ? domains : 0;
    struct drm_radeon_cs_reloc *reloc;
    int index;

    if (!bo->handle) {
        index = radeon_lookup_or_add_slab_buffer(cs, bo);
        if (index < 0)
            return 0;

        index = cs->csc->slab_buffers[index].u.slab.real_idx;
    } else {
        index = radeon_lookup_or_add_real_buffer(cs, bo);
    }

    reloc = &cs->csc->relocs[index];
    added_domains = (rd | wd) & ~(reloc->read_domains | reloc->write_domain);
    reloc->read_domains |= rd;
    reloc->write_domain |= wd;
    reloc->flags = MAX2(reloc->flags, priority);
    cs->csc->relocs_bo[index].u.real.priority_usage |= 1llu << priority;

    if (added_domains & RADEON_DOMAIN_VRAM)
        cs->base.used_vram += bo->base.size;
    else if (added_domains & RADEON_DOMAIN_GTT)
        cs->base.used_gart += bo->base.size;

    return index;
}

static int radeon_drm_cs_lookup_buffer(struct radeon_winsys_cs *rcs,
                                       struct pb_buffer *buf)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

    return radeon_lookup_buffer(cs->csc, (struct radeon_bo*)buf);
}

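/* Check that the buffers added so far fit within 80% of VRAM/GTT.
 * On failure, drop the buffers added since the last successful validation
 * and either flush the CS (if it still references buffers) or reset it. */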
static bool radeon_drm_cs_validate(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    bool status =
        cs->base.used_gart < cs->ws->info.gart_size * 0.8 &&
        cs->base.used_vram < cs->ws->info.vram_size * 0.8;

    if (status) {
        cs->csc->num_validated_relocs = cs->csc->num_relocs;
    } else {
        /* Remove the recently-added buffers. The validation failed with them
         * and the CS is about to be flushed because of that. Keep only
         * the already-validated buffers. */
        unsigned i;

        for (i = cs->csc->num_validated_relocs; i < cs->csc->num_relocs; i++) {
            p_atomic_dec(&cs->csc->relocs_bo[i].bo->num_cs_references);
            radeon_bo_reference(&cs->csc->relocs_bo[i].bo, NULL);
        }
        cs->csc->num_relocs = cs->csc->num_validated_relocs;

        /* Flush if there are any relocs. Clean up otherwise. */
        if (cs->csc->num_relocs) {
            cs->flush_cs(cs->flush_data, RADEON_FLUSH_ASYNC, NULL);
        } else {
            radeon_cs_context_cleanup(cs->csc);
            cs->base.used_vram = 0;
            cs->base.used_gart = 0;

            assert(cs->base.current.cdw == 0);
            if (cs->base.current.cdw != 0) {
                fprintf(stderr, "radeon: Unexpected error in %s.\n", __func__);
            }
        }
    }
    return status;
}

static bool radeon_drm_cs_check_space(struct radeon_winsys_cs *rcs, unsigned dw)
{
    assert(rcs->current.cdw <= rcs->current.max_dw);
    return rcs->current.max_dw - rcs->current.cdw >= dw;
}

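/* Fill 'list' (if non-NULL) with the size, VM address and priority usage of
 * every real buffer referenced by the CS, and return the buffer count. */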
static unsigned radeon_drm_cs_get_buffer_list(struct radeon_winsys_cs *rcs,
                                              struct radeon_bo_list_item *list)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    int i;

    if (list) {
        for (i = 0; i < cs->csc->num_relocs; i++) {
            list[i].bo_size = cs->csc->relocs_bo[i].bo->base.size;
            list[i].vm_address = cs->csc->relocs_bo[i].bo->va;
            list[i].priority_usage = cs->csc->relocs_bo[i].u.real.priority_usage;
        }
    }
    return cs->csc->num_relocs;
}

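/* Submit the CS to the kernel via the DRM_RADEON_CS ioctl. This runs either
 * in the winsys CS thread or synchronously from radeon_drm_cs_flush. */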
void radeon_drm_cs_emit_ioctl_oneshot(void *job, int thread_index)
{
    struct radeon_cs_context *csc = ((struct radeon_drm_cs*)job)->cst;
    unsigned i;
    int r;

    r = drmCommandWriteRead(csc->fd, DRM_RADEON_CS,
                            &csc->cs, sizeof(struct drm_radeon_cs));
    if (r) {
        if (r == -ENOMEM)
            fprintf(stderr, "radeon: Not enough memory for command submission.\n");
        else if (debug_get_bool_option("RADEON_DUMP_CS", false)) {
            unsigned i;

            fprintf(stderr, "radeon: The kernel rejected CS, dumping...\n");
            for (i = 0; i < csc->chunks[0].length_dw; i++) {
                fprintf(stderr, "0x%08X\n", csc->buf[i]);
            }
        } else {
            fprintf(stderr, "radeon: The kernel rejected CS, "
                    "see dmesg for more information (%i).\n", r);
        }
    }

    for (i = 0; i < csc->num_relocs; i++)
        p_atomic_dec(&csc->relocs_bo[i].bo->num_active_ioctls);
    for (i = 0; i < csc->num_slab_buffers; i++)
        p_atomic_dec(&csc->slab_buffers[i].bo->num_active_ioctls);

    radeon_cs_context_cleanup(csc);
}

/*
 * Make sure previous submissions of this CS have completed.
 */
void radeon_drm_cs_sync_flush(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

    /* Wait for any pending ioctl of this CS to complete. */
    if (util_queue_is_initialized(&cs->ws->cs_queue))
        util_queue_fence_wait(&cs->flush_completed);
}

/* Add the given fence to a slab buffer fence list.
 *
 * There is a potential race condition when bo participates in submissions on
 * two or more threads simultaneously. Since we do not know which of the
 * submissions will be sent to the GPU first, we have to keep the fences
 * of all submissions.
 *
 * However, fences that belong to submissions that have already returned from
 * their respective ioctl do not have to be kept, because we know that they
 * will signal earlier.
 */
static void radeon_bo_slab_fence(struct radeon_bo *bo, struct radeon_bo *fence)
{
    unsigned dst;

    assert(fence->num_cs_references);

    /* Clean up older fences. */
    dst = 0;
    for (unsigned src = 0; src < bo->u.slab.num_fences; ++src) {
        if (bo->u.slab.fences[src]->num_cs_references) {
            bo->u.slab.fences[dst] = bo->u.slab.fences[src];
            dst++;
        } else {
            radeon_bo_reference(&bo->u.slab.fences[src], NULL);
        }
    }
    bo->u.slab.num_fences = dst;

    /* Check available space for the new fence. */
    if (bo->u.slab.num_fences >= bo->u.slab.max_fences) {
        unsigned new_max_fences = bo->u.slab.max_fences + 1;
        struct radeon_bo **new_fences = REALLOC(bo->u.slab.fences,
                                                bo->u.slab.max_fences * sizeof(*new_fences),
                                                new_max_fences * sizeof(*new_fences));
        if (!new_fences) {
            fprintf(stderr, "radeon_bo_slab_fence: allocation failure, dropping fence\n");
            return;
        }

        bo->u.slab.fences = new_fences;
        bo->u.slab.max_fences = new_max_fences;
    }

    /* Add the new fence. */
    bo->u.slab.fences[bo->u.slab.num_fences] = NULL;
    radeon_bo_reference(&bo->u.slab.fences[bo->u.slab.num_fences], fence);
    bo->u.slab.num_fences++;
}

DEBUG_GET_ONCE_BOOL_OPTION(noop, "RADEON_NOOP", false)

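/* Flush the CS: pad it to the ring's required alignment, create/attach the
 * fence, swap the two CS contexts and submit the old one to the kernel,
 * asynchronously if the winsys CS thread is available. */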
static int radeon_drm_cs_flush(struct radeon_winsys_cs *rcs,
                               unsigned flags,
                               struct pipe_fence_handle **pfence)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_cs_context *tmp;

    switch (cs->ring_type) {
    case RING_DMA:
        /* pad DMA ring to 8 DWs */
        if (cs->ws->info.chip_class <= SI) {
            while (rcs->current.cdw & 7)
                radeon_emit(&cs->base, 0xf0000000); /* NOP packet */
        } else {
            while (rcs->current.cdw & 7)
                radeon_emit(&cs->base, 0x00000000); /* NOP packet */
        }
        break;
    case RING_GFX:
        /* pad GFX ring to 8 DWs to meet CP fetch alignment requirements;
         * r6xx requires at least 4 DW alignment to avoid a hw bug.
         */
        if (cs->ws->info.gfx_ib_pad_with_type2) {
            while (rcs->current.cdw & 7)
                radeon_emit(&cs->base, 0x80000000); /* type2 nop packet */
        } else {
            while (rcs->current.cdw & 7)
                radeon_emit(&cs->base, 0xffff1000); /* type3 nop packet */
        }
        break;
    case RING_UVD:
        while (rcs->current.cdw & 15)
            radeon_emit(&cs->base, 0x80000000); /* type2 nop packet */
        break;
    default:
        break;
    }

    if (rcs->current.cdw > rcs->current.max_dw) {
        fprintf(stderr, "radeon: command stream overflowed\n");
    }

    if (pfence || cs->csc->num_slab_buffers) {
        struct pipe_fence_handle *fence;

        if (cs->next_fence) {
            fence = cs->next_fence;
            cs->next_fence = NULL;
        } else {
            fence = radeon_cs_create_fence(rcs);
        }

        if (fence) {
            if (pfence)
                radeon_fence_reference(pfence, fence);

            mtx_lock(&cs->ws->bo_fence_lock);
            for (unsigned i = 0; i < cs->csc->num_slab_buffers; ++i) {
                struct radeon_bo *bo = cs->csc->slab_buffers[i].bo;
                p_atomic_inc(&bo->num_active_ioctls);
                radeon_bo_slab_fence(bo, (struct radeon_bo *)fence);
            }
            mtx_unlock(&cs->ws->bo_fence_lock);

            radeon_fence_reference(&fence, NULL);
        }
    } else {
        radeon_fence_reference(&cs->next_fence, NULL);
    }

    radeon_drm_cs_sync_flush(rcs);

    /* Swap command streams. */
    tmp = cs->csc;
    cs->csc = cs->cst;
    cs->cst = tmp;

    /* If the CS is not empty and has not overflowed, emit it in a separate thread. */
    if (cs->base.current.cdw && cs->base.current.cdw <= cs->base.current.max_dw && !debug_get_option_noop()) {
        unsigned i, num_relocs;

        num_relocs = cs->cst->num_relocs;

        cs->cst->chunks[0].length_dw = cs->base.current.cdw;

        for (i = 0; i < num_relocs; i++) {
            /* Update the number of active asynchronous CS ioctls for the buffer. */
            p_atomic_inc(&cs->cst->relocs_bo[i].bo->num_active_ioctls);
        }

        switch (cs->ring_type) {
        case RING_DMA:
            cs->cst->flags[0] = 0;
            cs->cst->flags[1] = RADEON_CS_RING_DMA;
            cs->cst->cs.num_chunks = 3;
            if (cs->ws->info.has_virtual_memory) {
                cs->cst->flags[0] |= RADEON_CS_USE_VM;
            }
            break;

        case RING_UVD:
            cs->cst->flags[0] = 0;
            cs->cst->flags[1] = RADEON_CS_RING_UVD;
            cs->cst->cs.num_chunks = 3;
            break;

        case RING_VCE:
            cs->cst->flags[0] = 0;
            cs->cst->flags[1] = RADEON_CS_RING_VCE;
            cs->cst->cs.num_chunks = 3;
            break;

        default:
        case RING_GFX:
        case RING_COMPUTE:
            cs->cst->flags[0] = RADEON_CS_KEEP_TILING_FLAGS;
            cs->cst->flags[1] = RADEON_CS_RING_GFX;
            cs->cst->cs.num_chunks = 3;

            if (cs->ws->info.has_virtual_memory) {
                cs->cst->flags[0] |= RADEON_CS_USE_VM;
                cs->cst->cs.num_chunks = 3;
            }
            if (flags & RADEON_FLUSH_END_OF_FRAME) {
                cs->cst->flags[0] |= RADEON_CS_END_OF_FRAME;
                cs->cst->cs.num_chunks = 3;
            }
            if (cs->ring_type == RING_COMPUTE) {
                cs->cst->flags[1] = RADEON_CS_RING_COMPUTE;
                cs->cst->cs.num_chunks = 3;
            }
            break;
        }

        if (util_queue_is_initialized(&cs->ws->cs_queue)) {
            util_queue_add_job(&cs->ws->cs_queue, cs, &cs->flush_completed,
                               radeon_drm_cs_emit_ioctl_oneshot, NULL);
            if (!(flags & RADEON_FLUSH_ASYNC))
                radeon_drm_cs_sync_flush(rcs);
        } else {
            radeon_drm_cs_emit_ioctl_oneshot(cs, 0);
        }
    } else {
        radeon_cs_context_cleanup(cs->cst);
    }

    /* Prepare a new CS. */
    cs->base.current.buf = cs->csc->buf;
    cs->base.current.cdw = 0;
    cs->base.used_vram = 0;
    cs->base.used_gart = 0;

    if (cs->ring_type == RING_GFX)
        cs->ws->num_gfx_IBs++;
    else if (cs->ring_type == RING_DMA)
        cs->ws->num_sdma_IBs++;
    return 0;
}

static void radeon_drm_cs_destroy(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

    radeon_drm_cs_sync_flush(rcs);
    util_queue_fence_destroy(&cs->flush_completed);
    radeon_cs_context_cleanup(&cs->csc1);
    radeon_cs_context_cleanup(&cs->csc2);
    p_atomic_dec(&cs->ws->num_cs);
    radeon_destroy_cs_context(&cs->csc1);
    radeon_destroy_cs_context(&cs->csc2);
    radeon_fence_reference(&cs->next_fence, NULL);
    FREE(cs);
}

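/* Return whether the CS references 'buf' for the given usage (read and/or
 * write), based on the domains recorded in its relocation entry. */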
static bool radeon_bo_is_referenced(struct radeon_winsys_cs *rcs,
                                    struct pb_buffer *_buf,
                                    enum radeon_bo_usage usage)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct radeon_bo *bo = (struct radeon_bo*)_buf;
    int index;

    if (!bo->num_cs_references)
        return false;

    index = radeon_lookup_buffer(cs->csc, bo);
    if (index == -1)
        return false;

    if (!bo->handle)
        index = cs->csc->slab_buffers[index].u.slab.real_idx;

    if ((usage & RADEON_USAGE_WRITE) && cs->csc->relocs[index].write_domain)
        return true;
    if ((usage & RADEON_USAGE_READ) && cs->csc->relocs[index].read_domains)
        return true;

    return false;
}

/* FENCES */

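/* A fence here is just a dummy 1-byte GTT BO added to the CS; waiting on the
 * fence means waiting for that BO to become idle (see radeon_fence_wait). */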
static struct pipe_fence_handle *
radeon_cs_create_fence(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct pb_buffer *fence;

    /* Create a fence, which is a dummy BO. */
    fence = cs->ws->base.buffer_create(&cs->ws->base, 1, 1,
                                       RADEON_DOMAIN_GTT, RADEON_FLAG_NO_SUBALLOC);
    if (!fence)
        return NULL;

    /* Add the fence as a dummy relocation. */
    cs->ws->base.cs_add_buffer(rcs, fence,
                               RADEON_USAGE_READWRITE, RADEON_DOMAIN_GTT,
                               RADEON_PRIO_FENCE);
    return (struct pipe_fence_handle*)fence;
}

static bool radeon_fence_wait(struct radeon_winsys *ws,
                              struct pipe_fence_handle *fence,
                              uint64_t timeout)
{
    return ws->buffer_wait((struct pb_buffer*)fence, timeout,
                           RADEON_USAGE_READWRITE);
}

static void radeon_fence_reference(struct pipe_fence_handle **dst,
                                   struct pipe_fence_handle *src)
{
    pb_reference((struct pb_buffer**)dst, (struct pb_buffer*)src);
}

static struct pipe_fence_handle *
radeon_drm_cs_get_next_fence(struct radeon_winsys_cs *rcs)
{
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct pipe_fence_handle *fence = NULL;

    if (cs->next_fence) {
        radeon_fence_reference(&fence, cs->next_fence);
        return fence;
    }

    fence = radeon_cs_create_fence(rcs);
    if (!fence)
        return NULL;

    radeon_fence_reference(&cs->next_fence, fence);
    return fence;
}

void radeon_drm_cs_init_functions(struct radeon_drm_winsys *ws)
{
    ws->base.ctx_create = radeon_drm_ctx_create;
    ws->base.ctx_destroy = radeon_drm_ctx_destroy;
    ws->base.cs_create = radeon_drm_cs_create;
    ws->base.cs_destroy = radeon_drm_cs_destroy;
    ws->base.cs_add_buffer = radeon_drm_cs_add_buffer;
    ws->base.cs_lookup_buffer = radeon_drm_cs_lookup_buffer;
    ws->base.cs_validate = radeon_drm_cs_validate;
    ws->base.cs_check_space = radeon_drm_cs_check_space;
    ws->base.cs_get_buffer_list = radeon_drm_cs_get_buffer_list;
    ws->base.cs_flush = radeon_drm_cs_flush;
    ws->base.cs_get_next_fence = radeon_drm_cs_get_next_fence;
    ws->base.cs_is_buffer_referenced = radeon_bo_is_referenced;
    ws->base.cs_sync_flush = radeon_drm_cs_sync_flush;
    ws->base.fence_wait = radeon_fence_wait;
    ws->base.fence_reference = radeon_fence_reference;
}