winsys/amdgpu: add a parallel compute IB coupled with a gfx IB
[mesa.git] / src / gallium / winsys / radeon / drm / radeon_drm_cs.c
1 /*
2 * Copyright © 2008 Jérôme Glisse
3 * Copyright © 2010 Marek Olšák <maraeo@gmail.com>
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining
7 * a copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
16 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
18 * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * The above copyright notice and this permission notice (including the
24 * next paragraph) shall be included in all copies or substantial portions
25 * of the Software.
26 */
27
28 /*
29 This file replaces libdrm's radeon_cs_gem with our own implementation.
30 It's optimized specifically for Radeon DRM.
31 Adding buffers and space checking are faster and simpler than their
32 counterparts in libdrm (the time complexity of all the functions
33 is O(1) in nearly all scenarios, thanks to hashing).
34
35 It works like this:
36
37 cs_add_buffer(cs, buf, read_domain, write_domain) adds a new relocation and
38 also adds the size of 'buf' to the used_gart and used_vram winsys variables
39 based on the domains, which are simply OR'd for accounting purposes.
40 The addition is skipped if the reloc is already present in the list, but it
41 still accounts for any newly-referenced domains.
42
43 cs_validate is then called, which just checks:
44 used_vram/gart < vram/gart_size * 0.8
45 The 0.8 number allows for some memory fragmentation. If the validation
46 fails, the pipe driver flushes the CS and tries to do the validation again,
47 i.e. it validates only that one operation. If it fails again, it drops
48 the operation on the floor and prints some nasty message to stderr.
49 (done in the pipe driver)
50
51 cs_write_reloc(cs, buf) just writes a reloc that has been added using
52 cs_add_buffer. The read_domain and write_domain parameters have been removed,
53 because we already specify them in cs_add_buffer.
54 */
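
/*
   A rough usage sketch, from the pipe driver's point of view, of the winsys
   entry points installed by radeon_drm_cs_init_functions() at the bottom of
   this file. It is illustrative only; 'flush_cb', 'flush_ctx', 'buf',
   'priority', 'flags' and 'fence' stand for whatever the driver passes in:

   cs = ws->cs_create(ctx, RING_GFX, flush_cb, flush_ctx, false);

   ws->cs_add_buffer(cs, buf, RADEON_USAGE_READWRITE,
                     RADEON_DOMAIN_VRAM, priority);
   if (!ws->cs_validate(cs))
      ...flush and retry, as described above (done in the pipe driver)...

   radeon_emit(cs, ...);             (write packets referencing the buffer)
   ws->cs_flush(cs, flags, &fence);  (submit, optionally getting a fence)
*/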
55
56 #include "radeon_drm_cs.h"
57
58 #include "util/u_memory.h"
59 #include "util/os_time.h"
60
61 #include <stdio.h>
62 #include <stdlib.h>
63 #include <stdint.h>
64 #include <xf86drm.h>
65
66
67 #define RELOC_DWORDS (sizeof(struct drm_radeon_cs_reloc) / sizeof(uint32_t))
68
69 static struct pipe_fence_handle *
70 radeon_cs_create_fence(struct radeon_cmdbuf *rcs);
71 static void radeon_fence_reference(struct pipe_fence_handle **dst,
72 struct pipe_fence_handle *src);
73
74 static struct radeon_winsys_ctx *radeon_drm_ctx_create(struct radeon_winsys *ws)
75 {
76 /* No context support here. Just return the winsys pointer
77 * as the "context". */
78 return (struct radeon_winsys_ctx*)ws;
79 }
80
81 static void radeon_drm_ctx_destroy(struct radeon_winsys_ctx *ctx)
82 {
83 /* No context support here. */
84 }
85
86 static bool radeon_init_cs_context(struct radeon_cs_context *csc,
87 struct radeon_drm_winsys *ws)
88 {
89 int i;
90
91 csc->fd = ws->fd;
92
93 csc->chunks[0].chunk_id = RADEON_CHUNK_ID_IB;
94 csc->chunks[0].length_dw = 0;
95 csc->chunks[0].chunk_data = (uint64_t)(uintptr_t)csc->buf;
96 csc->chunks[1].chunk_id = RADEON_CHUNK_ID_RELOCS;
97 csc->chunks[1].length_dw = 0;
98 csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
99 csc->chunks[2].chunk_id = RADEON_CHUNK_ID_FLAGS;
100 csc->chunks[2].length_dw = 2;
101 csc->chunks[2].chunk_data = (uint64_t)(uintptr_t)&csc->flags;
102
103 csc->chunk_array[0] = (uint64_t)(uintptr_t)&csc->chunks[0];
104 csc->chunk_array[1] = (uint64_t)(uintptr_t)&csc->chunks[1];
105 csc->chunk_array[2] = (uint64_t)(uintptr_t)&csc->chunks[2];
106
107 csc->cs.chunks = (uint64_t)(uintptr_t)csc->chunk_array;
108
109 for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
110 csc->reloc_indices_hashlist[i] = -1;
111 }
112 return true;
113 }
114
115 static void radeon_cs_context_cleanup(struct radeon_cs_context *csc)
116 {
117 unsigned i;
118
119 for (i = 0; i < csc->num_relocs; i++) {
120 p_atomic_dec(&csc->relocs_bo[i].bo->num_cs_references);
121 radeon_bo_reference(&csc->relocs_bo[i].bo, NULL);
122 }
123 for (i = 0; i < csc->num_slab_buffers; ++i) {
124 p_atomic_dec(&csc->slab_buffers[i].bo->num_cs_references);
125 radeon_bo_reference(&csc->slab_buffers[i].bo, NULL);
126 }
127
128 csc->num_relocs = 0;
129 csc->num_validated_relocs = 0;
130 csc->num_slab_buffers = 0;
131 csc->chunks[0].length_dw = 0;
132 csc->chunks[1].length_dw = 0;
133
134 for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
135 csc->reloc_indices_hashlist[i] = -1;
136 }
137 }
138
139 static void radeon_destroy_cs_context(struct radeon_cs_context *csc)
140 {
141 radeon_cs_context_cleanup(csc);
142 FREE(csc->slab_buffers);
143 FREE(csc->relocs_bo);
144 FREE(csc->relocs);
145 }
146
147
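/* The cs_create winsys hook: allocate a command stream with two CS contexts,
 * so that one (csc) can record the next IB while the other (cst) is being
 * submitted on the CS thread. */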
148 static struct radeon_cmdbuf *
149 radeon_drm_cs_create(struct radeon_winsys_ctx *ctx,
150 enum ring_type ring_type,
151 void (*flush)(void *ctx, unsigned flags,
152 struct pipe_fence_handle **fence),
153 void *flush_ctx,
154 bool stop_exec_on_failure)
155 {
156 struct radeon_drm_winsys *ws = (struct radeon_drm_winsys*)ctx;
157 struct radeon_drm_cs *cs;
158
159 cs = CALLOC_STRUCT(radeon_drm_cs);
160 if (!cs) {
161 return NULL;
162 }
163 util_queue_fence_init(&cs->flush_completed);
164
165 cs->ws = ws;
166 cs->flush_cs = flush;
167 cs->flush_data = flush_ctx;
168
169 if (!radeon_init_cs_context(&cs->csc1, cs->ws)) {
170 FREE(cs);
171 return NULL;
172 }
173 if (!radeon_init_cs_context(&cs->csc2, cs->ws)) {
174 radeon_destroy_cs_context(&cs->csc1);
175 FREE(cs);
176 return NULL;
177 }
178
179 /* Set the first command buffer as current. */
180 cs->csc = &cs->csc1;
181 cs->cst = &cs->csc2;
182 cs->base.current.buf = cs->csc->buf;
183 cs->base.current.max_dw = ARRAY_SIZE(cs->csc->buf);
184 cs->ring_type = ring_type;
185
186 p_atomic_inc(&ws->num_cs);
187 return &cs->base;
188 }
189
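/* Return the index of 'bo' in the relocation list (for real buffers) or in
 * the slab buffer list (for suballocated buffers), or -1 if the buffer is
 * not referenced by this CS context. */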
190 int radeon_lookup_buffer(struct radeon_cs_context *csc, struct radeon_bo *bo)
191 {
192 unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
193 struct radeon_bo_item *buffers;
194 unsigned num_buffers;
195 int i = csc->reloc_indices_hashlist[hash];
196
197 if (bo->handle) {
198 buffers = csc->relocs_bo;
199 num_buffers = csc->num_relocs;
200 } else {
201 buffers = csc->slab_buffers;
202 num_buffers = csc->num_slab_buffers;
203 }
204
205 /* Either not found (i == -1), or found at the cached hash index. */
206 if (i == -1 || (i < num_buffers && buffers[i].bo == bo))
207 return i;
208
209 /* Hash collision, look for the BO in the list of relocs linearly. */
210 for (i = num_buffers - 1; i >= 0; i--) {
211 if (buffers[i].bo == bo) {
212 /* Put this reloc in the hash list.
213 * This will prevent additional hash collisions if there are
214 * several consecutive lookup_buffer calls for the same buffer.
215 *
216 * Example: Assuming buffers A,B,C collide in the hash list,
217 * the following sequence of relocs:
218 * AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
219 * will collide here: ^ and here: ^,
220 * meaning that we should get very few collisions in the end. */
221 csc->reloc_indices_hashlist[hash] = i;
222 return i;
223 }
224 }
225 return -1;
226 }
227
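/* Look up a real buffer in the relocation list and add it if it's missing.
 * Returns the relocation index. On the DMA ring without virtual memory,
 * a new relocation is added even for duplicates (see the comment below). */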
228 static unsigned radeon_lookup_or_add_real_buffer(struct radeon_drm_cs *cs,
229 struct radeon_bo *bo)
230 {
231 struct radeon_cs_context *csc = cs->csc;
232 struct drm_radeon_cs_reloc *reloc;
233 unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
234 int i = -1;
235
236 i = radeon_lookup_buffer(csc, bo);
237
238 if (i >= 0) {
239 /* For async DMA, every add_buffer call must add a buffer to the list
240 * no matter how many duplicates there are. This is because
241 * the DMA CS checker doesn't use NOP packets for offset patching,
242 * but always uses the i-th buffer from the list to patch the i-th
243 * offset. If there are N offsets in a DMA CS, there must also be N
244 * buffers in the relocation list.
245 *
246 * This doesn't have to be done if virtual memory is enabled,
247 * because there is no offset patching with virtual memory.
248 */
249 if (cs->ring_type != RING_DMA || cs->ws->info.r600_has_virtual_memory) {
250 return i;
251 }
252 }
253
254 /* New relocation, check if the backing array is large enough. */
255 if (csc->num_relocs >= csc->max_relocs) {
256 uint32_t size;
257 csc->max_relocs = MAX2(csc->max_relocs + 16, (unsigned)(csc->max_relocs * 1.3));
258
259 size = csc->max_relocs * sizeof(csc->relocs_bo[0]);
260 csc->relocs_bo = realloc(csc->relocs_bo, size);
261
262 size = csc->max_relocs * sizeof(struct drm_radeon_cs_reloc);
263 csc->relocs = realloc(csc->relocs, size);
264
265 csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
266 }
267
268 /* Initialize the new relocation. */
269 csc->relocs_bo[csc->num_relocs].bo = NULL;
270 csc->relocs_bo[csc->num_relocs].u.real.priority_usage = 0;
271 radeon_bo_reference(&csc->relocs_bo[csc->num_relocs].bo, bo);
272 p_atomic_inc(&bo->num_cs_references);
273 reloc = &csc->relocs[csc->num_relocs];
274 reloc->handle = bo->handle;
275 reloc->read_domains = 0;
276 reloc->write_domain = 0;
277 reloc->flags = 0;
278
279 csc->reloc_indices_hashlist[hash] = csc->num_relocs;
280
281 csc->chunks[1].length_dw += RELOC_DWORDS;
282
283 return csc->num_relocs++;
284 }
285
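/* Same as above, but for suballocated (slab) buffers; the backing real
 * buffer is added to the relocation list as well. Returns the slab buffer
 * index, or -1 on allocation failure. */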
286 static int radeon_lookup_or_add_slab_buffer(struct radeon_drm_cs *cs,
287 struct radeon_bo *bo)
288 {
289 struct radeon_cs_context *csc = cs->csc;
290 unsigned hash;
291 struct radeon_bo_item *item;
292 int idx;
293 int real_idx;
294
295 idx = radeon_lookup_buffer(csc, bo);
296 if (idx >= 0)
297 return idx;
298
299 real_idx = radeon_lookup_or_add_real_buffer(cs, bo->u.slab.real);
300
301 /* Check if the backing array is large enough. */
302 if (csc->num_slab_buffers >= csc->max_slab_buffers) {
303 unsigned new_max = MAX2(csc->max_slab_buffers + 16,
304 (unsigned)(csc->max_slab_buffers * 1.3));
305 struct radeon_bo_item *new_buffers =
306 REALLOC(csc->slab_buffers,
307 csc->max_slab_buffers * sizeof(*new_buffers),
308 new_max * sizeof(*new_buffers));
309 if (!new_buffers) {
310 fprintf(stderr, "radeon_lookup_or_add_slab_buffer: allocation failure\n");
311 return -1;
312 }
313
314 csc->max_slab_buffers = new_max;
315 csc->slab_buffers = new_buffers;
316 }
317
318 /* Initialize the new slab buffer entry. */
319 idx = csc->num_slab_buffers++;
320 item = &csc->slab_buffers[idx];
321
322 item->bo = NULL;
323 item->u.slab.real_idx = real_idx;
324 radeon_bo_reference(&item->bo, bo);
325 p_atomic_inc(&bo->num_cs_references);
326
327 hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
328 csc->reloc_indices_hashlist[hash] = idx;
329
330 return idx;
331 }
332
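/* The cs_add_buffer winsys hook: add 'buf' with the given usage, domains and
 * priority, and add its size to used_vram/used_gart for any newly referenced
 * domains. Returns the relocation index. */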
333 static unsigned radeon_drm_cs_add_buffer(struct radeon_cmdbuf *rcs,
334 struct pb_buffer *buf,
335 enum radeon_bo_usage usage,
336 enum radeon_bo_domain domains,
337 enum radeon_bo_priority priority)
338 {
339 struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
340 struct radeon_bo *bo = (struct radeon_bo*)buf;
341 enum radeon_bo_domain added_domains;
342
343 /* If VRAM is just stolen system memory, allow both VRAM and
344 * GTT, whichever has free space. If a buffer is evicted from
345 * VRAM to GTT, it will stay there.
346 */
347 if (!cs->ws->info.has_dedicated_vram)
348 domains |= RADEON_DOMAIN_GTT;
349
350 enum radeon_bo_domain rd = usage & RADEON_USAGE_READ ? domains : 0;
351 enum radeon_bo_domain wd = usage & RADEON_USAGE_WRITE ? domains : 0;
352 struct drm_radeon_cs_reloc *reloc;
353 int index;
354
355 if (!bo->handle) {
356 index = radeon_lookup_or_add_slab_buffer(cs, bo);
357 if (index < 0)
358 return 0;
359
360 index = cs->csc->slab_buffers[index].u.slab.real_idx;
361 } else {
362 index = radeon_lookup_or_add_real_buffer(cs, bo);
363 }
364
365 reloc = &cs->csc->relocs[index];
366 added_domains = (rd | wd) & ~(reloc->read_domains | reloc->write_domain);
367 reloc->read_domains |= rd;
368 reloc->write_domain |= wd;
369 reloc->flags = MAX2(reloc->flags, priority);
370 cs->csc->relocs_bo[index].u.real.priority_usage |= 1u << priority;
371
372 if (added_domains & RADEON_DOMAIN_VRAM)
373 cs->base.used_vram += bo->base.size;
374 else if (added_domains & RADEON_DOMAIN_GTT)
375 cs->base.used_gart += bo->base.size;
376
377 return index;
378 }
379
380 static int radeon_drm_cs_lookup_buffer(struct radeon_cmdbuf *rcs,
381 struct pb_buffer *buf)
382 {
383 struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
384
385 return radeon_lookup_buffer(cs->csc, (struct radeon_bo*)buf);
386 }
387
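/* The cs_validate winsys hook: check that the referenced buffers fit into
 * 80% of VRAM/GTT. On failure, drop the buffers added since the last
 * successful validation and either flush the CS (if it already holds
 * validated buffers) or reset it. */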
388 static bool radeon_drm_cs_validate(struct radeon_cmdbuf *rcs)
389 {
390 struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
391 bool status =
392 cs->base.used_gart < cs->ws->info.gart_size * 0.8 &&
393 cs->base.used_vram < cs->ws->info.vram_size * 0.8;
394
395 if (status) {
396 cs->csc->num_validated_relocs = cs->csc->num_relocs;
397 } else {
398 /* Remove the most recently added buffers. The validation failed with them
399 * and the CS is about to be flushed because of that. Keep only
400 * the already-validated buffers. */
401 unsigned i;
402
403 for (i = cs->csc->num_validated_relocs; i < cs->csc->num_relocs; i++) {
404 p_atomic_dec(&cs->csc->relocs_bo[i].bo->num_cs_references);
405 radeon_bo_reference(&cs->csc->relocs_bo[i].bo, NULL);
406 }
407 cs->csc->num_relocs = cs->csc->num_validated_relocs;
408
409 /* Flush if there are any relocs. Clean up otherwise. */
410 if (cs->csc->num_relocs) {
411 cs->flush_cs(cs->flush_data,
412 RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
413 } else {
414 radeon_cs_context_cleanup(cs->csc);
415 cs->base.used_vram = 0;
416 cs->base.used_gart = 0;
417
418 assert(cs->base.current.cdw == 0);
419 if (cs->base.current.cdw != 0) {
420 fprintf(stderr, "radeon: Unexpected error in %s.\n", __func__);
421 }
422 }
423 }
424 return status;
425 }
426
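/* The cs_check_space winsys hook: return whether 'dw' more dwords fit into
 * the current IB. */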
427 static bool radeon_drm_cs_check_space(struct radeon_cmdbuf *rcs, unsigned dw)
428 {
429 assert(rcs->current.cdw <= rcs->current.max_dw);
430 return rcs->current.max_dw - rcs->current.cdw >= dw;
431 }
432
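/* The cs_get_buffer_list winsys hook: if 'list' is non-NULL, fill it with
 * the real buffers of the current CS; always return their count. */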
433 static unsigned radeon_drm_cs_get_buffer_list(struct radeon_cmdbuf *rcs,
434 struct radeon_bo_list_item *list)
435 {
436 struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
437 int i;
438
439 if (list) {
440 for (i = 0; i < cs->csc->num_relocs; i++) {
441 list[i].bo_size = cs->csc->relocs_bo[i].bo->base.size;
442 list[i].vm_address = cs->csc->relocs_bo[i].bo->va;
443 list[i].priority_usage = cs->csc->relocs_bo[i].u.real.priority_usage;
444 }
445 }
446 return cs->csc->num_relocs;
447 }
448
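/* Submit the previously flushed CS (cs->cst) to the kernel through the
 * DRM_RADEON_CS ioctl. This runs either on the winsys CS thread or
 * directly from radeon_drm_cs_flush when the queue isn't available. */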
449 void radeon_drm_cs_emit_ioctl_oneshot(void *job, int thread_index)
450 {
451 struct radeon_cs_context *csc = ((struct radeon_drm_cs*)job)->cst;
452 unsigned i;
453 int r;
454
455 r = drmCommandWriteRead(csc->fd, DRM_RADEON_CS,
456 &csc->cs, sizeof(struct drm_radeon_cs));
457 if (r) {
458 if (r == -ENOMEM)
459 fprintf(stderr, "radeon: Not enough memory for command submission.\n");
460 else if (debug_get_bool_option("RADEON_DUMP_CS", false)) {
461 unsigned i;
462
463 fprintf(stderr, "radeon: The kernel rejected CS, dumping...\n");
464 for (i = 0; i < csc->chunks[0].length_dw; i++) {
465 fprintf(stderr, "0x%08X\n", csc->buf[i]);
466 }
467 } else {
468 fprintf(stderr, "radeon: The kernel rejected CS, "
469 "see dmesg for more information (%i).\n", r);
470 }
471 }
472
473 for (i = 0; i < csc->num_relocs; i++)
474 p_atomic_dec(&csc->relocs_bo[i].bo->num_active_ioctls);
475 for (i = 0; i < csc->num_slab_buffers; i++)
476 p_atomic_dec(&csc->slab_buffers[i].bo->num_active_ioctls);
477
478 radeon_cs_context_cleanup(csc);
479 }
480
481 /*
482  * Make sure previous submissions of this CS have completed
483 */
484 void radeon_drm_cs_sync_flush(struct radeon_cmdbuf *rcs)
485 {
486 struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
487
488 /* Wait for any pending ioctl of this CS to complete. */
489 if (util_queue_is_initialized(&cs->ws->cs_queue))
490 util_queue_fence_wait(&cs->flush_completed);
491 }
492
493 /* Add the given fence to a slab buffer fence list.
494 *
495  * There is a potential race condition when a bo participates in submissions on
496 * two or more threads simultaneously. Since we do not know which of the
497 * submissions will be sent to the GPU first, we have to keep the fences
498 * of all submissions.
499 *
500 * However, fences that belong to submissions that have already returned from
501 * their respective ioctl do not have to be kept, because we know that they
502 * will signal earlier.
503 */
504 static void radeon_bo_slab_fence(struct radeon_bo *bo, struct radeon_bo *fence)
505 {
506 unsigned dst;
507
508 assert(fence->num_cs_references);
509
510 /* Drop fences whose submissions have already returned from their ioctl. */
511 dst = 0;
512 for (unsigned src = 0; src < bo->u.slab.num_fences; ++src) {
513 if (bo->u.slab.fences[src]->num_cs_references) {
514 bo->u.slab.fences[dst] = bo->u.slab.fences[src];
515 dst++;
516 } else {
517 radeon_bo_reference(&bo->u.slab.fences[src], NULL);
518 }
519 }
520 bo->u.slab.num_fences = dst;
521
522 /* Check available space for the new fence */
523 if (bo->u.slab.num_fences >= bo->u.slab.max_fences) {
524 unsigned new_max_fences = bo->u.slab.max_fences + 1;
525 struct radeon_bo **new_fences = REALLOC(bo->u.slab.fences,
526 bo->u.slab.max_fences * sizeof(*new_fences),
527 new_max_fences * sizeof(*new_fences));
528 if (!new_fences) {
529 fprintf(stderr, "radeon_bo_slab_fence: allocation failure, dropping fence\n");
530 return;
531 }
532
533 bo->u.slab.fences = new_fences;
534 bo->u.slab.max_fences = new_max_fences;
535 }
536
537 /* Add the new fence */
538 bo->u.slab.fences[bo->u.slab.num_fences] = NULL;
539 radeon_bo_reference(&bo->u.slab.fences[bo->u.slab.num_fences], fence);
540 bo->u.slab.num_fences++;
541 }
542
543 DEBUG_GET_ONCE_BOOL_OPTION(noop, "RADEON_NOOP", false)
544
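/* The cs_flush winsys hook: pad the IB to the ring-specific alignment,
 * attach a fence to any slab buffers, swap the current and submitted CS
 * contexts and submit the latter (asynchronously if the CS thread is
 * available), then reset the current context for the next IB. */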
545 static int radeon_drm_cs_flush(struct radeon_cmdbuf *rcs,
546 unsigned flags,
547 struct pipe_fence_handle **pfence)
548 {
549 struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
550 struct radeon_cs_context *tmp;
551
552 switch (cs->ring_type) {
553 case RING_DMA:
554 /* pad DMA ring to 8 DWs */
555 if (cs->ws->info.chip_class <= GFX6) {
556 while (rcs->current.cdw & 7)
557 radeon_emit(&cs->base, 0xf0000000); /* NOP packet */
558 } else {
559 while (rcs->current.cdw & 7)
560 radeon_emit(&cs->base, 0x00000000); /* NOP packet */
561 }
562 break;
563 case RING_GFX:
564 /* pad GFX ring to 8 DWs to meet CP fetch alignment requirements;
565 * r6xx requires at least 4 dw alignment to avoid a hw bug.
566 */
567 if (cs->ws->info.gfx_ib_pad_with_type2) {
568 while (rcs->current.cdw & 7)
569 radeon_emit(&cs->base, 0x80000000); /* type2 nop packet */
570 } else {
571 while (rcs->current.cdw & 7)
572 radeon_emit(&cs->base, 0xffff1000); /* type3 nop packet */
573 }
574 break;
575 case RING_UVD:
576 while (rcs->current.cdw & 15)
577 radeon_emit(&cs->base, 0x80000000); /* type2 nop packet */
578 break;
579 default:
580 break;
581 }
582
583 if (rcs->current.cdw > rcs->current.max_dw) {
584 fprintf(stderr, "radeon: command stream overflowed\n");
585 }
586
587 if (pfence || cs->csc->num_slab_buffers) {
588 struct pipe_fence_handle *fence;
589
590 if (cs->next_fence) {
591 fence = cs->next_fence;
592 cs->next_fence = NULL;
593 } else {
594 fence = radeon_cs_create_fence(rcs);
595 }
596
597 if (fence) {
598 if (pfence)
599 radeon_fence_reference(pfence, fence);
600
601 mtx_lock(&cs->ws->bo_fence_lock);
602 for (unsigned i = 0; i < cs->csc->num_slab_buffers; ++i) {
603 struct radeon_bo *bo = cs->csc->slab_buffers[i].bo;
604 p_atomic_inc(&bo->num_active_ioctls);
605 radeon_bo_slab_fence(bo, (struct radeon_bo *)fence);
606 }
607 mtx_unlock(&cs->ws->bo_fence_lock);
608
609 radeon_fence_reference(&fence, NULL);
610 }
611 } else {
612 radeon_fence_reference(&cs->next_fence, NULL);
613 }
614
615 radeon_drm_cs_sync_flush(rcs);
616
617 /* Swap command streams. */
618 tmp = cs->csc;
619 cs->csc = cs->cst;
620 cs->cst = tmp;
621
622 /* If the CS is not empty and not overflowed, emit it in a separate thread. */
623 if (cs->base.current.cdw && cs->base.current.cdw <= cs->base.current.max_dw && !debug_get_option_noop()) {
624 unsigned i, num_relocs;
625
626 num_relocs = cs->cst->num_relocs;
627
628 cs->cst->chunks[0].length_dw = cs->base.current.cdw;
629
630 for (i = 0; i < num_relocs; i++) {
631 /* Update the number of active asynchronous CS ioctls for the buffer. */
632 p_atomic_inc(&cs->cst->relocs_bo[i].bo->num_active_ioctls);
633 }
634
635 switch (cs->ring_type) {
636 case RING_DMA:
637 cs->cst->flags[0] = 0;
638 cs->cst->flags[1] = RADEON_CS_RING_DMA;
639 cs->cst->cs.num_chunks = 3;
640 if (cs->ws->info.r600_has_virtual_memory) {
641 cs->cst->flags[0] |= RADEON_CS_USE_VM;
642 }
643 break;
644
645 case RING_UVD:
646 cs->cst->flags[0] = 0;
647 cs->cst->flags[1] = RADEON_CS_RING_UVD;
648 cs->cst->cs.num_chunks = 3;
649 break;
650
651 case RING_VCE:
652 cs->cst->flags[0] = 0;
653 cs->cst->flags[1] = RADEON_CS_RING_VCE;
654 cs->cst->cs.num_chunks = 3;
655 break;
656
657 default:
658 case RING_GFX:
659 case RING_COMPUTE:
660 cs->cst->flags[0] = RADEON_CS_KEEP_TILING_FLAGS;
661 cs->cst->flags[1] = RADEON_CS_RING_GFX;
662 cs->cst->cs.num_chunks = 3;
663
664 if (cs->ws->info.r600_has_virtual_memory) {
665 cs->cst->flags[0] |= RADEON_CS_USE_VM;
666 cs->cst->cs.num_chunks = 3;
667 }
668 if (flags & PIPE_FLUSH_END_OF_FRAME) {
669 cs->cst->flags[0] |= RADEON_CS_END_OF_FRAME;
670 cs->cst->cs.num_chunks = 3;
671 }
672 if (cs->ring_type == RING_COMPUTE) {
673 cs->cst->flags[1] = RADEON_CS_RING_COMPUTE;
674 cs->cst->cs.num_chunks = 3;
675 }
676 break;
677 }
678
679 if (util_queue_is_initialized(&cs->ws->cs_queue)) {
680 util_queue_add_job(&cs->ws->cs_queue, cs, &cs->flush_completed,
681 radeon_drm_cs_emit_ioctl_oneshot, NULL);
682 if (!(flags & PIPE_FLUSH_ASYNC))
683 radeon_drm_cs_sync_flush(rcs);
684 } else {
685 radeon_drm_cs_emit_ioctl_oneshot(cs, 0);
686 }
687 } else {
688 radeon_cs_context_cleanup(cs->cst);
689 }
690
691 /* Prepare a new CS. */
692 cs->base.current.buf = cs->csc->buf;
693 cs->base.current.cdw = 0;
694 cs->base.used_vram = 0;
695 cs->base.used_gart = 0;
696
697 if (cs->ring_type == RING_GFX)
698 cs->ws->num_gfx_IBs++;
699 else if (cs->ring_type == RING_DMA)
700 cs->ws->num_sdma_IBs++;
701 return 0;
702 }
703
704 static void radeon_drm_cs_destroy(struct radeon_cmdbuf *rcs)
705 {
706 struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
707
708 radeon_drm_cs_sync_flush(rcs);
709 util_queue_fence_destroy(&cs->flush_completed);
710 radeon_cs_context_cleanup(&cs->csc1);
711 radeon_cs_context_cleanup(&cs->csc2);
712 p_atomic_dec(&cs->ws->num_cs);
713 radeon_destroy_cs_context(&cs->csc1);
714 radeon_destroy_cs_context(&cs->csc2);
715 radeon_fence_reference(&cs->next_fence, NULL);
716 FREE(cs);
717 }
718
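/* The cs_is_buffer_referenced winsys hook: return whether the current CS
 * references '_buf' with the given usage. */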
719 static bool radeon_bo_is_referenced(struct radeon_cmdbuf *rcs,
720 struct pb_buffer *_buf,
721 enum radeon_bo_usage usage)
722 {
723 struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
724 struct radeon_bo *bo = (struct radeon_bo*)_buf;
725 int index;
726
727 if (!bo->num_cs_references)
728 return false;
729
730 index = radeon_lookup_buffer(cs->csc, bo);
731 if (index == -1)
732 return false;
733
734 if (!bo->handle)
735 index = cs->csc->slab_buffers[index].u.slab.real_idx;
736
737 if ((usage & RADEON_USAGE_WRITE) && cs->csc->relocs[index].write_domain)
738 return true;
739 if ((usage & RADEON_USAGE_READ) && cs->csc->relocs[index].read_domains)
740 return true;
741
742 return false;
743 }
744
745 /* FENCES */
746
747 static struct pipe_fence_handle *
748 radeon_cs_create_fence(struct radeon_cmdbuf *rcs)
749 {
750 struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
751 struct pb_buffer *fence;
752
753 /* Create a fence, which is a dummy BO. */
754 fence = cs->ws->base.buffer_create(&cs->ws->base, 1, 1,
755 RADEON_DOMAIN_GTT, RADEON_FLAG_NO_SUBALLOC);
756 if (!fence)
757 return NULL;
758
759 /* Add the fence as a dummy relocation. */
760 cs->ws->base.cs_add_buffer(rcs, fence,
761 RADEON_USAGE_READWRITE, RADEON_DOMAIN_GTT,
762 RADEON_PRIO_FENCE);
763 return (struct pipe_fence_handle*)fence;
764 }
765
766 static bool radeon_fence_wait(struct radeon_winsys *ws,
767 struct pipe_fence_handle *fence,
768 uint64_t timeout)
769 {
770 return ws->buffer_wait((struct pb_buffer*)fence, timeout,
771 RADEON_USAGE_READWRITE);
772 }
773
774 static void radeon_fence_reference(struct pipe_fence_handle **dst,
775 struct pipe_fence_handle *src)
776 {
777 pb_reference((struct pb_buffer**)dst, (struct pb_buffer*)src);
778 }
779
780 static struct pipe_fence_handle *
781 radeon_drm_cs_get_next_fence(struct radeon_cmdbuf *rcs)
782 {
783 struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
784 struct pipe_fence_handle *fence = NULL;
785
786 if (cs->next_fence) {
787 radeon_fence_reference(&fence, cs->next_fence);
788 return fence;
789 }
790
791 fence = radeon_cs_create_fence(rcs);
792 if (!fence)
793 return NULL;
794
795 radeon_fence_reference(&cs->next_fence, fence);
796 return fence;
797 }
798
799 static void
800 radeon_drm_cs_add_fence_dependency(struct radeon_cmdbuf *cs,
801 struct pipe_fence_handle *fence,
802 unsigned dependency_flags)
803 {
804 /* TODO: Handle the following unlikely multi-threaded scenario:
805 *
806 * Thread 1 / Context 1 Thread 2 / Context 2
807 * -------------------- --------------------
808 * f = cs_get_next_fence()
809 * cs_add_fence_dependency(f)
810 * cs_flush()
811 * cs_flush()
812 *
813 * We currently assume that this does not happen because we don't support
814 * asynchronous flushes on Radeon.
815 */
816 }
817
818 void radeon_drm_cs_init_functions(struct radeon_drm_winsys *ws)
819 {
820 ws->base.ctx_create = radeon_drm_ctx_create;
821 ws->base.ctx_destroy = radeon_drm_ctx_destroy;
822 ws->base.cs_create = radeon_drm_cs_create;
823 ws->base.cs_destroy = radeon_drm_cs_destroy;
824 ws->base.cs_add_buffer = radeon_drm_cs_add_buffer;
825 ws->base.cs_lookup_buffer = radeon_drm_cs_lookup_buffer;
826 ws->base.cs_validate = radeon_drm_cs_validate;
827 ws->base.cs_check_space = radeon_drm_cs_check_space;
828 ws->base.cs_get_buffer_list = radeon_drm_cs_get_buffer_list;
829 ws->base.cs_flush = radeon_drm_cs_flush;
830 ws->base.cs_get_next_fence = radeon_drm_cs_get_next_fence;
831 ws->base.cs_is_buffer_referenced = radeon_bo_is_referenced;
832 ws->base.cs_sync_flush = radeon_drm_cs_sync_flush;
833 ws->base.cs_add_fence_dependency = radeon_drm_cs_add_fence_dependency;
834 ws->base.fence_wait = radeon_fence_wait;
835 ws->base.fence_reference = radeon_fence_reference;
836 }